Skip to content

Commit cdbc78c

Browse files
committed
extract from local files
1 parent 13a3c7b commit cdbc78c

File tree

4 files changed

+956
-1
lines changed

4 files changed

+956
-1
lines changed
Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Match local books in ~/Documents/Books with categorizations from CSV
4+
and copy them to eleven_upload folders.
5+
"""
6+
import argparse
7+
import csv
8+
import os
9+
import re
10+
import shutil
11+
from difflib import SequenceMatcher
12+
13+
14+
def normalize_text(text):
    """Normalize text for fuzzy comparison.

    Lowercases ``text``, replaces every punctuation character with a space,
    and collapses runs of whitespace into a single space. Underscores are
    treated as punctuation too, because filenames often use ``_`` where the
    real title has a colon.

    Args:
        text: String to normalize. ``None`` or an empty string yields ``""``.

    Returns:
        The normalized string with no leading or trailing whitespace.
    """
    if not text:
        return ""
    # The regex word class includes "_", so it has to be listed explicitly
    # or it would survive the punctuation pass.
    spaced = re.sub(r"[^\w\s]|_", " ", text.lower())
    return re.sub(r"\s+", " ", spaced).strip()
22+
23+
24+
def extract_title_author_from_filename(filename):
    """
    Extract the likely title and author from a book filename.

    The extension is removed, then these patterns are tried in order and the
    first one that matches wins:
    - Title - Author - (Year, Publisher).ext
    - Title by Author.ext
    - Title - Author.ext
    - Title_ Author - (Year, Publisher).ext

    A hyphen only counts as a separator when it has whitespace on both
    sides, so hyphenated words such as "Catch-22" stay inside the title.

    Args:
        filename: Base name of the book file, including its extension.

    Returns:
        A ``(title, author)`` tuple of stripped strings. When no pattern
        matches, the whole name is returned as the title and the author is
        the empty string.
    """
    name = os.path.splitext(filename)[0]

    patterns = (
        # "Title - Author - (Year, Publisher)"
        r"^(.+?)\s+-\s+(.+?)\s*-\s*\(",
        # "Title by Author"
        r"^(.+?)\s+by\s+(.+?)(?:\s*\(|$)",
        # "Title - Author"; the author may not start with "(" so that a
        # trailing "(Year, Publisher)" group is never mistaken for the author
        # and the underscore pattern below still gets its chance.
        r"^(.+?)\s+-\s+([^(].*?)(?:\s*\(|$)",
        # "Title_ Author", optionally followed by " - (Year, Publisher)"
        r"^(.+?)_\s*(.+?)(?:\s*-\s*\(|$)",
    )
    for pattern in patterns:
        match = re.match(pattern, name)
        if match:
            return match.group(1).strip(), match.group(2).strip()

    # No pattern matched: treat the whole name as the title.
    return name.strip(), ""
57+
58+
59+
def similarity_score(str1, str2):
    """Return a fuzzy similarity ratio between two strings, from 0.0 to 1.0.

    Both inputs are passed through normalize_text first. If either one
    normalizes to an empty string, the score is 0.0.
    """
    left, right = normalize_text(str1), normalize_text(str2)
    if left and right:
        # SequenceMatcher.ratio() provides the fuzzy-match score.
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
69+
70+
71+
def find_best_match(local_title, local_author, csv_books):
    """
    Find the best matching book from the CSV rows based on title and author.

    Each row is scored as 70% title similarity plus 30% author similarity.
    When the author is unknown on either side (no author parsed from the
    filename, or an empty/missing author in the CSV row), a neutral 0.5 is
    used for the author part so that missing data is not counted as a
    mismatch.

    Args:
        local_title: Title extracted from the local filename.
        local_author: Author extracted from the local filename; may be "".
        csv_books: Iterable of row mappings with "title" and "author" keys.

    Returns:
        ``(best_match_dict, score)``, or ``(None, 0.0)`` when no row scores
        above zero (including when ``csv_books`` is empty).

    Raises:
        KeyError: If a row lacks the "title" or "author" key.
    """
    title_weight = 0.7
    author_weight = 0.3
    neutral_author = 0.5

    known_local_author = (local_author or "").strip()

    best_match = None
    best_score = 0.0
    for book in csv_books:
        title_sim = similarity_score(local_title, book["title"])

        # Only compare authors when both sides actually have one.
        csv_author = (book["author"] or "").strip()
        if known_local_author and csv_author:
            author_sim = similarity_score(known_local_author, csv_author)
        else:
            author_sim = neutral_author

        combined_score = title_weight * title_sim + author_weight * author_sim
        if combined_score > best_score:
            best_score = combined_score
            best_match = book

    return best_match, best_score
97+
98+
99+
def sanitize_folder_name(name):
    """Turn a category name into a string that is safe to use as a folder name.

    Removes the characters ``< > : " / \\ | ? *`` and ASCII control
    characters, then strips leading and trailing dots and spaces.

    Args:
        name: Category name. ``None`` is treated as an empty name.

    Returns:
        The sanitized name, or "Uncategorized" when nothing usable remains.
    """
    # csv.DictReader yields None for fields missing from a short row.
    raw = "" if name is None else str(name)
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "", raw)
    sanitized = sanitized.strip(". ")
    return sanitized if sanitized else "Uncategorized"
104+
105+
106+
def load_csv_books(csv_path):
    """Load all rows of a CSV file as a list of dicts keyed by the header row.

    The file is decoded as UTF-8 with an optional byte-order mark, so a
    BOM-prefixed export does not leave "\\ufeff" glued to the first column
    name. It is opened with ``newline=""`` as the csv module requires, so
    newlines embedded in quoted fields are parsed correctly.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        A list with one mapping per data row.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f))
114+
115+
116+
def organize_local_books(books_dir, csv_path, dest_dir, min_score=0.5, dry_run=True):
    """
    Match local books with CSV categorizations and copy to eleven_upload.

    Every non-hidden .epub/.pdf file directly inside ``books_dir`` is matched
    against the CSV rows by fuzzy title/author similarity. Matched files are
    copied (or, in a dry run, only listed) into one sub-folder of
    ``dest_dir`` per category, taken from the row's "bookshelf" column.
    Progress and a summary are printed to stdout.

    Args:
        books_dir: Directory containing local book files
        csv_path: Path to CSV with book categorizations
        dest_dir: Destination directory for organized books
        min_score: Minimum similarity score to accept a match (0-1)
        dry_run: If True, only print what would be done
    """
    csv_books = load_csv_books(csv_path)
    print(f"Loaded {len(csv_books)} books from CSV")

    book_files = _find_book_files(books_dir)
    print(f"Found {len(book_files)} book files in {books_dir}\n")

    matched, unmatched = _match_book_files(book_files, csv_books, min_score)
    print(f"Matched: {len(matched)} books")
    print(f"Unmatched: {len(unmatched)} books (score < {min_score})\n")

    total_copied = _copy_by_category(matched, dest_dir, dry_run)
    _report_unmatched(unmatched)

    if not dry_run:
        print(f"\nTotal books copied: {total_copied}")
    else:
        print(f"\nDry run complete. Would copy {len(matched)} books.")
        print("Run with --copy to actually copy the files.")


def _find_book_files(books_dir):
    """Return sorted paths of the non-hidden .epub/.pdf files directly in books_dir."""
    found = []
    for fname in os.listdir(books_dir):
        if fname.startswith("."):
            continue
        fpath = os.path.join(books_dir, fname)
        if os.path.isfile(fpath) and fname.lower().endswith((".epub", ".pdf")):
            found.append(fpath)
    return sorted(found)


def _match_book_files(book_files, csv_books, min_score):
    """Split book_files into (matched, unmatched) record lists by best CSV match score."""
    matched = []
    unmatched = []
    for book_path in book_files:
        filename = os.path.basename(book_path)
        local_title, local_author = extract_title_author_from_filename(filename)
        best_match, score = find_best_match(local_title, local_author, csv_books)

        if best_match and score >= min_score:
            # A blank or missing bookshelf would otherwise put None/"" among
            # the category keys and break sorting and folder naming later.
            category = (best_match.get("bookshelf") or "").strip() or "Uncategorized"
            matched.append(
                {
                    "local_path": book_path,
                    "local_title": local_title,
                    "local_author": local_author,
                    "csv_title": best_match["title"],
                    "csv_author": best_match["author"],
                    "category": category,
                    "score": score,
                }
            )
        else:
            unmatched.append(
                {
                    "local_path": book_path,
                    "local_title": local_title,
                    "local_author": local_author,
                    "best_guess": best_match["title"] if best_match else "None",
                    "score": score,
                }
            )
    return matched, unmatched


def _copy_by_category(matched, dest_dir, dry_run):
    """Copy (or, in a dry run, list) matched books per category; return the number copied."""
    by_category = {}
    for book in matched:
        by_category.setdefault(book["category"], []).append(book)

    total_copied = 0
    for category in sorted(by_category):
        books = by_category[category]
        dest_category_dir = os.path.join(dest_dir, sanitize_folder_name(category))

        print(f"\nCategory: {category} ({len(books)} books)")

        if not dry_run:
            os.makedirs(dest_category_dir, exist_ok=True)

        for book in books:
            src_file = book["local_path"]
            dest_file = os.path.join(dest_category_dir, os.path.basename(src_file))

            if dry_run:
                print(f" Would copy: {book['local_title']}")
                print(f" Matched with: {book['csv_title']} (score: {book['score']:.2f})")
                print(f" From: {src_file}")
                print(f" To: {dest_file}")
                continue

            try:
                shutil.copy2(src_file, dest_file)
            except OSError as e:
                # shutil.Error subclasses OSError, so copy failures land here;
                # report the file and carry on with the rest.
                print(f" Error copying {book['local_title']}: {e}")
            else:
                print(
                    f" Copied: {book['local_title']} -> {category} (score: {book['score']:.2f})"
                )
                total_copied += 1
    return total_copied


def _report_unmatched(unmatched, limit=20):
    """Print up to ``limit`` unmatched books with their best guess, then a count of the rest."""
    if not unmatched:
        return
    print(f"\n{'='*60}")
    print(f"UNMATCHED BOOKS ({len(unmatched)}):")
    print(f"{'='*60}")
    for book in unmatched[:limit]:
        print(f"\n Local: {book['local_title']}")
        if book["local_author"]:
            print(f" Author: {book['local_author']}")
        print(f" Best guess: {book['best_guess']} (score: {book['score']:.2f})")
    if len(unmatched) > limit:
        print(f"\n ... and {len(unmatched) - limit} more")
237+
238+
239+
def main(argv=None):
    """Parse command-line options and run the book organizer.

    Without ``--copy`` the run is a dry run that only prints what would be
    copied. Invalid options (a ``--min-score`` outside 0-1, a missing books
    directory, a missing CSV file) are reported through ``parser.error``,
    which writes to stderr and exits with status 2.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the process command line.
    """
    parser = argparse.ArgumentParser(
        description="Match local books with CSV categorizations and copy to eleven_upload"
    )
    parser.add_argument(
        "--copy", action="store_true", help="Actually copy the files (default is dry run)"
    )
    parser.add_argument(
        "--books-dir",
        default="~/Documents/Books",
        help="Directory containing local book files",
    )
    parser.add_argument(
        "--csv",
        default="~/Downloads/unfinished_books.csv",
        help="CSV file with book categorizations",
    )
    parser.add_argument(
        "--dest",
        default="~/Documents/Books/eleven_upload",
        help="Destination directory for organized books",
    )
    parser.add_argument(
        "--min-score",
        type=float,
        default=0.5,
        help="Minimum similarity score to accept a match (0-1, default: 0.5)",
    )
    args = parser.parse_args(argv)

    books_dir = os.path.expanduser(args.books_dir)
    csv_path = os.path.expanduser(args.csv)
    dest_dir = os.path.expanduser(args.dest)

    # The chained comparison also rejects NaN.
    if not 0.0 <= args.min_score <= 1.0:
        parser.error(f"--min-score must be between 0 and 1, got {args.min_score}")
    if not os.path.isdir(books_dir):
        parser.error(f"Books directory not found: {books_dir}")
    if not os.path.isfile(csv_path):
        parser.error(f"CSV file not found: {csv_path}")

    dry_run = not args.copy
    print("=== Dry Run ===" if dry_run else "=== Copying Books ===")
    organize_local_books(books_dir, csv_path, dest_dir, args.min_score, dry_run=dry_run)
    # organize_local_books already prints the dry-run summary and the --copy
    # hint, so only the copy mode needs a closing line here.
    if not dry_run:
        print("\nDone!")
289+
290+
291+
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)