|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Match local books in ~/Documents/Books with categorizations from CSV |
| 4 | +and copy them to eleven_upload folders. |
| 5 | +""" |
| 6 | +import argparse |
| 7 | +import csv |
| 8 | +import os |
| 9 | +import re |
| 10 | +import shutil |
| 11 | +from difflib import SequenceMatcher |
| 12 | + |
| 13 | + |
def normalize_text(text):
    """Normalize text for fuzzy comparison.

    The text is lowercased and every run of punctuation, underscores and
    whitespace is collapsed into a single space, with leading and trailing
    spaces removed.

    Args:
        text: The string to normalize. ``None`` and ``""`` are accepted.

    Returns:
        The normalized string, or ``""`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ""
    # ``\W`` alone would keep "_" (it is a word character), yet file names
    # routinely use "_" as a separator, so it is treated as punctuation too.
    # Whitespace is part of ``\W``, so one substitution also collapses spaces.
    return re.sub(r"[\W_]+", " ", text.lower()).strip()
| 22 | + |
| 23 | + |
def extract_title_author_from_filename(filename):
    """Extract the likely title and author from a book file name.

    The extension is removed and the remaining name is tried against these
    patterns, in order:

    - ``Title - Author - (Year, Publisher)``
    - ``Title by Author``
    - ``Title - Author``
    - ``Title_ Author`` (optionally followed by ``- (Year, Publisher)``)

    A hyphen only counts as a title/author separator when it is surrounded
    by whitespace, so hyphenated words such as "Spider-Man" stay intact.

    Args:
        filename: File name (not a full path) including its extension.

    Returns:
        A ``(title, author)`` tuple of stripped strings. When no pattern
        matches, the whole name is returned as the title and the author
        is ``""``.
    """
    name = os.path.splitext(filename)[0]

    patterns = (
        # "Title - Author - (Year, Publisher)"
        r"^(.+?)\s+-\s+(.+?)\s*-\s*\(",
        # "Title by Author"
        r"^(.+?)\s+by\s+(.+?)(?:\s*\(|$)",
        # "Title - Author". The author may not contain "(" so that a trailing
        # "(Year, Publisher)" group is never mistaken for the author; this
        # also lets "Title_ Author - (Year, Publisher)" reach the next pattern.
        r"^(.+?)\s+-\s+([^(]+?)(?:\s*\(|$)",
        # "Title_ Author" with an optional "- (Year, Publisher)" suffix
        r"^(.+?)_\s*(.+?)(?:\s*-\s*\(|$)",
    )
    for pattern in patterns:
        match = re.match(pattern, name)
        if match:
            return match.group(1).strip(), match.group(2).strip()

    # No pattern matched: treat the whole name as the title.
    return name.strip(), ""
| 57 | + |
| 58 | + |
def similarity_score(str1, str2):
    """Return how alike two strings are, as a ratio between 0 and 1.

    Both inputs are passed through ``normalize_text`` before being compared
    with ``difflib.SequenceMatcher``. If either side is empty after
    normalization the score is ``0.0``.
    """
    left, right = normalize_text(str1), normalize_text(str2)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
| 69 | + |
| 70 | + |
def find_best_match(local_title, local_author, csv_books):
    """Pick the CSV entry that most resembles a local book.

    Each entry is scored as ``0.7 * title similarity + 0.3 * author
    similarity``. When ``local_author`` is empty the author component is a
    neutral ``0.5``. No minimum threshold is applied here; callers decide
    whether the returned score is good enough.

    Args:
        local_title: Title extracted from the local file name.
        local_author: Author extracted from the local file name, or ``""``.
        csv_books: Iterable of mappings with ``"title"`` and ``"author"`` keys.

    Returns:
        ``(entry, score)`` for the highest-scoring entry (the first one wins
        ties), or ``(None, 0.0)`` when no entry scores above zero.
    """
    winner, top_score = None, 0.0

    for candidate in csv_books:
        candidate_title, candidate_author = candidate["title"], candidate["author"]

        title_part = similarity_score(local_title, candidate_title)
        if local_author:
            author_part = similarity_score(local_author, candidate_author)
        else:
            author_part = 0.5

        # The title dominates the decision: 70% title, 30% author.
        total = 0.7 * title_part + 0.3 * author_part
        if total > top_score:
            winner, top_score = candidate, total

    return winner, top_score
| 97 | + |
| 98 | + |
def sanitize_folder_name(name):
    """Return ``name`` made safe for use as a folder name.

    The characters ``< > : " / \\ | ? *`` are dropped, then leading and
    trailing dots and spaces are trimmed. If nothing remains, the result is
    ``"Uncategorized"``.
    """
    forbidden = '<>:"/\\|?*'
    cleaned = "".join(ch for ch in name if ch not in forbidden).strip(". ")
    return cleaned or "Uncategorized"
| 104 | + |
| 105 | + |
def load_csv_books(csv_path):
    """Load every row of a CSV file as a dictionary keyed by column header.

    Args:
        csv_path: Path to the CSV file; its first row is used as the header.

    Returns:
        A list with one dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" strips a leading byte-order mark (common in spreadsheet
    # exports) that would otherwise become part of the first column name.
    # newline="" lets the csv module handle line endings and embedded
    # newlines inside quoted fields itself.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f))
| 114 | + |
| 115 | + |
def organize_local_books(books_dir, csv_path, dest_dir, min_score=0.5, dry_run=True):
    """Match local books with CSV categorizations and copy them into category folders.

    Every non-hidden ``.epub``/``.pdf`` file directly inside ``books_dir`` is
    matched against the CSV rows by title and author. Files whose best match
    scores at least ``min_score`` are copied (or, in a dry run, only listed)
    into ``dest_dir/<category>``, where the category comes from the matched
    row's ``bookshelf`` column. Up to 20 unmatched files are listed afterwards.

    Args:
        books_dir: Directory containing local book files.
        csv_path: Path to CSV with book categorizations; rows need ``title``
            and ``author`` columns, and ``bookshelf`` supplies the category.
        dest_dir: Destination directory for organized books.
        min_score: Minimum similarity score to accept a match (0-1).
        dry_run: If True, only print what would be done.

    Raises:
        OSError: If ``books_dir`` cannot be listed, the CSV cannot be read,
            or a category folder cannot be created. Failures while copying
            an individual file are reported and skipped instead.
    """
    csv_books = load_csv_books(csv_path)
    print(f"Loaded {len(csv_books)} books from CSV")

    # Only regular, non-hidden book files at the top level are considered;
    # sub-directories are not descended into.
    book_files = []
    for fname in os.listdir(books_dir):
        if fname.startswith("."):
            continue
        fpath = os.path.join(books_dir, fname)
        if os.path.isfile(fpath) and fname.lower().endswith((".epub", ".pdf")):
            book_files.append(fpath)

    print(f"Found {len(book_files)} book files in {books_dir}\n")

    matched = []
    unmatched = []

    for book_path in sorted(book_files):
        filename = os.path.basename(book_path)
        local_title, local_author = extract_title_author_from_filename(filename)
        best_match, score = find_best_match(local_title, local_author, csv_books)

        if best_match and score >= min_score:
            # A row may lack a bookshelf value (missing column, short row or
            # blank cell); such books go to "Uncategorized" rather than
            # breaking the sorting and folder-name handling below.
            category = (best_match.get("bookshelf") or "").strip() or "Uncategorized"
            matched.append(
                {
                    "local_path": book_path,
                    "local_title": local_title,
                    "local_author": local_author,
                    "csv_title": best_match["title"],
                    "csv_author": best_match["author"],
                    "category": category,
                    "score": score,
                }
            )
        else:
            unmatched.append(
                {
                    "local_path": book_path,
                    "local_title": local_title,
                    "local_author": local_author,
                    "best_guess": best_match["title"] if best_match else "None",
                    "score": score,
                }
            )

    print(f"Matched: {len(matched)} books")
    print(f"Unmatched: {len(unmatched)} books (score < {min_score})\n")

    # Group matched books by category so each folder is handled once.
    by_category = {}
    for book in matched:
        by_category.setdefault(book["category"], []).append(book)

    total_copied = 0
    for category in sorted(by_category):
        books = by_category[category]
        dest_category_dir = os.path.join(dest_dir, sanitize_folder_name(category))

        print(f"\nCategory: {category} ({len(books)} books)")

        if not dry_run:
            os.makedirs(dest_category_dir, exist_ok=True)

        for book in books:
            src_file = book["local_path"]
            dest_file = os.path.join(dest_category_dir, os.path.basename(src_file))

            if dry_run:
                print(f" Would copy: {book['local_title']}")
                print(f" Matched with: {book['csv_title']} (score: {book['score']:.2f})")
                print(f" From: {src_file}")
                print(f" To: {dest_file}")
                continue

            try:
                shutil.copy2(src_file, dest_file)
            except OSError as e:
                # Covers missing files, permission problems and shutil's own
                # errors; one failed copy must not abort the whole run.
                print(f" Error copying {book['local_title']}: {e}")
            else:
                print(
                    f" Copied: {book['local_title']} -> {category} (score: {book['score']:.2f})"
                )
                total_copied += 1

    if unmatched:
        print(f"\n{'='*60}")
        print(f"UNMATCHED BOOKS ({len(unmatched)}):")
        print(f"{'='*60}")
        for book in unmatched[:20]:  # Show first 20
            print(f"\n Local: {book['local_title']}")
            if book["local_author"]:
                print(f" Author: {book['local_author']}")
            print(f" Best guess: {book['best_guess']} (score: {book['score']:.2f})")
        if len(unmatched) > 20:
            print(f"\n ... and {len(unmatched) - 20} more")

    if not dry_run:
        print(f"\nTotal books copied: {total_copied}")
    else:
        print(f"\nDry run complete. Would copy {len(matched)} books.")
        print("Run with --copy to actually copy the files.")
| 237 | + |
| 238 | + |
def main(argv=None):
    """Parse command-line options and run the book organizer.

    Args:
        argv: Optional list of argument strings. When ``None`` the arguments
            are taken from ``sys.argv`` by ``argparse``.

    Raises:
        SystemExit: With status 2 when an option is invalid, the books
            directory does not exist, or the CSV file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Match local books with CSV categorizations and copy to eleven_upload"
    )
    parser.add_argument(
        "--copy", action="store_true", help="Actually copy the files (default is dry run)"
    )
    parser.add_argument(
        "--books-dir",
        default="~/Documents/Books",
        help="Directory containing local book files",
    )
    parser.add_argument(
        "--csv",
        default="~/Downloads/unfinished_books.csv",
        help="CSV file with book categorizations",
    )
    parser.add_argument(
        "--dest",
        default="~/Documents/Books/eleven_upload",
        help="Destination directory for organized books",
    )
    parser.add_argument(
        "--min-score",
        type=float,
        default=0.5,
        help="Minimum similarity score to accept a match (0-1, default: 0.5)",
    )
    args = parser.parse_args(argv)

    # parser.error() writes to stderr and exits with a non-zero status, so
    # shell scripts calling this tool can detect the failure.
    if not 0.0 <= args.min_score <= 1.0:
        parser.error(f"--min-score must be between 0 and 1, got {args.min_score}")

    books_dir = os.path.expanduser(args.books_dir)
    csv_path = os.path.expanduser(args.csv)
    dest_dir = os.path.expanduser(args.dest)

    if not os.path.isdir(books_dir):
        parser.error(f"Books directory not found: {books_dir}")

    if not os.path.isfile(csv_path):
        parser.error(f"CSV file not found: {csv_path}")

    dry_run = not args.copy
    print("=== Dry Run ===" if dry_run else "=== Copying Books ===")
    organize_local_books(books_dir, csv_path, dest_dir, args.min_score, dry_run=dry_run)
    # organize_local_books already prints the dry-run summary and the hint
    # about --copy, so only the real copy needs a closing message here.
    if not dry_run:
        print("\nDone!")


if __name__ == "__main__":
    main()
0 commit comments