-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdftotts.py
More file actions
1414 lines (1239 loc) · 96.9 KB
/
pdftotts.py
File metadata and controls
1414 lines (1239 loc) · 96.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import sys
from pathlib import Path
# --- Add Owl directory to Python path ---
script_dir = Path(__file__).resolve().parent
owl_dir = script_dir.parent
if str(owl_dir) not in sys.path:
sys.path.insert(0, str(owl_dir))
root_dir = owl_dir.parent
if str(root_dir) not in sys.path:
sys.path.insert(0, str(root_dir))
# --- End Path Addition ---
from enum import Enum
import subprocess
import glob
import argparse
import logging
import re # Import regex module
from typing import List, Optional, Tuple, Type, Dict, Any, Union, Set
from time import sleep
import tempfile
import datetime # Import datetime for append message
import shutil # Import for directory removal
import json # Import JSON for parsing state
import concurrent.futures # <--- Added for threading
# --- ADDED: Import PyPDF2 ---
try:
import PyPDF2
from PyPDF2.errors import PdfReadError, DependencyError
except ImportError:
print("Error: PyPDF2 library not found. Please install it: pip install pypdf2")
sys.exit(1)
from dotenv import load_dotenv
# --- Import necessary CAMEL components ---
from camel.toolkits import ImageAnalysisToolkit
from camel.models import ModelFactory, BaseModelBackend
from camel.messages import BaseMessage
from camel.types import ModelPlatformType, ModelType, UnifiedModelType
# --- Model Definitions (Example Unified Models) ---
# Keep your custom models if needed, example remains
class mymodel(UnifiedModelType, Enum):
OPENROUTER_QUASAR = "openrouter/quasar-alpha"
OPENROUTER_SCOUT = "meta-llama/llama-4-scout:free"
OPENROUTER_MAVERICK = "meta-llama/llama-4-maverick:free"
OPENROUTER_GEMINI_PRO_2_5 = "google/gemini-2.5-pro-exp-03-25:free"
OPENROUTER_GEMINI_FLASH_THINKING_2_0 = "google/gemini-2.0-flash-thinking-exp:free"
OPENROUTER_GEMINI_FLASH_2_0 = "google/gemini-2.0-flash-exp:free"
OPENROUTER_GEMINI_LEARN_LM = "google/learnlm-1.5-pro-experimental:free"
OPENROUTER_QWEN_2_5_VL_3B = "qwen/qwen2.5-vl-3b-instruct:free"
OPENROUTER_QWEN_2_5_VL_32B = "qwen/qwen2.5-vl-32b-instruct:free"
# --- Default Models & Failover Configuration ---
DEFAULT_TRANSCRIPTION_MODEL = "gemini-2.0-flash-lite-preview-02-05" # Using a Gemini model as default
DEFAULT_ANALYSIS_MODEL = "google/gemini-2.5-pro-exp-03-25:free" # Default analysis model
DEFAULT_BACKUP_MODEL = "google/gemini-2.0-flash-exp:free" # Default backup model
FAILURE_THRESHOLD = 3 # Number of consecutive failures to trigger switch
# --- Prompt Templates ---
# Transcription Prompt (Keep PROMPT_TEMPLATE as before)
PROMPT_TEMPLATE = """
# Prompt for TTS Book Page Transcription (Optimized for Continuity & Excluding References/Markers)
*Objective:** Transcribe the text from the **middle** book page image provided into clear, natural-sounding prose optimized for Text-to-Speech (TTS) playback. The key goals are a smooth narrative flow between pages, completely **excluding** any reference to external sources (books, papers) or internal structural pointers and markers (page numbers, figure/table/equation numbers, chapter/section numbers, list numbers, running headers/footers, etc.).
*Input:** Three images are provided side-by-side:
1. Previous Page (Left Image) - **CONTEXT ONLY for visuals**
2. **Target Page (Middle Image) - TRANSCRIBE THIS PAGE**
3. Next Page (Right Image) - **CONTEXT ONLY for visuals**
*Context from Previous Page's Transcription:**
{previous_transcription_context_section}
*Critical Instructions:**
1. **Focus EXCLUSIVELY on the Middle Page:** Do NOT transcribe any content from the left (previous) or right (next) page images. They are provided solely for contextual understanding when describing visuals *on the middle page*.
2. **Ancillary Page Check (Middle Page Only):**
* **CRITICAL Check:** If the **middle page** is clearly identifiable as **primarily** containing content such as a book cover, title page, copyright information, Preface, Foreword, Dedication, Table of Contents, List of Figures/Tables, Acknowledgments, Introduction (if clearly marked as introductory material *before* Chapter 1 or the main narrative), Notes section, Bibliography, References, Index, Appendix, Glossary, or other front/back matter **not part of the main narrative body/chapters**, **output the single word: `SKIPPED`**.
* **Strict Adherence:** **Do NOT transcribe these ancillary page types.** If the page header, title, or main content block clearly indicates it's one of these types (e.g., starts with "Preface", "Table of Contents", "Index"), it **must** be skipped. Only transcribe pages that are part of the core chapters or narrative content.
*Transcription Guidelines (For Non-Skipped Middle Pages):**
The following guidelines ensure the transcription flows naturally as spoken prose, focusing on the core content while omitting disruptive elements like references and structural markers.*
1. **Continuity from Previous Page:**
* If `<previous_transcription>` context is provided, the transcription for the *middle page* **must seamlessly concatenate** with the very end of the provided context. This means:
* **Start Precisely:** Begin the new transcription *exactly* where the previous context finished, even if it's mid-sentence or mid-paragraph.
* **No Interruptions:** Add absolutely **no** extra pauses, breaks, paragraph starts, or introductory filler words between the context and the new text. The join must be direct and immediate.
* **No Repetitions:** **Critically avoid repeating** any words, phrases, or sentences from the end of the previous context. **Specifically, if the text at the very top of the middle page begins with words identical to the end of the `previous_transcription` context, you MUST skip these duplicated words on the middle page** and start the transcription immediately following them.
* **No Broken Flow:** Ensure sentences and paragraphs flow naturally across the page boundary without being artificially broken, cut off mid-word (unless hyphenated in source), or restarted incorrectly.
The final combined text must read as a single, uninterrupted piece of prose, making the transition between the previous context and the new transcription **completely undetectable** in the final audio output.
* If no `<previous_transcription>` context is provided (e.g., for the very first page being transcribed), start the transcription naturally based on the middle page content.
2. **Verbatim Text & Order (Middle Page):** Transcribe the textual content **from the main body** of the middle page accurately, preserving the original order of information, *subject to the critical omissions below and the continuity guideline above*. Focus on the primary block(s) of text containing the narrative or core information, excluding isolated header/footer lines unless they introduce a new section directly within that main block.
3. **Handling Visuals (Images, Tables, Charts, Diagrams, Equations on Middle Page):**
* **Describe the Content/Purpose:** When a visual element appears in the text flow, describe **what it shows or illustrates** rather than just noting its presence. Focus on its main point, key takeaway, or the information it conveys relevant to the surrounding text.
* **If a visual has NO caption:** Insert a concise, contextual description. Start with varied, natural phrasing (e.g., "Here is a diagram illustrating...", "The following table compares...", "A photograph shows...", "This chart displays trends in...", "An equation shows the relationship between..."). Use surrounding text (and context from previous/next pages if helpful) to inform the description.
* **If a visual HAS a caption:** Integrate the caption smoothly into the narration *as part of the description*. Do not just state "Caption reads..."; weave it in naturally (e.g., "As shown in the illustration depicting regional deforestation rates...", "The table, captioned '[caption text]', highlights...", "An equation, described as '[caption text]', defines..."). Read the essential parts of the caption text itself as part of the description.
* **Brevity for Visuals:** Keep descriptions concise. Summarize key trends, comparisons, the main subject of an image, or the purpose of an equation. Avoid listing exhaustive details or data points from tables/charts.
* **Crucially: Omit Numbers/Labels in Description:** When describing the visual or integrating its caption, do **not** mention its original number or label (e.g., 'Figure 3.1', 'Table 2', 'Equation 5'). The description should stand alone, focusing solely on the visual's content and purpose.
4. **Omissions (Middle Page Text):**
* **a. Page Numbers:** Omit entirely (from headers, footers, or body text).
* **b. Structural Numbers (Chapters, Sections, Headers, Lists):** Omit numbers that identify chapters, sections, sub-sections, or items in a numbered list, *when these numbers appear in titles, headers, or at the start of the list item itself*. (e.g., Read "Chapter: Title" not "Chapter 1: Title"; transcribe list items as continuous prose without the numbers).
* **c. Figure/Table/Equation Labels:** Omit standalone labels like "Figure 3.1", "Table 2a", "Chart 5", "Eq. 5".
* **d. Footnote/Endnote Markers:** Omit symbols or numbers indicating footnotes/endnotes (e.g., *, †, ¹, ²).
* **e. URLs/Web Addresses:** Omit entirely.
* **f. Citations, References, and Internal Pointers:** **MOST CRITICAL OMISSION:** **Absolutely and completely omit *any* text, marker, or phrase that refers to external sources, bibliographic entries, other authors' works, specific publications, OR internal pointers within the *same* book (page numbers, figure numbers, etc.).** Connect surrounding sentences smoothly. (Examples: `[15]`, `(Smith 2023)`, `"see Fig. 4"`, `"on page 123"` MUST BE OMITTED).
* **g. Running Headers/Footers:** **CRITICAL OMISSION:** Omit text appearing consistently in the header (top margin) or footer (bottom margin) area, clearly separate from the main text block(s), especially if it repeats chapter/section titles. **Focus transcription solely on the primary narrative content block(s).** Do *not* repeat a chapter or section title if it appears as a running header/footer on a page *after* the section has already begun. However, if a title or heading clearly marks the *beginning* of a new chapter or major section *within the main text block* on *this specific page*, it **should** be included (unless it's just a number, per 4b).
5. **Replacements & Formatting for TTS:**
* **a. Symbols:** Replace with spoken equivalents ("%" -> "percent"). Describe math meaning, don't read complex symbols literally.
* **b. Abbreviations:** Spell out common ones ("Figure" not "Fig."). Use common acronyms (NASA) if used as such.
* **c. Punctuation:** Use effectively for natural pauses.
* **d. Formatting Conversion:** Convert bullet points/numbered lists into flowing prose, **omitting the bullets/numbers**. Use connecting phrases if appropriate (e.g., "First,... Second,... Finally,...").
* **e. Disruptive Characters:** Avoid #, *, excessive parentheses.
*Output Format:**
* If the page is skipped (per Instruction #2), output **only** the single word `SKIPPED`.
* If the page is transcribed, provide **only** the final, clean transcribed text.
* **Crucially: Do NOT wrap the output text in ``` ```, backticks, code blocks, quotation marks, or any other formatting markers.** The output should be plain text suitable for direct use.
---
*Now, please process the provided middle book page image based on these strictly revised guidelines, ensuring smooth continuity if previous transcription context is provided.**
"""
# --- ADDED: TOC Analysis Prompt ---
PROMPT_TOC_ANALYSIS = """
# Prompt for PDF Table of Contents (TOC) Analysis
**Objective:** Analyze the provided Table of Contents (TOC) text extracted from a PDF book and determine the page range corresponding to the **main narrative content** (i.e., the core chapters or primary sections). Exclude pages belonging to typical front matter and back matter.
**Input:** The following text represents the Table of Contents extracted from the PDF:
```text
{toc_text}
```
**Definitions:**
* **Main Narrative Content:** The primary body of the book, usually organized into chapters (e.g., Chapter 1, Chapter 2...) or major titled sections that constitute the core story, argument, or information being presented.
* **Front Matter (Exclude):** Pages typically appearing *before* the main narrative. Examples include: Book Cover, Title Page, Copyright Page, Dedication, Epigraph, Table of Contents (the TOC itself), List of Figures, List of Tables, Foreword, Preface, Acknowledgments, Introduction (if it serves as preliminary material *before* the first main chapter/section).
* **Back Matter (Exclude):** Pages typically appearing *after* the main narrative. Examples include: Appendix/Appendices, Notes, Glossary, Bibliography, References, Index, Colophon, About the Author.
**Instructions:**
1. **Analyze the TOC:** Carefully examine the structure and titles in the provided `{toc_text}`. Identify the entries that most likely mark the beginning and end of the main narrative content, according to the definitions above.
2. **Identify Start Page:** Determine the page number associated with the *first* entry you identify as part of the main narrative content (e.g., the start of "Chapter 1", "Part I", or the first main section).
3. **Identify End Page:** Determine the page number associated with the *last* entry you identify as part of the main narrative content (e.g., the end of the last chapter or main section, *before* any back matter like Appendices or Index begins). If the end page isn't explicitly listed for the last main content item, use the page number of the *next* item (likely the start of back matter) minus one, or make a reasonable estimate based on surrounding page numbers.
4. **Handle Ambiguity:** If the TOC structure is unclear, or if it's difficult to definitively separate main content from front/back matter, make your best reasonable judgment. If you cannot confidently determine a range, state this in your reasoning.
5. **Output Format:** Respond **only** with a valid JSON object containing the determined start and end page numbers. Use `null` if a value cannot be determined.
```json
{{
"reasoning": "Brief explanation of how the start and end pages were identified (e.g., 'Main content starts with Chapter 1 on page 5 and ends with Chapter 10 on page 250, before the Appendix on page 251.'). If unable to determine, explain why.",
"start_page": <start_page_number_or_null>,
"end_page": <end_page_number_or_null>
}}
```
**Example Output 1 (Successful):**
```json
{{
"reasoning": "Identified 'Chapter 1: The Beginning' starting on page 12 as the main content start. The last main section 'Chapter 20: Conclusions' appears to end before the 'Appendix A' which starts on page 345. Estimated end page as 344.",
"start_page": 12,
"end_page": 344
}}
```
**Example Output 2 (Cannot Determine):**
```json
{{
"reasoning": "The provided TOC is very sparse and lacks clear chapter/section markers or page numbers. Cannot reliably distinguish main content from other sections.",
"start_page": null,
"end_page": null
}}
```
**Now, analyze the provided TOC text and return the JSON output.**
"""
# --- Logging Setup ---
logger = logging.getLogger(__name__)
# Basic config will be set later based on args
# Load environment variables from .env file
load_dotenv(".env", verbose=True)
# --- Constants ---
RESUME_STATE_FILENAME = "_pdftotts_resume_state.json"
END_MARKER = "The End."
TEMP_IMAGE_SUBDIR = ".pdftotts_images" # Used ONLY in --book-dir mode
SYSTEM_TEMP_DIR_PATTERN = "/tmp/image_*.png" # For Camel toolkit temp files
# --- Helper Functions ---
def extract_page_number(filename: Path) -> Optional[int]:
"""Extracts the page number from filenames like 'prefix-digits.ext'. (Used only for --input-dir mode)"""
match = re.search(r"-(\d+)\.\w+$", filename.name)
if match:
try:
return int(match.group(1))
except ValueError:
logger.warning(f"Could not convert extracted number to int in filename: {filename.name}")
return None
else:
# Try alternative: only digits before extension
match_alt = re.search(r"(\d+)\.\w+$", filename.name)
if match_alt:
try:
# Less stringent check for image mode
logger.debug(f"Found digits {match_alt.group(1)} in {filename.name}. Assuming it's the page number.")
return int(match_alt.group(1))
except ValueError:
logger.warning(f"Could not convert extracted number to int in filename: {filename.name}")
return None
else:
logger.warning(f"Could not extract page number (format '-digits.' or 'digits.') from filename: {filename.name}")
return None
# --- MODIFIED: Helper Function to generate composite image (can be run in thread) ---
def create_composite_image_task(image_paths: List[Path], output_dir: Path, base_name: str) -> Optional[Path]:
"""Creates a temporary composite image from exactly 3 images using ImageMagick's montage tool.
This is the function to be executed in a background thread."""
if not image_paths or len(image_paths) != 3:
logger.warning(f"Cannot create composite task: Need exactly 3 image paths, got {len(image_paths)} for base {base_name}.")
return None
# Ensure all input paths actually exist before attempting montage
missing_files = [p for p in image_paths if not p.exists()]
if missing_files:
logger.error(f"Cannot create composite task: Missing input image(s): {[str(p.name) for p in missing_files]} for base {base_name}.")
return None
image_paths_str = ' '.join(f'"{path}"' for path in image_paths)
# Composites will go directly into the specified output_dir
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{base_name}_composite.png"
# If composite already exists, return it (might happen on retry/overlap)
if output_path.exists():
logger.debug(f"Composite image {output_path.name} already exists. Returning path.")
return output_path
command = f"montage -mode concatenate -tile x1 {image_paths_str} \"{output_path}\""
try:
logger.debug(f"Running montage command in background thread: {command}")
result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True, encoding='utf-8')
if output_path.exists():
logger.info(f"Background task created composite image: {output_path.name}")
return output_path
else:
logger.error(f"Montage command ran (bg) but output file not found: {output_path.name}")
logger.error(f"Montage (bg) stdout: {result.stdout}")
logger.error(f"Montage (bg) stderr: {result.stderr}")
return None
except subprocess.CalledProcessError as e:
logger.error(f"Error creating composite image (bg) with command '{command}': {e}")
logger.error(f"Montage (bg) stdout: {e.stdout}")
logger.error(f"Montage (bg) stderr: {e.stderr}")
# Raise the error so the Future captures it
raise e
except FileNotFoundError:
logger.error("Error: 'montage' command not found. Please ensure ImageMagick is installed and in your system's PATH.")
raise RuntimeError("ImageMagick 'montage' command not found.")
except Exception as e:
logger.error(f"Unexpected error creating composite image {base_name} (bg): {e}", exc_info=True)
# Raise the error so the Future captures it
raise e
def transcribe_image(
active_toolkit: ImageAnalysisToolkit,
image_path: Path,
original_middle_page_name: str,
previous_transcription: Optional[str] = None,
prompt_template: str = PROMPT_TEMPLATE
) -> Tuple[Optional[str], bool]:
"""
Uses the provided ImageAnalysisToolkit instance to transcribe text from a composite image.
Handles internal retries but signals overall success/failure for external failover logic.
Returns:
Tuple[Optional[str], bool]: (Transcription text or None, success_flag)
success_flag is True if API call worked AND returned valid content.
False otherwise (API error, timeout, empty/invalid content).
"""
logger.info(f"Transcribing composite image representing middle page: {original_middle_page_name} using {active_toolkit.model.model_type.value}")
if previous_transcription:
max_context_chars = 2000
truncated_context = previous_transcription[-max_context_chars:]
context_section = f"<previous_transcription>\n{truncated_context}\n</previous_transcription>"
logger.debug(f"Providing last ~{len(truncated_context)} chars of previous transcription as context.")
else:
context_section = "<previous_transcription>No previous transcription context provided.</previous_transcription>"
logger.debug("No previous transcription context provided.")
final_prompt = prompt_template.format(previous_transcription_context_section=context_section)
transcription = None
try:
retry_delay = 2
max_retries = 3 # Internal retries per toolkit attempt
retries = 0
while retries < max_retries:
call_success = False
try:
# Call the model
transcription = active_toolkit.ask_question_about_image(
image_path=str(image_path),
question=final_prompt
)
call_success = True # If no exception, API call itself succeeded
except Exception as e_inner:
logger.warning(f"Transcription attempt {retries + 1}/{max_retries} API call failed for {original_middle_page_name} (using {active_toolkit.model.model_type.value}): {e_inner}. Retrying in {retry_delay}s...")
# call_success remains False
# Process the result if the API call worked
if call_success:
if transcription:
# Clean up potential markdown/code blocks
transcription = transcription.strip()
transcription = re.sub(r'^```(text|markdown)?\s*', '', transcription, flags=re.IGNORECASE | re.MULTILINE)
transcription = re.sub(r'\s*```$', '', transcription, flags=re.IGNORECASE | re.MULTILINE)
transcription = transcription.strip('"\'') # Remove leading/trailing quotes
if transcription and "Analysis failed" not in transcription and len(transcription) > 5:
logger.info(f"Transcription successful for {original_middle_page_name} (attempt {retries+1} with {active_toolkit.model.model_type.value}). Length: {len(transcription)}")
return transcription, True # Return result and signal external success
else:
logger.warning(f"Transcription attempt {retries + 1}/{max_retries} produced short/invalid output for {original_middle_page_name} (using {active_toolkit.model.model_type.value}). Content: '{transcription[:50]}...'. Treating as failure for this attempt.")
transcription = None # Reset for retry
# Do NOT proceed to retry wait/increment here; let the outer loop handle it
else: # API call succeeded but returned empty transcription
logger.warning(f"Received empty transcription on attempt {retries + 1}/{max_retries} for {original_middle_page_name} (using {active_toolkit.model.model_type.value}). Treating as failure for this attempt.")
# Do NOT proceed to retry wait/increment here; let the outer loop handle it
# Retry logic (only if needed based on call_success or invalid content)
if not call_success or transcription is None:
sleep(retry_delay)
retry_delay *= 2
retries += 1
continue # Go to next retry
else: # Should only happen if content was valid
break # Exit the loop, transcription is valid
# If loop finished without returning valid data
if transcription is None:
logger.error(f"Transcription FAILED after {max_retries} internal attempts for {original_middle_page_name} (using {active_toolkit.model.model_type.value}).")
return None, False # Return failure signal
# This path should ideally not be reached if logic is correct, but as safety:
return transcription, True # Assume success if we broke loop with valid transcription
except Exception as e:
logger.error(f"Unhandled error during transcription retry loop for {image_path.name} ({original_middle_page_name}): {e}", exc_info=True)
return None, False # Return failure signal
finally:
cleanup_system_temp_files() # Cleanup Camel's temp files
# --- NEW: Function to convert a single PDF page ---
def convert_pdf_page_to_image(
pdf_path: Path,
page_number: int,
output_dir: Path,
image_format: str = 'png',
dpi: int = 300
) -> Optional[Path]:
"""
Converts a single page of a PDF file to an image using pdftoppm.
Args:
pdf_path: Path to the input PDF file.
page_number: The 1-based page number to convert.
output_dir: The directory to save the output image.
image_format: Output image format (e.g., 'png', 'jpeg').
dpi: Resolution for the output image.
Returns:
The Path to the created image file, or None if conversion failed.
"""
if not pdf_path.is_file():
logger.error(f"PDF file not found for page conversion: {pdf_path}")
return None
if page_number < 1:
logger.error(f"Invalid page number {page_number}. Must be 1 or greater.")
return None
if shutil.which("pdftoppm") is None:
logger.critical("Critical Error: 'pdftoppm' command not found. Please ensure poppler-utils is installed.")
raise RuntimeError("pdftoppm not found") # Raise error to stop processing
output_dir.mkdir(parents=True, exist_ok=True)
# Determine padding based on total pages
total_pages = get_pdf_page_count(pdf_path) # Use helper function
if total_pages is None:
logger.warning(f"Could not get total page count for {pdf_path.name}, using default padding.")
padding_width = 3
else:
padding_width = len(str(total_pages))
# Construct expected filename based on pdftoppm behavior
output_prefix_path_part = output_dir / pdf_path.stem
expected_output_filename = f"{pdf_path.stem}-{page_number:0{padding_width}d}.{image_format}"
expected_output_path = output_dir / expected_output_filename
# If the file already exists, assume it's correct and return path
if expected_output_path.exists():
logger.debug(f"Image for page {page_number} already exists: {expected_output_path.name}. Skipping conversion.")
return expected_output_path
command = [
"pdftoppm",
f"-{image_format}",
"-r", str(dpi),
"-f", str(page_number), # First page to process
"-l", str(page_number), # Last page to process
str(pdf_path),
str(output_prefix_path_part) # Base path/prefix for output
]
try:
logger.debug(f"Converting page {page_number} of '{pdf_path.name}' using command: {' '.join(command)}")
result = subprocess.run(command, check=True, capture_output=True, text=True, encoding='utf-8')
logger.debug(f"pdftoppm stdout for page {page_number}: {result.stdout}")
if result.stderr:
logger.debug(f"pdftoppm stderr for page {page_number}: {result.stderr}")
# Verify the specific output file was created
if expected_output_path.exists():
logger.info(f"Successfully converted page {page_number} to: {expected_output_path.name}")
return expected_output_path
else:
logger.error(f"pdftoppm command ran for page {page_number} but expected output file not found: {expected_output_path.name}")
# Log directory contents for debugging
try:
files_in_dir = list(output_dir.glob(f'{pdf_path.stem}-*.{image_format}')) # Be more specific
logger.error(f"PDF-related files currently in {output_dir}: {[f.name for f in files_in_dir]}")
# Attempt to find a file with the correct number but maybe different padding
alternative_match = list(output_dir.glob(f"{pdf_path.stem}-{page_number}.{image_format}")) # Check without strict padding
if alternative_match:
logger.warning(f"Found match with potentially different padding: {alternative_match[0].name}. Returning this.")
return alternative_match[0]
else: logger.error("No alternative file match found either.")
except Exception as e_list: logger.error(f"Could not list directory contents for debugging: {e_list}")
return None # Indicate failure
except subprocess.CalledProcessError as e:
logger.error(f"Error converting page {page_number} of '{pdf_path.name}': {e}")
logger.error(f"pdftoppm stdout: {e.stdout}")
logger.error(f"pdftoppm stderr: {e.stderr}")
return None
except Exception as e:
logger.error(f"Unexpected error during PDF page {page_number} conversion for {pdf_path.name}: {e}", exc_info=True)
return None
# --- State Management Functions ---
# load_resume_state and save_resume_state remain unchanged
def load_resume_state(state_file_path: Path) -> Dict[str, Any]:
"""Loads the resume state from a JSON file."""
if not state_file_path.exists():
return {}
try:
with open(state_file_path, 'r', encoding='utf-8') as f:
raw_state_data = json.load(f)
logger.info(f"Loaded resume state from {state_file_path}")
# Check format and convert if necessary (keeping previous logic)
converted_state_data = {}
needs_resave = False
for pdf_filename, data in raw_state_data.items():
if isinstance(data, int): # Old format
logger.warning(f"Detected old state format for '{pdf_filename}'. Converting.")
converted_state_data[pdf_filename] = {
"last_processed": data,
"detected_start": None, "detected_end": None
}
needs_resave = True
elif isinstance(data, dict): # Assume new format
if "last_processed" not in data: data["last_processed"] = None
converted_state_data[pdf_filename] = data
else: logger.warning(f"Skipping unrecognized state entry for '{pdf_filename}'. Data type: {type(data)}")
if needs_resave:
logger.info("Resaving state file in new format after conversion.")
save_resume_state(state_file_path, converted_state_data)
return converted_state_data
except json.JSONDecodeError: logger.warning(f"Error decoding JSON state: {state_file_path}. Starting fresh."); return {}
except IOError as e: logger.error(f"Error reading state file {state_file_path}: {e}. Starting fresh."); return {}
except Exception as e: logger.error(f"Unexpected error loading state {state_file_path}: {e}. Starting fresh.", exc_info=True); return {}
def save_resume_state(state_file_path: Path, state_data: Dict[str, Any]):
"""Saves the resume state (new format) to a JSON file."""
try:
state_file_path.parent.mkdir(parents=True, exist_ok=True)
# Filter out entries that are completely empty or only have nulls
cleaned_state = {}
for pdf, data in state_data.items():
if isinstance(data, dict) and any(v is not None for v in data.values()):
cleaned_state[pdf] = data
if not cleaned_state:
logger.debug(f"State data is empty, removing state file: {state_file_path}")
if state_file_path.exists(): state_file_path.unlink(missing_ok=True)
return
with open(state_file_path, 'w', encoding='utf-8') as f:
json.dump(cleaned_state, f, indent=4)
logger.debug(f"Saved resume state to {state_file_path}")
except IOError as e: logger.error(f"Error writing state file {state_file_path}: {e}")
except Exception as e: logger.error(f"Unexpected error saving state {state_file_path}: {e}", exc_info=True)
# --- MODIFIED: PDF Page Processing Function handles failover state ---
def process_pdf_pages(
pdf_path: Path,
num_pdf_pages: int,
pdf_image_dir: Path,
output_path: Optional[Path],
append_mode: bool,
start_page: Optional[int],
end_page: Optional[int],
failover_state: Dict[str, Any], # Pass mutable state dictionary
prompt_template: str,
initial_context: Optional[str],
pdf_filename: str,
state_file_path: Optional[Path],
state_data: Dict[str, Any],
image_format: str = 'png',
dpi: int = 300,
executor: concurrent.futures.ThreadPoolExecutor = None
) -> Tuple[bool, Optional[str]]:
"""
Processes pages of a PDF with async composite generation and uses failover models.
Updates the passed `failover_state` dictionary directly.
"""
logger.info(f"Processing PDF: {pdf_filename} [Effective Pages {start_page or 1} to {end_page or num_pdf_pages}] with Async Composite")
# Get toolkit instances from failover state for easier access
t_primary_tk = failover_state['t_primary_tk']
t_backup_tk = failover_state['t_backup_tk']
# t_active_tk and t_failures will be accessed directly via failover_state dict
files_processed_count = 0
last_successful_transcription = initial_context
processing_successful = True
current_images_on_disk: Set[Path] = set()
if not executor:
logger.critical("ThreadPoolExecutor must be provided for process_pdf_pages.")
return False, initial_context
temp_composite_dir = pdf_image_dir / ".composites"
def get_image_path(page_num: int) -> Path:
# Reusing get_pdf_page_count to determine padding needed
total_pages = get_pdf_page_count(pdf_path)
padding_width = len(str(total_pages)) if total_pages else 3
return pdf_image_dir / f"{pdf_path.stem}-{page_num:0{padding_width}d}.{image_format}"
def ensure_page_image_exists(page_num: int) -> Optional[Path]:
if page_num < 1 or page_num > num_pdf_pages: return None
img_path = get_image_path(page_num)
if img_path not in current_images_on_disk and not img_path.exists():
logger.info(f"Generating missing image for page {page_num}...")
generated_path = convert_pdf_page_to_image(pdf_path, page_num, pdf_image_dir, image_format, dpi)
if generated_path: current_images_on_disk.add(generated_path); return generated_path
else: logger.error(f"Failed to generate required image for page {page_num}."); return None
elif img_path.exists():
if img_path not in current_images_on_disk: current_images_on_disk.add(img_path)
return img_path
else: logger.error(f"Logic error: Image path {img_path.name} not in cache set and does not exist."); return None
def delete_page_image(page_num: int):
if page_num < 1: return
img_path = get_image_path(page_num)
if img_path in current_images_on_disk:
try: img_path.unlink(); logger.debug(f"Deleted page image: {img_path.name}"); current_images_on_disk.remove(img_path)
except OSError as e_del: logger.warning(f"Could not delete page image {img_path.name}: {e_del}")
elif img_path.exists():
try: img_path.unlink(); logger.debug(f"Deleted stray page image: {img_path.name}")
except OSError as e_del_stray: logger.warning(f"Could not delete stray page image {img_path.name}: {e_del_stray}")
try:
pdf_image_dir.mkdir(parents=True, exist_ok=True)
temp_composite_dir.mkdir(exist_ok=True)
logger.info(f"Using temporary image directory: {pdf_image_dir}")
logger.info(f"Using temporary composite directory: {temp_composite_dir}")
except OSError as e_comp_dir:
logger.critical(f"Could not create temporary dirs under {pdf_image_dir}: {e_comp_dir}. Cannot process PDF.")
return False, initial_context
if output_path and not append_mode:
try:
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f_init: f_init.truncate(0)
logger.info(f"Output file {output_path} truncated (or created).")
except IOError as e: logger.error(f"Error preparing output file {output_path}: {e}"); return False, initial_context
loop_start = start_page if start_page is not None else 1
loop_end = end_page if end_page is not None else num_pdf_pages
logger.info(f"Looping through pages {loop_start} to {loop_end} (inclusive) for {pdf_filename}.")
pdf_state = state_data.get(pdf_filename, {})
current_composite_future: Optional[concurrent.futures.Future] = None
next_composite_future: Optional[concurrent.futures.Future] = None
# Pre-submission logic (remains the same)
first_transcribe_page = max(loop_start, 2 if loop_start < 2 else loop_start) # Ensure we don't try to get page 0
if first_transcribe_page <= loop_end:
logger.info(f"Pre-submitting composite generation task for first transcribable page: {first_transcribe_page}")
p_prev_first, p_curr_first, p_next_first = first_transcribe_page - 1, first_transcribe_page, first_transcribe_page + 1
if p_prev_first >= 1 and p_next_first <= num_pdf_pages:
paths_first = [ensure_page_image_exists(p) for p in [p_prev_first, p_curr_first, p_next_first]]
if all(p is not None for p in paths_first):
base_name_first = f"page_{p_curr_first:04d}"
current_composite_future = executor.submit(create_composite_image_task, paths_first, temp_composite_dir, base_name_first)
logger.debug(f"Submitted initial composite task for page {p_curr_first}")
else: logger.error(f"Could not generate necessary images for the first composite ({p_curr_first}). Cannot start pre-processing."); processing_successful = False
else: logger.info(f"First transcribable page {first_transcribe_page} too close to edge {1}-{num_pdf_pages}. Will handle in loop.")
else: logger.info(f"No pages within the specified range {loop_start}-{loop_end} to process."); processing_successful = True
try:
for page_num in range(loop_start, loop_end + 1):
if not processing_successful:
logger.warning(f"Terminating loop for {pdf_filename} due to prior failure.")
break # Exit loop if a fatal error occurred
logger.info(f"--- Current target page: {page_num}/{loop_end} ({pdf_filename}) ---")
pdf_state = state_data.get(pdf_filename, {}) # Refresh state in case it was modified
# Submit next composite task in background (same logic)
next_composite_page_num = page_num + 1
p_curr_next, p_next_next, p_after_next = next_composite_page_num - 1, next_composite_page_num, next_composite_page_num + 1
if next_composite_page_num <= loop_end and p_after_next <= num_pdf_pages:
paths_next = [ensure_page_image_exists(p) for p in [p_curr_next, p_next_next, p_after_next]]
if all(p is not None for p in paths_next):
base_name_next = f"page_{next_composite_page_num:04d}"
next_composite_future = executor.submit(create_composite_image_task, paths_next, temp_composite_dir, base_name_next)
logger.info(f"Submitted composite task for next page {next_composite_page_num}")
else: logger.warning(f"Could not generate images for next composite ({next_composite_page_num})."); next_composite_future = None
else: logger.debug(f"Not submitting next composite task: page {next_composite_page_num} is near end or out of bounds."); next_composite_future = None
p_prev = page_num - 1
p_curr = page_num
p_next = page_num + 1
# --- Wait + Transcription + Failover Logic ---
# Only transcribe if we can form the 3-page window
if p_prev < 1 or p_next > num_pdf_pages:
logger.info(f"Skipping transcription for edge page {page_num}: Cannot form 3-page window.")
if state_file_path:
pdf_state["last_processed"] = page_num
state_data[pdf_filename] = pdf_state
save_resume_state(state_file_path, state_data)
# Move to next composite immediately if available
current_composite_future = next_composite_future
next_composite_future = None
delete_page_image(page_num - 2) # Cleanup previous page image
continue # Skip to next page_num in loop
# We can form a window, wait for the *current* composite future
if current_composite_future:
logger.debug(f"Waiting for composite result for page {page_num}...")
temp_composite_img_path = None
composite_exception = None
try:
temp_composite_img_path = current_composite_future.result(timeout=600) # Wait up to 10 mins
except concurrent.futures.TimeoutError: logger.error(f"Timeout waiting for composite generation for page {page_num}."); composite_exception = TimeoutError("Composite generation timed out")
except Exception as e_future: logger.error(f"Composite generation failed for page {page_num}: {e_future}"); composite_exception = e_future
if temp_composite_img_path and temp_composite_img_path.exists():
logger.info(f"Composite ready for page {page_num}: {temp_composite_img_path.name}")
files_processed_count += 1
middle_page_name = get_image_path(p_curr).name
transcription = None
call_success = False
try:
# --- Transcribe using the ACTIVE toolkit ---
# transcribe_image internally handles retries for the ACTIVE toolkit, but returns success/fail status
transcription, call_success = transcribe_image(
active_toolkit=failover_state['t_active_tk'], # Use active from state
image_path=temp_composite_img_path,
original_middle_page_name=middle_page_name,
previous_transcription=last_successful_transcription,
prompt_template=prompt_template
)
# --- Handle Transcription Result and Failover ---
if call_success:
# SUCCESS: Reset failure counter
failover_state['t_failures'] = 0
logger.debug(f"Transcription API call successful for page {page_num}, reset failure counter to 0.")
# Process the transcription (skipped or content)
transcription = transcription.strip() # transcribe_image should have cleaned, but be safe
if transcription.upper() == "SKIPPED":
logger.info(f"SKIPPED page {page_num} ({middle_page_name}) due to ancillary content.")
# Update state, no context change, NOT a failure
if state_file_path:
pdf_state["last_processed"] = page_num
state_data[pdf_filename] = pdf_state
save_resume_state(state_file_path, state_data)
else:
# Write to output
if output_path:
try:
with open(output_path, 'a', encoding='utf-8') as f: f.write(transcription + "\n\n")
logger.info(f"Appended transcription for page {page_num} to {output_path.name}")
except IOError as e: logger.error(f"Error writing to {output_path}: {e}"); processing_successful = False; break # Break if write fails
else: print(f"--- Page {page_num} ({middle_page_name}) ---\n{transcription}\n--- End ---\n")
last_successful_transcription = transcription # Update context FOR NEXT PAGE
# Update resume state ONLY after successful processing & write
if state_file_path:
pdf_state["last_processed"] = page_num
state_data[pdf_filename] = pdf_state
save_resume_state(state_file_path, state_data)
else: # FAILURE (API error, timeout, or invalid content from transcribe_image)
logger.warning(f"--- Transcription FAILED for page {page_num} ({middle_page_name}) using {failover_state['t_active_tk'].model.model_type.value} ---")
failover_state['t_failures'] += 1
logger.info(f"Transcription failure count incremented to: {failover_state['t_failures']}")
# Do NOT update last_processed or context on failure
processing_successful = False # Signal failure for this book
# Check if failover threshold reached
if failover_state['t_failures'] >= FAILURE_THRESHOLD:
current_active_model = failover_state['t_active_tk'].model.model_type.value
if failover_state['t_active_tk'] == t_primary_tk and t_backup_tk:
failover_state['t_active_tk'] = t_backup_tk
logger.warning(f"Reached {FAILURE_THRESHOLD} consecutive transcription failures on PRIMARY ({current_active_model}). Switching to BACKUP model: {t_backup_tk.model.model_type.value}")
elif failover_state['t_active_tk'] == t_backup_tk and t_primary_tk:
failover_state['t_active_tk'] = t_primary_tk
logger.warning(f"Reached {FAILURE_THRESHOLD} consecutive transcription failures on BACKUP ({current_active_model}). Switching back to PRIMARY model: {t_primary_tk.model.model_type.value}")
else:
logger.error(f"Reached {FAILURE_THRESHOLD} failures, but cannot switch from {current_active_model} - primary or backup missing or same?")
if t_primary_tk != t_backup_tk: # Only reset if models actually switched
failover_state['t_failures'] = 0 # Reset counter after switching
logger.info("Transcription failure counter reset after model switch.")
else:
logger.warning("Primary and backup models are the same, fail counter not reset after threshold.")
# Break the loop for this PDF on failure
break
except Exception as e_transcribe:
logger.error(f"Unhandled error during transcription block for page {page_num} ({middle_page_name}): {e_transcribe}", exc_info=True)
processing_successful = False
# Count exception as failure for failover
failover_state['t_failures'] += 1
logger.info(f"Transcription failure count incremented to: {failover_state['t_failures']} due to exception.")
# Repeat failover check logic (consider refactoring this)
if failover_state['t_failures'] >= FAILURE_THRESHOLD:
current_active_model = failover_state['t_active_tk'].model.model_type.value
if failover_state['t_active_tk'] == t_primary_tk and t_backup_tk: failover_state['t_active_tk'] = t_backup_tk; logger.warning(f"Reached {FAILURE_THRESHOLD} failures (exception). Switching to BACKUP: {t_backup_tk.model.model_type.value}")
elif failover_state['t_active_tk'] == t_backup_tk and t_primary_tk: failover_state['t_active_tk'] = t_primary_tk; logger.warning(f"Reached {FAILURE_THRESHOLD} failures (exception). Switching back to PRIMARY: {t_primary_tk.model.model_type.value}")
else: logger.error(f"Reached {FAILURE_THRESHOLD} failures (exception), but cannot switch from {current_active_model}.")
if t_primary_tk != t_backup_tk: failover_state['t_failures'] = 0; logger.info("Transcription failure counter reset after model switch.")
else: logger.warning("Primary and backup models are the same, fail counter not reset after threshold.")
break # Break loop on unhandled error
finally:
if temp_composite_img_path and temp_composite_img_path.exists():
try: temp_composite_img_path.unlink(); logger.debug(f"Deleted used composite: {temp_composite_img_path.name}")
except OSError as e_del_comp: logger.warning(f"Could not delete used composite {temp_composite_img_path.name}: {e_del_comp}")
elif composite_exception:
logger.error(f"Skipping transcription for page {page_num} because composite generation failed: {composite_exception}")
processing_successful = False; break # Fail this book
else:
logger.error(f"Composite future completed for page {page_num}, but path is invalid or file missing. Skipping.")
processing_successful = False; break # Fail this book
else:
logger.warning(f"No current composite future available to wait for page {page_num}. Check pre-submission logic. Skipping transcription attempt.")
# Don't necessarily fail the whole book, but log warning
# --- Advance to next page's composite future ---
current_composite_future = next_composite_future
next_composite_future = None # Clear it so we know to submit next iteration
delete_page_image(page_num - 2) # Cleanup image from two pages ago
# End of page loop
if not processing_successful:
logger.error(f"Loop terminated prematurely for PDF {pdf_filename} due to failures.")
finally:
logger.debug(f"Performing final image cleanup for {pdf_filename}...")
images_to_delete = list(current_images_on_disk)
for img_path in images_to_delete:
try: delete_page_image(int(re.search(r"-(\d+)\.[^.]+$", img_path.name).group(1)))
except (AttributeError, ValueError, TypeError) as e_parse: logger.warning(f"Could not parse page num from {img_path.name} for final cleanup: {e_parse}")
except Exception as e_final_del: logger.warning(f"Error in final cleanup helper for {img_path.name}: {e_final_del}")
if temp_composite_dir.exists():
try: shutil.rmtree(temp_composite_dir); logger.debug(f"Cleaned up composites directory: {temp_composite_dir}")
except OSError as e_final_clean: logger.warning(f"Could not clean up composites directory {temp_composite_dir}: {e_final_clean}")
logger.info(f"Finished loop for PDF {pdf_filename}. Processed {files_processed_count} pages within effective range.")
return processing_successful, last_successful_transcription
# --- Model Parsing and Platform Inference ---
# parse_model_string_to_enum remains mostly unchanged
def parse_model_string_to_enum(model_string: str, model_purpose: str) -> Optional[Union[ModelType, str]]:
"""Parses a model string to a base ModelType enum or a recognized unified string."""
logger.debug(f"Attempting to parse model string for {model_purpose}: '{model_string}'")
if not isinstance(model_string, str): logger.error(f"Invalid model string type ({type(model_string)})."); return None
# 1. Try base ModelType enum
try:
model_enum = ModelType(model_string); logger.info(f"Parsed '{model_string}' as base ModelType: {model_enum}"); return model_enum
except ValueError: logger.debug(f"'{model_string}' not a base ModelType value.")
# 2. Try custom 'mymodel' enum
try:
custom_enum_member = mymodel(model_string); logger.info(f"Parsed '{model_string}' as custom 'mymodel'. Returning value: '{custom_enum_member.value}'"); return custom_enum_member.value
except ValueError: logger.debug(f"'{model_string}' not a custom 'mymodel' value.")
# 3. Check for unified 'provider/model' format (if not already matched by custom enum)
if "/" in model_string:
logger.info(f"'{model_string}' looks like a generic unified model string."); return model_string
# 4. Fallback
logger.error(f"Invalid or unrecognized model string for {model_purpose}: '{model_string}'.")
return None
# get_platform_from_model_type remains mostly unchanged
def get_platform_from_model_type(model_type_repr: Union[ModelType, str]) -> Optional[ModelPlatformType]:
"""Infers the ModelPlatformType from a ModelType enum or model string."""
unified_platform_map = {
"openrouter": ModelPlatformType.OPENROUTER, "meta-llama": ModelPlatformType.OPENROUTER,
"google": ModelPlatformType.OPENROUTER, # Assume OpenRouter for google/ prefix
"qwen": ModelPlatformType.QWEN, "anthropic": ModelPlatformType.ANTHROPIC,
"deepseek": ModelPlatformType.DEEPSEEK, "groq": ModelPlatformType.GROQ,
}
original_input = model_type_repr
if isinstance(model_type_repr, str):
model_string_cleaned = model_type_repr.split(':')[0]
try: enum_match = ModelType(model_string_cleaned); model_type_repr = enum_match # Try convert to base enum
except ValueError:
if "/" in model_string_cleaned:
provider = model_string_cleaned.split('/')[0].lower()
platform = unified_platform_map.get(provider)
if platform: logger.debug(f"Inferred platform '{platform}' from unified: {original_input}"); return platform
else: logger.warning(f"Unknown provider prefix '{provider}'. Defaulting to OPENAI."); return ModelPlatformType.OPENAI
else: logger.error(f"Could not determine platform for string: {original_input}."); return None
if isinstance(model_type_repr, ModelType):
model_name = model_type_repr.name.upper()
if model_name.startswith("GPT") or model_name.startswith("O3"): return ModelPlatformType.OPENAI
if model_name.startswith("GEMINI"): return ModelPlatformType.GEMINI # Direct Google API
if model_name.startswith("AZURE"): return ModelPlatformType.AZURE
if model_name.startswith("QWEN"): return ModelPlatformType.QWEN
if model_name.startswith("DEEPSEEK"): return ModelPlatformType.DEEPSEEK
if model_name.startswith("GROQ"): return ModelPlatformType.GROQ
if "LLAMA" in model_name or "MIXTRAL" in model_name: logger.warning(f"Assuming OPENAI compatibility for {model_name}. Check env."); return ModelPlatformType.OPENAI
if model_name.startswith("CLAUDE"): return ModelPlatformType.ANTHROPIC
logger.warning(f"Could not infer platform for enum {model_type_repr}. Defaulting to OPENAI."); return ModelPlatformType.OPENAI
else: logger.error(f"Invalid type '{type(original_input)}' in get_platform_from_model_type."); return None
# check_if_file_completed remains unchanged
def check_if_file_completed(file_path: Path, marker: str) -> bool:
"""Checks if the file exists and ends with the specified marker."""
if not file_path.exists() or file_path.stat().st_size == 0: return False
try:
with open(file_path, 'r', encoding='utf-8') as f:
read_size = len(marker) + 100; f.seek(max(0, file_path.stat().st_size - read_size))
return f.read().strip().endswith(marker)
except Exception as e: logger.warning(f"Could not read end of {file_path}: {e}"); return False
# --- ADDED: TOC Extraction Function ---
# extract_toc remains unchanged
def extract_toc(pdf_path: Path) -> Optional[str]:
"""Extracts the table of contents (outline) from a PDF."""
toc_entries = []
try:
reader = PyPDF2.PdfReader(str(pdf_path))
if not reader.outline: logger.warning(f"No Outline/TOC found in '{pdf_path.name}'"); return None
logger.info(f"Extracting TOC from '{pdf_path.name}'...")
def _recursive_outline_extract(items, level=0):
for item in items:
if isinstance(item, list): _recursive_outline_extract(item, level + 1)
else:
try:
page_num = item.page_number + 1 # PyPDF2 v3+ preference
title = item.title.strip() if item.title else "Untitled"
toc_entries.append(f"{' ' * level}- {title} (Page {page_num})")
except AttributeError: logger.warning(f"Skipping TOC item without standard page_number/title: {item}")
except Exception as e_item: logger.warning(f"Error processing TOC item '{getattr(item, 'title', 'N/A')}': {e_item}")
_recursive_outline_extract(reader.outline)
if not toc_entries: logger.warning(f"PDF '{pdf_path.name}' has outline, but no entries extracted."); return None
toc_text = "\n".join(toc_entries); logger.info(f"Extracted {len(toc_entries)} TOC entries."); return toc_text
except (PdfReadError, DependencyError) as e: logger.error(f"Error reading PDF '{pdf_path.name}' for TOC: {e}"); return None
except Exception as e: logger.error(f"Unexpected TOC extraction error '{pdf_path.name}': {e}", exc_info=True); return None
# --- MODIFIED: TOC Analysis Function handles failover state ---
def get_main_content_page_range(
toc_text: str,
failover_state: Dict[str, Any], # Pass mutable state
prompt_template: str = PROMPT_TOC_ANALYSIS
) -> Tuple[Optional[int], Optional[int]]:
"""
Uses the active AI model (from failover state) to analyze TOC text.
Handles failover logic internally. Updates failover_state.
"""
# Get model backends from failover state
a_primary_backend = failover_state['a_primary_backend']
a_backup_backend = failover_state['a_backup_backend']
# a_active_backend and a_failures are accessed directly
logger.info(f"Analyzing TOC with ACTIVE model: {failover_state['a_active_backend'].model_type.value}")
final_prompt = prompt_template.format(toc_text=toc_text)
start_page, end_page = None, None
try:
retry_delay = 2; max_retries = 2; retries = 0; raw_response = None; call_success = False
while retries < max_retries:
active_backend = failover_state['a_active_backend'] # Use current active
try:
messages = [{"role": "user", "content": final_prompt}]
response_obj = active_backend.run(messages=messages)
if response_obj and response_obj.choices and response_obj.choices[0].message:
raw_response = response_obj.choices[0].message.content
logger.debug(f"Raw analysis response (attempt {retries + 1} using {active_backend.model_type.value}):\n{raw_response[:500]}...")
call_success = True; break # Success
else: logger.warning(f"TOC analysis ({active_backend.model_type.value}) returned invalid response structure (attempt {retries+1})."); raw_response = None; call_success = False
except Exception as e_inner: logger.warning(f"TOC analysis API call failed (attempt {retries + 1}/{max_retries} using {active_backend.model_type.value}): {e_inner}. Retrying in {retry_delay}s..."); call_success = False
if not call_success: sleep(retry_delay); retry_delay *= 2; retries += 1
else: break # Should have broken already
# --- Handle Analysis Result and Failover ---
if call_success and raw_response:
failover_state['a_failures'] = 0 # SUCCESS: Reset failure counter
logger.debug("Analysis API call successful, reset failure counter.")
try:
json_match = re.search(r"```json\s*(\{.*?\})\s*```", raw_response, re.DOTALL | re.IGNORECASE) or re.search(r"(\{.*?\})", raw_response, re.DOTALL)
if json_match:
json_str = json_match.group(1); data = json.loads(json_str)
logger.info(f"Parsed TOC analysis result: {data.get('reasoning', 'N/A')}")
s_page, e_page = data.get("start_page"), data.get("end_page")
if isinstance(s_page, int) and s_page > 0: start_page = s_page
elif s_page is not None: logger.warning(f"Invalid start_page: {s_page}.")
if isinstance(e_page, int) and e_page > 0: end_page = e_page
elif e_page is not None: logger.warning(f"Invalid end_page: {e_page}.")
if start_page is not None and end_page is not None and start_page > end_page: logger.warning(f"Start > end ({start_page} > {end_page}). Discarding."); return None, None
logger.info(f"Determined main content range: Start={start_page}, End={end_page}"); return start_page, end_page
else: logger.error(f"Could not find JSON in analysis response:\n{raw_response}"); call_success = False # Treat as failure
except (json.JSONDecodeError, Exception) as e_parse: logger.error(f"Error processing analysis JSON '{e_parse}':\n{raw_response}", exc_info=True); call_success = False # Treat as failure
# --- Handle Failure and Failover (if call_success is False) ---
if not call_success:
logger.error(f"Failed to get valid analysis after {max_retries} attempts using {failover_state['a_active_backend'].model_type.value}.")
failover_state['a_failures'] += 1
logger.info(f"Analysis failure count incremented to: {failover_state['a_failures']}")
if failover_state['a_failures'] >= FAILURE_THRESHOLD:
current_active_model = failover_state['a_active_backend'].model_type.value
switched = False
if failover_state['a_active_backend'] == a_primary_backend and a_backup_backend:
failover_state['a_active_backend'] = a_backup_backend; logger.warning(f"Reached {FAILURE_THRESHOLD} analysis failures on PRIMARY ({current_active_model}). Switching to BACKUP: {a_backup_backend.model_type.value}"); switched = True
elif failover_state['a_active_backend'] == a_backup_backend and a_primary_backend:
failover_state['a_active_backend'] = a_primary_backend; logger.warning(f"Reached {FAILURE_THRESHOLD} analysis failures on BACKUP ({current_active_model}). Switching back to PRIMARY: {a_primary_backend.model_type.value}"); switched = True
else: logger.error(f"Reached {FAILURE_THRESHOLD} failures, cannot switch analysis model from {current_active_model}.")
if switched and a_primary_backend != a_backup_backend: failover_state['a_failures'] = 0; logger.info("Analysis failure counter reset after model switch.")
return None, None # Analysis failed
except Exception as e: # Catch unhandled errors in the try block
logger.error(f"Unhandled error during TOC analysis: {e}", exc_info=True)
failover_state['a_failures'] += 1 # Count exception as failure
logger.info(f"Analysis failure count incremented to: {failover_state['a_failures']} due to exception.")
if failover_state['a_failures'] >= FAILURE_THRESHOLD:
current_active_model = failover_state['a_active_backend'].model_type.value; switched=False
if failover_state['a_active_backend'] == a_primary_backend and a_backup_backend: failover_state['a_active_backend'] = a_backup_backend; logger.warning(f"Reached {FAILURE_THRESHOLD} failures (exception). Switching analysis to BACKUP: {a_backup_backend.model_type.value}"); switched=True
elif failover_state['a_active_backend'] == a_backup_backend and a_primary_backend: failover_state['a_active_backend'] = a_primary_backend; logger.warning(f"Reached {FAILURE_THRESHOLD} failures (exception). Switching analysis back to PRIMARY: {a_primary_backend.model_type.value}"); switched=True
else: logger.error(f"Reached {FAILURE_THRESHOLD} failures (exception), cannot switch analysis from {current_active_model}.")
if switched and a_primary_backend != a_backup_backend: failover_state['a_failures'] = 0; logger.info("Analysis failure counter reset after model switch.")
return None, None
# --- System Temp File Cleanup ---
# cleanup_system_temp_files unchanged
def cleanup_system_temp_files():
"""Finds and deletes temporary files created by the toolkit in /tmp."""
logger.debug(f"Attempting periodic cleanup of '{SYSTEM_TEMP_DIR_PATTERN}'...")
count = 0; deleted_something = False
try:
for temp_file_path_str in glob.iglob(SYSTEM_TEMP_DIR_PATTERN):
try: