Skip to content

Commit f37af7d

Browse files
committed
dotsocr, 잘못된 결과 리턴시 재시도 기능 추가
1 parent 28bf4ca commit f37af7d

2 files changed

Lines changed: 89 additions & 34 deletions

File tree

docling/datamodel/pipeline_options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@ class GenosLayoutOptions(BaseModel):
359359
model: str = "dots-mocr"
360360
max_completion_tokens: int = 6000
361361
timeout: int = 3600
362+
retry_count: int = 2 # Number of retries on abnormal VLM responses
362363

363364

364365
class LayoutOptions(BaseLayoutOptions):

docling/models/genos_dots_ocr_layout_model.py

Lines changed: 88 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,16 @@ def __init__(self, pipeline_options: PdfPipelineOptions) -> None:
8686
self.model = getattr(self.dotsocr_options, "model", "dots-mocr")
8787
self.max_completion_tokens = self.dotsocr_options.max_completion_tokens
8888
self.timeout = self.dotsocr_options.timeout
89+
retry_count = getattr(self.dotsocr_options, "retry_count", 2)
90+
try:
91+
retry_count = int(retry_count)
92+
except (TypeError, ValueError):
93+
_log.warning(
94+
"Invalid genos_layout_options.retry_count=%r. Falling back to 2.",
95+
retry_count,
96+
)
97+
retry_count = 2
98+
self.retry_count = max(0, retry_count)
8999

90100
def _use_dotsocr_table_structure(self) -> bool:
91101
return (
@@ -585,47 +595,68 @@ def _process_page(self, conv_res: ConversionResult, page: Page) -> Page:
585595
# 바이트 스트림을 base64로 인코딩
586596
base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
587597

588-
response_text = call_vlm_server(
589-
prompt=prompt,
590-
base64_image=base64_image,
591-
url=self.dotocr_endpoint,
592-
api_key=self.api_key,
593-
model=self.model,
594-
max_completion_tokens=self.max_completion_tokens,
595-
timeout=self.timeout,
596-
)
598+
total_attempts = self.retry_count + 1
599+
response = None
600+
result = None
601+
for attempt in range(1, total_attempts + 1):
602+
try:
603+
response_text = call_vlm_server(
604+
prompt=prompt,
605+
base64_image=base64_image,
606+
url=self.dotocr_endpoint,
607+
api_key=self.api_key,
608+
model=self.model,
609+
max_completion_tokens=self.max_completion_tokens,
610+
timeout=self.timeout,
611+
)
612+
if not isinstance(response_text, str) or not response_text.strip():
613+
raise ValueError("Empty VLM response text")
614+
response = _parse_vlm_json_response(response_text)
615+
except Exception:
616+
if attempt >= total_attempts:
617+
raise
618+
_log.warning(
619+
"DotsOCR layout request failed (page=%s, attempt=%d/%d). Retrying...",
620+
page.page_no,
621+
attempt,
622+
total_attempts,
623+
exc_info=True,
624+
)
625+
continue
626+
627+
result = _extract_layout_result_items(response)
628+
if isinstance(result, list):
629+
if attempt > 1:
630+
_log.info(
631+
"DotsOCR layout request recovered after retry (page=%s, attempt=%d/%d).",
632+
page.page_no,
633+
attempt,
634+
total_attempts,
635+
)
636+
break
637+
638+
if attempt < total_attempts:
639+
_log.warning(
640+
"Unexpected VLM response schema (page=%s, attempt=%d/%d). Retrying. Parsed type=%s; value=%r",
641+
page.page_no,
642+
attempt,
643+
total_attempts,
644+
type(response).__name__,
645+
response,
646+
)
647+
continue
597648

598-
# 디버그용으로 response_text 화면에 출력
599-
# print("VLM Response Data:", json.dumps(json.loads(response_text), indent=2, ensure_ascii=False))
600-
601-
response = _parse_vlm_json_response(response_text)
602-
if isinstance(response, dict):
603-
result = response.get("result")
604-
if result is None:
605-
# Fallback for providers that use a different list key.
606-
result = response.get("items")
607-
elif isinstance(response, list):
608-
result = response
609-
else:
610-
result = None
611-
612-
if isinstance(result, str):
613-
nested = _parse_vlm_json_response(result)
614-
if isinstance(nested, dict):
615-
result = nested.get("result")
616-
if result is None:
617-
result = nested.get("items")
618-
elif isinstance(nested, list):
619-
result = nested
620-
621-
if not isinstance(result, list):
622649
_log.warning(
623-
"Unexpected VLM response schema. Parsed type=%s; falling back to empty predictions. value=%r",
650+
"Unexpected VLM response schema after retries (page=%s, attempts=%d). Falling back to empty predictions. Parsed type=%s; value=%r",
651+
page.page_no,
652+
total_attempts,
624653
type(response).__name__,
625654
response,
626655
)
627656
result = []
628657

658+
assert isinstance(result, list)
659+
629660
clusters = []
630661
raw_table_html_by_cluster_id: dict[int, str] = {}
631662
raw_formula_latex_by_cluster_id: dict[int, str] = {}
@@ -905,6 +936,29 @@ def call_vlm_server(
905936
raise ValueError(f"응답 파싱 오류: {e}\n응답 본문: {response.text}") from e
906937

907938

939+
def _extract_layout_result_items(response):
940+
if isinstance(response, dict):
941+
result = response.get("result")
942+
if result is None:
943+
# Fallback for providers that use a different list key.
944+
result = response.get("items")
945+
elif isinstance(response, list):
946+
result = response
947+
else:
948+
result = None
949+
950+
if isinstance(result, str):
951+
nested = _parse_vlm_json_response(result)
952+
if isinstance(nested, dict):
953+
result = nested.get("result")
954+
if result is None:
955+
result = nested.get("items")
956+
elif isinstance(nested, list):
957+
result = nested
958+
959+
return result
960+
961+
908962
def _parse_vlm_json_response(response_text: str):
909963
if not isinstance(response_text, str):
910964
raise TypeError(

0 commit comments

Comments
 (0)