Skip to content

Commit 3adb4e9

Browse files
authored
fix: improve JSON repair handling for markdown code blocks (bytedance#841)
* fix: improve JSON repair handling for markdown code blocks
* unify import path
* extract `compress_crawl_content` user-defined function (UDF)
* fix
* revert
1 parent 756421c commit 3adb4e9

File tree

4 files changed

+394
-6
lines changed

4 files changed

+394
-6
lines changed

src/graph/nodes.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,12 @@ def planner_node(
332332
logger.debug(f"Current state messages: {state['messages']}")
333333
logger.info(f"Planner response: {full_response}")
334334

335+
# Clean the response first to handle markdown code blocks (```json, ```ts, etc.)
336+
cleaned_response = repair_json_output(full_response)
337+
335338
# Validate explicitly that response content is valid JSON before proceeding to parse it
336-
if not full_response.strip().startswith('{') and not full_response.strip().startswith('['):
337-
logger.warning("Planner response does not appear to be valid JSON")
339+
if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['):
340+
logger.warning("Planner response does not appear to be valid JSON after cleanup")
338341
if plan_iterations > 0:
339342
return Command(
340343
update=preserve_state_meta_fields(state),
@@ -347,7 +350,7 @@ def planner_node(
347350
)
348351

349352
try:
350-
curr_plan = json.loads(repair_json_output(full_response))
353+
curr_plan = json.loads(cleaned_response)
351354
# Need to extract the plan from the full_response
352355
curr_plan_content = extract_plan_content(curr_plan)
353356
# load the current_plan
@@ -1428,4 +1431,4 @@ async def analyst_node(
14281431
config,
14291432
"analyst",
14301433
[], # No tools - pure reasoning
1431-
)
1434+
)

src/tools/crawl.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88

99
from langchain_core.tools import tool
1010

11+
from src.crawler.article import Article
1112
from src.crawler import Crawler
12-
1313
from .decorators import log_io
1414

1515
logger = logging.getLogger(__name__)
@@ -43,8 +43,18 @@ def crawl_tool(
4343
try:
4444
crawler = Crawler()
4545
article = crawler.crawl(url)
46-
return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False)
46+
article_content = compress_crawl_content(article)
47+
return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False)
4748
except BaseException as e:
4849
error_msg = f"Failed to crawl. Error: {repr(e)}"
4950
logger.error(error_msg)
5051
return error_msg
52+
53+
54+
def compress_crawl_content(article: Article) -> str:
    """Reduce a crawled article to a compact string for tool output.

    This is a user-customizable hook: replace its body to implement a
    different compression strategy (e.g. summarization or smarter
    truncation). The current strategy simply truncates the article's
    markdown rendering to the first 1000 characters.

    Args:
        article: The crawled article to compress.

    Returns:
        At most the first 1000 characters of the article's markdown.
    """
    # NOTE(review): assumes Article.to_markdown() returns a str — confirm.
    return article.to_markdown()[:1000]

src/utils/json_utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Any
88

99
import json_repair
10+
import re
1011

1112
logger = logging.getLogger(__name__)
1213

@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
121122
if not content:
122123
return content
123124

125+
# Handle markdown code blocks (```json, ```ts, or ```)
126+
# This must be checked first, as content may start with ``` instead of { or [
127+
if "```" in content:
128+
# Remove opening markdown code block markers (```json, ```ts, or ```), allowing
129+
# optional leading spaces and multiple blank lines after the fence.
130+
content = re.sub(
131+
r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
132+
'',
133+
content,
134+
flags=re.IGNORECASE | re.MULTILINE,
135+
)
136+
# Remove closing markdown code block markers (```), allowing optional
137+
# leading newlines and trailing spaces.
138+
content = re.sub(
139+
r'\n*```[ \t]*$',
140+
'',
141+
content,
142+
flags=re.MULTILINE,
143+
)
144+
content = content.strip()
145+
124146
# First attempt: try to extract valid JSON if there are extra tokens
125147
content = _extract_json_from_content(content)
126148

0 commit comments

Comments
 (0)