Skip to content

Commit 3adb4e9

Browse files
authored
fix: improve JSON repair handling for markdown code blocks (bytedance#841)
* fix: improve JSON repair handling for markdown code blocks
* unify import path
* extract `compress_crawl_content` user-defined function (UDF)
* fix
* revert
1 parent 756421c commit 3adb4e9

File tree

4 files changed

+394
-6
lines changed

4 files changed

+394
-6
lines changed

src/graph/nodes.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,12 @@ def planner_node(
332332
logger.debug(f"Current state messages: {state['messages']}")
333333
logger.info(f"Planner response: {full_response}")
334334

335+
# Clean the response first to handle markdown code blocks (```json, ```ts, etc.)
336+
cleaned_response = repair_json_output(full_response)
337+
335338
# Validate explicitly that response content is valid JSON before proceeding to parse it
336-
if not full_response.strip().startswith('{') and not full_response.strip().startswith('['):
337-
logger.warning("Planner response does not appear to be valid JSON")
339+
if not cleaned_response.strip().startswith('{') and not cleaned_response.strip().startswith('['):
340+
logger.warning("Planner response does not appear to be valid JSON after cleanup")
338341
if plan_iterations > 0:
339342
return Command(
340343
update=preserve_state_meta_fields(state),
@@ -347,7 +350,7 @@ def planner_node(
347350
)
348351

349352
try:
350-
curr_plan = json.loads(repair_json_output(full_response))
353+
curr_plan = json.loads(cleaned_response)
351354
# Need to extract the plan from the full_response
352355
curr_plan_content = extract_plan_content(curr_plan)
353356
# load the current_plan
@@ -1428,4 +1431,4 @@ async def analyst_node(
14281431
config,
14291432
"analyst",
14301433
[], # No tools - pure reasoning
1431-
)
1434+
)

src/tools/crawl.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88

99
from langchain_core.tools import tool
1010

11+
from src.crawler.article import Article
1112
from src.crawler import Crawler
12-
1313
from .decorators import log_io
1414

1515
logger = logging.getLogger(__name__)
@@ -43,8 +43,18 @@ def crawl_tool(
4343
try:
4444
crawler = Crawler()
4545
article = crawler.crawl(url)
46-
return json.dumps({"url": url, "crawled_content": article.to_markdown()[:1000]}, ensure_ascii=False)
46+
article_content = compress_crawl_content(article)
47+
return json.dumps({"url": url, "crawled_content": article_content}, ensure_ascii=False)
4748
except BaseException as e:
4849
error_msg = f"Failed to crawl. Error: {repr(e)}"
4950
logger.error(error_msg)
5051
return error_msg
52+
53+
54+
def compress_crawl_content(article: Article) -> str:
    """Reduce a crawled article to a compact string for tool output.

    This is a user-customizable hook: replace its body to implement a
    different compression strategy (e.g. summarization or smarter
    truncation). The current strategy simply truncates the article's
    markdown rendering to the first 1000 characters.

    Args:
        article: The crawled article to compress.

    Returns:
        At most the first 1000 characters of the article's markdown.
    """
    # NOTE(review): assumes Article.to_markdown() returns a str — confirm.
    return article.to_markdown()[:1000]

src/utils/json_utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Any
88

99
import json_repair
10+
import re
1011

1112
logger = logging.getLogger(__name__)
1213

@@ -121,6 +122,27 @@ def repair_json_output(content: str) -> str:
121122
if not content:
122123
return content
123124

125+
# Handle markdown code blocks (```json, ```ts, or ```)
126+
# This must be checked first, as content may start with ``` instead of { or [
127+
if "```" in content:
128+
# Remove opening markdown code block markers (```json, ```ts, or ```), allowing
129+
# optional leading spaces and multiple blank lines after the fence.
130+
content = re.sub(
131+
r'^[ \t]*```(?:json|ts)?[ \t]*\n+',
132+
'',
133+
content,
134+
flags=re.IGNORECASE | re.MULTILINE,
135+
)
136+
# Remove closing markdown code block markers (```), allowing optional
137+
# leading newlines and trailing spaces.
138+
content = re.sub(
139+
r'\n*```[ \t]*$',
140+
'',
141+
content,
142+
flags=re.MULTILINE,
143+
)
144+
content = content.strip()
145+
124146
# First attempt: try to extract valid JSON if there are extra tokens
125147
content = _extract_json_from_content(content)
126148

0 commit comments

Comments
 (0)