From afbff474233ab471c28cd0465e1e5302378d718b Mon Sep 17 00:00:00 2001 From: Krish Garg Date: Thu, 21 May 2026 19:32:56 -0700 Subject: [PATCH] fix(fetch): fall back when readability extraction fails --- src/fetch/src/mcp_server_fetch/__init__.py | 7 ++- src/fetch/src/mcp_server_fetch/server.py | 59 ++++++++++++++++++---- src/fetch/tests/test_server.py | 39 ++++++++++++++ 3 files changed, 93 insertions(+), 12 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/__init__.py b/src/fetch/src/mcp_server_fetch/__init__.py index 09744ce319..94089436b4 100644 --- a/src/fetch/src/mcp_server_fetch/__init__.py +++ b/src/fetch/src/mcp_server_fetch/__init__.py @@ -16,9 +16,14 @@ def main(): help="Ignore robots.txt restrictions", ) parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests") + parser.add_argument( + "--no-readability", + action="store_true", + help="Disable Readability.js extraction and use the Python HTML simplifier only", + ) args = parser.parse_args() - asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url)) + asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url, not args.no_readability)) if __name__ == "__main__": diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..caad9836e9 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -24,7 +24,7 @@ DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)" -def extract_content_from_html(html: str) -> str: +def extract_content_from_html(html: str, use_readability: bool = True) -> str: """Extract and convert HTML content to Markdown format. Args: @@ -33,16 +33,38 @@ def extract_content_from_html(html: str) -> str: Returns: Simplified markdown version of the content """ - ret = readabilipy.simple_json.simple_json_from_html_string( - html, use_readability=True - ) - if not ret["content"]: + try: + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=use_readability + ) + except Exception: + if not use_readability: + return "Page failed to be simplified from HTML" + try: + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=False + ) + except Exception: + return "Page failed to be simplified from HTML" + + content = ret.get("content") if isinstance(ret, dict) else None + if not content: + if use_readability: + try: + fallback_ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=False + ) + except Exception: + fallback_ret = {} + content = fallback_ret.get("content") if isinstance(fallback_ret, dict) else None + + if not content: return "Page failed to be simplified from HTML" content = markdownify.markdownify( - ret["content"], + content, heading_style=markdownify.ATX, ) - return content + return content if content.strip() else "Page failed to be simplified from HTML" def get_robots_txt_url(url: str) -> str: @@ -109,7 +131,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url: async def fetch_url( - url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None + url: str, + user_agent: str, + force_raw: bool = False, + proxy_url: str | None = None, + use_readability: bool = True, ) -> Tuple[str, str]: """ Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information. @@ -140,7 +166,7 @@ async def fetch_url( ) if is_page_html and not force_raw: - return extract_content_from_html(page_raw), "" + return extract_content_from_html(page_raw, use_readability=use_readability), "" return ( page_raw, @@ -182,6 +208,7 @@ async def serve( custom_user_agent: str | None = None, ignore_robots_txt: bool = False, proxy_url: str | None = None, + use_readability: bool = True, ) -> None: """Run the fetch MCP server. @@ -189,6 +216,7 @@ async def serve( custom_user_agent: Optional custom User-Agent string to use for requests ignore_robots_txt: Whether to ignore robots.txt restrictions proxy_url: Optional proxy URL to use for requests + use_readability: Whether to use Readability.js while simplifying HTML """ server = Server("mcp-fetch") user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS @@ -235,7 +263,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]: await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url) content, prefix = await fetch_url( - url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url + url, + user_agent_autonomous, + force_raw=args.raw, + proxy_url=proxy_url, + use_readability=use_readability, ) original_length = len(content) if args.start_index >= original_length: @@ -262,7 +294,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult: url = arguments["url"] try: - content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url) + content, prefix = await fetch_url( + url, + user_agent_manual, + proxy_url=proxy_url, + use_readability=use_readability, + ) # TODO: after SDK bug is addressed, don't catch the exception except McpError as e: return GetPromptResult( diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 96c1cb38c7..ddc7d52660 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -81,6 +81,45 @@ def test_html_with_links(self): result = extract_content_from_html(html) assert "Example" in result + def test_falls_back_when_readability_is_unavailable(self): + """Test that Readability.js failures fall back to the Python simplifier.""" + calls = [] + + def simplify(html, use_readability): + calls.append(use_readability) + if use_readability: + raise RuntimeError("node is not available") + return {"content": "

Fallback content

"} + + with patch( + "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", + side_effect=simplify, + ): + result = extract_content_from_html("Fallback content") + + assert "Fallback content" in result + assert calls == [True, False] + + def test_can_disable_readability(self): + """Test that callers can skip Readability.js entirely.""" + calls = [] + + def simplify(html, use_readability): + calls.append(use_readability) + return {"content": "

Python-only content

"} + + with patch( + "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", + side_effect=simplify, + ): + result = extract_content_from_html( + "Python-only content", + use_readability=False, + ) + + assert "Python-only content" in result + assert calls == [False] + def test_empty_content_returns_error(self): """Test that empty/invalid HTML returns error message.""" html = ""