diff --git a/src/fetch/src/mcp_server_fetch/__init__.py b/src/fetch/src/mcp_server_fetch/__init__.py
index 09744ce319..94089436b4 100644
--- a/src/fetch/src/mcp_server_fetch/__init__.py
+++ b/src/fetch/src/mcp_server_fetch/__init__.py
@@ -16,9 +16,14 @@ def main():
help="Ignore robots.txt restrictions",
)
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
+ parser.add_argument(
+ "--no-readability",
+ action="store_true",
+ help="Disable Readability.js extraction and use the Python HTML simplifier only",
+ )
args = parser.parse_args()
- asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
+ asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url, not args.no_readability))
if __name__ == "__main__":
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..caad9836e9 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -24,7 +24,7 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
-def extract_content_from_html(html: str) -> str:
+def extract_content_from_html(html: str, use_readability: bool = True) -> str:
"""Extract and convert HTML content to Markdown format.
Args:
@@ -33,16 +33,29 @@ def extract_content_from_html(html: str) -> str:
     Returns:
         Simplified markdown version of the content
     """
-    ret = readabilipy.simple_json.simple_json_from_html_string(
-        html, use_readability=True
-    )
-    if not ret["content"]:
+    # Simplifier passes in priority order: Readability.js first (when enabled),
+    # then the Python-only simplifier. Each pass runs at most once.
+    passes = (True, False) if use_readability else (False,)
+    content = None
+    for with_readability in passes:
+        try:
+            ret = readabilipy.simple_json.simple_json_from_html_string(
+                html, use_readability=with_readability
+            )
+        except Exception:
+            # Best effort: a failed pass must not abort the fetch; try the next one.
+            continue
+        content = ret.get("content") if isinstance(ret, dict) else None
+        if content:
+            break
+
+    if not content:
         return "Page failed to be simplified from HTML"
     content = markdownify.markdownify(
-        ret["content"],
+        content,
         heading_style=markdownify.ATX,
     )
-    return content
+    return content if content.strip() else "Page failed to be simplified from HTML"
def get_robots_txt_url(url: str) -> str:
@@ -109,7 +131,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
async def fetch_url(
- url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
+ url: str,
+ user_agent: str,
+ force_raw: bool = False,
+ proxy_url: str | None = None,
+ use_readability: bool = True,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
@@ -140,7 +166,7 @@ async def fetch_url(
)
if is_page_html and not force_raw:
- return extract_content_from_html(page_raw), ""
+ return extract_content_from_html(page_raw, use_readability=use_readability), ""
return (
page_raw,
@@ -182,6 +208,7 @@ async def serve(
custom_user_agent: str | None = None,
ignore_robots_txt: bool = False,
proxy_url: str | None = None,
+ use_readability: bool = True,
) -> None:
"""Run the fetch MCP server.
@@ -189,6 +216,7 @@ async def serve(
custom_user_agent: Optional custom User-Agent string to use for requests
ignore_robots_txt: Whether to ignore robots.txt restrictions
proxy_url: Optional proxy URL to use for requests
+ use_readability: Whether to use Readability.js while simplifying HTML
"""
server = Server("mcp-fetch")
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
@@ -235,7 +263,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
content, prefix = await fetch_url(
- url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
+ url,
+ user_agent_autonomous,
+ force_raw=args.raw,
+ proxy_url=proxy_url,
+ use_readability=use_readability,
)
original_length = len(content)
if args.start_index >= original_length:
@@ -262,7 +294,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
url = arguments["url"]
try:
- content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
+ content, prefix = await fetch_url(
+ url,
+ user_agent_manual,
+ proxy_url=proxy_url,
+ use_readability=use_readability,
+ )
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..ddc7d52660 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -81,6 +81,45 @@ def test_html_with_links(self):
result = extract_content_from_html(html)
assert "Example" in result
+    def test_falls_back_when_readability_is_unavailable(self):
+        """Test that a Readability.js failure falls back to the Python simplifier."""
+        calls = []
+
+        def simplify(html, use_readability):
+            calls.append(use_readability)
+            if use_readability:
+                raise RuntimeError("node is not available")
+            return {"content": "<p>Fallback content</p>"}
+
+        with patch(
+            "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string",
+            side_effect=simplify,
+        ):
+            result = extract_content_from_html("<html><body><p>Fallback content</p></body></html>")
+
+        assert "Fallback content" in result
+        assert calls == [True, False]
+
+    def test_can_disable_readability(self):
+        """Test that callers can skip Readability.js and use only the Python simplifier."""
+        calls = []
+
+        def simplify(html, use_readability):
+            calls.append(use_readability)
+            return {"content": "<p>Python-only content</p>"}
+
+        with patch(
+            "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string",
+            side_effect=simplify,
+        ):
+            result = extract_content_from_html(
+                "Python-only content",
+                use_readability=False,
+            )
+
+        assert "Python-only content" in result
+        assert calls == [False]
+
def test_empty_content_returns_error(self):
"""Test that empty/invalid HTML returns error message."""
html = ""