Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/fetch/src/mcp_server_fetch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@ def main():
help="Ignore robots.txt restrictions",
)
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
parser.add_argument(
"--no-readability",
action="store_true",
help="Disable Readability.js extraction and use the Python HTML simplifier only",
)

args = parser.parse_args()
asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url, not args.no_readability))


if __name__ == "__main__":
Expand Down
59 changes: 48 additions & 11 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"


def extract_content_from_html(html: str, use_readability: bool = True) -> str:
    """Extract and convert HTML content to Markdown format.

    The HTML is simplified with readabilipy and the simplified markup is
    converted to Markdown. When ``use_readability`` is true, Readability.js
    extraction is attempted first; if it raises or yields no content, the
    Python-only simplifier is tried once as a fallback.

    Args:
        html: HTML markup to simplify
        use_readability: Whether to attempt Readability.js extraction before
            falling back to the Python HTML simplifier

    Returns:
        Simplified markdown version of the content, or an ``<error>`` marker
        string when no non-blank content could be produced
    """
    failure = "<error>Page failed to be simplified from HTML</error>"

    # Ordered list of simplifier modes to attempt. Each mode is tried at most
    # once, so a failing or empty result never triggers a repeated parse.
    modes = (True, False) if use_readability else (False,)

    simplified = None
    for mode in modes:
        try:
            ret = readabilipy.simple_json.simple_json_from_html_string(
                html, use_readability=mode
            )
        except Exception:
            # Deliberately best-effort: any simplifier failure just moves on
            # to the next mode (or to the error marker if none are left).
            continue
        simplified = ret.get("content") if isinstance(ret, dict) else None
        if simplified:
            break

    if not simplified:
        return failure

    content = markdownify.markdownify(
        simplified,
        heading_style=markdownify.ATX,
    )
    # Markup that converts to only whitespace is treated as a failed extraction.
    return content if content.strip() else failure


def get_robots_txt_url(url: str) -> str:
Expand Down Expand Up @@ -109,7 +131,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:


async def fetch_url(
url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
url: str,
user_agent: str,
force_raw: bool = False,
proxy_url: str | None = None,
use_readability: bool = True,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
Expand Down Expand Up @@ -140,7 +166,7 @@ async def fetch_url(
)

if is_page_html and not force_raw:
return extract_content_from_html(page_raw), ""
return extract_content_from_html(page_raw, use_readability=use_readability), ""

return (
page_raw,
Expand Down Expand Up @@ -182,13 +208,15 @@ async def serve(
custom_user_agent: str | None = None,
ignore_robots_txt: bool = False,
proxy_url: str | None = None,
use_readability: bool = True,
) -> None:
"""Run the fetch MCP server.

Args:
custom_user_agent: Optional custom User-Agent string to use for requests
ignore_robots_txt: Whether to ignore robots.txt restrictions
proxy_url: Optional proxy URL to use for requests
use_readability: Whether to use Readability.js while simplifying HTML
"""
server = Server("mcp-fetch")
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
Expand Down Expand Up @@ -235,7 +263,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)

content, prefix = await fetch_url(
url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
url,
user_agent_autonomous,
force_raw=args.raw,
proxy_url=proxy_url,
use_readability=use_readability,
)
original_length = len(content)
if args.start_index >= original_length:
Expand All @@ -262,7 +294,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
url = arguments["url"]

try:
content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
content, prefix = await fetch_url(
url,
user_agent_manual,
proxy_url=proxy_url,
use_readability=use_readability,
)
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
Expand Down
39 changes: 39 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,45 @@ def test_html_with_links(self):
result = extract_content_from_html(html)
assert "Example" in result

def test_falls_back_when_readability_is_unavailable(self):
    """Verify the Python simplifier is retried when the Readability.js pass raises."""
    seen_modes = []

    def fake_simple_json(html, use_readability):
        # Record the mode of every attempt so the call order can be asserted.
        seen_modes.append(use_readability)
        if not use_readability:
            return {"content": "<main><p>Fallback content</p></main>"}
        raise RuntimeError("node is not available")

    target = "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
    page = "<html><body>Fallback content</body></html>"
    with patch(target, side_effect=fake_simple_json):
        markdown = extract_content_from_html(page)

    assert "Fallback content" in markdown
    assert seen_modes == [True, False]

def test_can_disable_readability(self):
    """Verify that use_readability=False results in a single Python-only pass."""
    seen_modes = []

    def fake_simple_json(html, use_readability):
        # Always succeed; only the requested mode is of interest here.
        seen_modes.append(use_readability)
        return {"content": "<main><p>Python-only content</p></main>"}

    target = "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
    page = "<html><body>Python-only content</body></html>"
    with patch(target, side_effect=fake_simple_json):
        markdown = extract_content_from_html(page, use_readability=False)

    assert "Python-only content" in markdown
    assert seen_modes == [False]

def test_empty_content_returns_error(self):
"""Test that empty/invalid HTML returns error message."""
html = ""
Expand Down
Loading