Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/fetch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume
- `max_length` (integer, optional): Maximum number of characters to return (default: 5000)
- `start_index` (integer, optional): Start content from this character index (default: 0)
- `raw` (boolean, optional): Get raw content without markdown conversion (default: false)
- `distill` (boolean, optional): Aggressively clean HTML to minimize token usage. Removes scripts, styles, navigation, headers, footers, ads, and other non-essential content. Reduces token count by 60-85%. Recommended for cost optimization when only core content is needed. Has no effect when `raw=True` (default: false)

### Prompts

Expand Down
98 changes: 93 additions & 5 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
from typing import Annotated, Tuple
from urllib.parse import urlparse, urlunparse

from bs4 import BeautifulSoup, Comment
import markdownify
import readabilipy.simple_json
from mcp.shared.exceptions import McpError
Expand All @@ -23,12 +25,73 @@
DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"

# Tags that are never part of the main content
_NON_CONTENT_TAGS = frozenset({
'script', 'style', 'nav', 'header', 'footer', 'aside',
'iframe', 'noscript', 'svg', 'form', 'button', 'input',
'select', 'textarea',
})

# Class/ID keywords that signal non-content elements
_NON_CONTENT_KEYWORDS = re.compile(
r'\b(ad|ads|advert|advertisement|banner|sidebar|menu|nav|navigation|'
r'header|footer|popup|modal|cookie|consent|social|share|sharing|'
r'widget|promo|promotional)\b',
re.IGNORECASE,
)


def distill_html(html: str) -> str:
"""Aggressively clean HTML to minimize token usage.

Uses BeautifulSoup for reliable HTML parsing instead of regex.
This function is applied *after* Readability extraction to avoid
interfering with Readability's content-detection heuristics.

Removes:
- Non-content tags (scripts, styles, nav, header, footer, etc.)
- HTML comments
- Elements with ad/navigation class names or IDs
- Empty elements

Args:
html: HTML content to clean (typically Readability output)

Returns:
Cleaned HTML with only essential content
"""
soup = BeautifulSoup(html, "lxml")

# Remove HTML comments
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()

# Remove non-content tags
for tag in soup.find_all(_NON_CONTENT_TAGS):
tag.decompose()

# Remove elements with ad/navigation class names or IDs
for tag in soup.find_all(True):
classes = " ".join(tag.get("class", []))
tag_id = tag.get("id", "")
if _NON_CONTENT_KEYWORDS.search(classes) or _NON_CONTENT_KEYWORDS.search(tag_id):
tag.decompose()

def extract_content_from_html(html: str) -> str:
# Remove empty elements (no text, no children with text)
for tag in soup.find_all(True):
if not tag.get_text(strip=True) and tag.name not in ('br', 'hr', 'img'):
tag.decompose()

return str(soup)


def extract_content_from_html(html: str, distill: bool = False) -> str:
"""Extract and convert HTML content to Markdown format.

Args:
html: Raw HTML content to process
distill: If True, aggressively clean the Readability output to minimize tokens.
Has no effect when raw=True (HTML is returned as-is).

Returns:
Simplified markdown version of the content
Expand All @@ -38,8 +101,14 @@ def extract_content_from_html(html: str) -> str:
)
if not ret["content"]:
return "<error>Page failed to be simplified from HTML</error>"

content_html = ret["content"]

if distill:
content_html = distill_html(content_html)

content = markdownify.markdownify(
ret["content"],
content_html,
heading_style=markdownify.ATX,
)
return content
Expand Down Expand Up @@ -109,10 +178,18 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:


async def fetch_url(
url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
url: str,
user_agent: str,
force_raw: bool = False,
distill: bool = False,
proxy_url: str | None = None,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.

Token Optimization:
distill=True: Post-processes Readability output to remove remaining non-content
elements (60-85% token reduction). Has no effect when force_raw=True.
"""
from httpx import AsyncClient, HTTPError

Expand Down Expand Up @@ -140,7 +217,7 @@ async def fetch_url(
)

if is_page_html and not force_raw:
return extract_content_from_html(page_raw), ""
return extract_content_from_html(page_raw, distill=distill), ""

return (
page_raw,
Expand Down Expand Up @@ -176,6 +253,13 @@ class Fetch(BaseModel):
description="Get the actual HTML content of the requested page, without simplification.",
),
]
distill: Annotated[
bool,
Field(
default=False,
description="Aggressively clean HTML to reduce token usage. Removes navigation, ads, sidebars, and other non-content elements. Typically reduces tokens by 60-85%. Has no effect when raw=True.",
),
]


async def serve(
Expand Down Expand Up @@ -235,7 +319,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)

content, prefix = await fetch_url(
url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
url,
user_agent_autonomous,
force_raw=args.raw,
distill=args.distill,
proxy_url=proxy_url,
)
original_length = len(content)
if args.start_index >= original_length:
Expand Down
146 changes: 146 additions & 0 deletions src/fetch/tests/test_distill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""Tests for the distill_html function and distill parameter integration."""

import pytest

from mcp_server_fetch.server import distill_html, extract_content_from_html


class TestDistillHtml:
"""Tests for the distill_html function using BeautifulSoup."""

def test_removes_script_tags(self):
html = "<html><body><script>evil();</script><p>Safe</p></body></html>"
result = distill_html(html)
assert "<script" not in result.lower()
assert "Safe" in result

def test_removes_style_tags(self):
html = "<html><body><style>.x{color:red}</style><p>Content</p></body></html>"
result = distill_html(html)
assert "<style" not in result.lower()
assert "Content" in result

def test_removes_nav_tags(self):
html = "<html><body><nav><a href='/'>Home</a></nav><p>Article</p></body></html>"
result = distill_html(html)
assert "<nav" not in result.lower()
assert "Article" in result

def test_removes_header_tags(self):
html = "<html><body><header>Logo</header><p>Main</p></body></html>"
result = distill_html(html)
assert "<header" not in result.lower()
assert "Main" in result

def test_removes_footer_tags(self):
html = "<html><body><p>Content</p><footer>Copyright</footer></body></html>"
result = distill_html(html)
assert "<footer" not in result.lower()
assert "Content" in result

def test_removes_aside_tags(self):
html = "<html><body><aside>Sidebar</aside><p>Main</p></body></html>"
result = distill_html(html)
assert "<aside" not in result.lower()
assert "Main" in result

def test_removes_iframe_tags(self):
html = '<html><body><iframe src="ad.html"></iframe><p>Content</p></body></html>'
result = distill_html(html)
assert "<iframe" not in result.lower()
assert "Content" in result

def test_removes_svg_tags(self):
html = "<html><body><svg><circle/></svg><p>Content</p></body></html>"
result = distill_html(html)
assert "<svg" not in result.lower()
assert "Content" in result

def test_removes_form_elements(self):
html = "<html><body><form><input/><button>Go</button></form><p>Content</p></body></html>"
result = distill_html(html)
assert "<form" not in result.lower()
assert "Content" in result

def test_removes_html_comments(self):
html = "<html><body><!-- secret --><p>Visible</p></body></html>"
result = distill_html(html)
assert "<!--" not in result
assert "secret" not in result
assert "Visible" in result

def test_removes_nested_nav_tags(self):
"""Verifies BeautifulSoup handles nested tags (unlike regex)."""
html = "<html><body><nav><nav>inner</nav>outer</nav><p>Content</p></body></html>"
result = distill_html(html)
assert "<nav" not in result.lower()
assert "inner" not in result
assert "outer" not in result
assert "Content" in result

def test_removes_elements_with_ad_class(self):
html = '<html><body><div class="sidebar-ad">Buy now!</div><p>Real</p></body></html>'
result = distill_html(html)
assert "Buy now" not in result
assert "Real" in result

def test_removes_elements_with_cookie_id(self):
html = '<html><body><div id="cookie-consent">Accept</div><p>Content</p></body></html>'
result = distill_html(html)
assert "Accept" not in result
assert "Content" in result

def test_removes_elements_with_social_class(self):
html = '<html><body><div class="social-share">Share us</div><p>Article</p></body></html>'
result = distill_html(html)
assert "Share us" not in result
assert "Article" in result

def test_removes_empty_elements(self):
html = "<html><body><div></div><span> </span><p>Content</p></body></html>"
result = distill_html(html)
assert "Content" in result

def test_preserves_br_hr_img(self):
html = '<html><body><p>Before</p><br/><hr/><img src="x.jpg"/><p>After</p></body></html>'
result = distill_html(html)
assert "Before" in result
assert "After" in result

def test_returns_string(self):
html = "<html><body><p>Hello</p></body></html>"
result = distill_html(html)
assert isinstance(result, str)


class TestExtractContentDistillIntegration:
"""Tests for the distill parameter in extract_content_from_html."""

def test_distill_false_returns_content(self):
html = "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>"
result = extract_content_from_html(html, distill=False)
assert "Body text" in result

def test_distill_true_returns_content(self):
html = "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>"
result = extract_content_from_html(html, distill=True)
assert "Body text" in result

def test_distill_true_does_not_produce_empty_output(self):
"""Ensures distill doesn't strip core content (the 13-token bug)."""
html = """<html><body>
<article>
<h1>Important Article</h1>
<p>First paragraph with real content.</p>
<p>Second paragraph with more details.</p>
</article>
</body></html>"""
result = extract_content_from_html(html, distill=True)
# Should have meaningful content, not just a few tokens
assert len(result.split()) > 5

def test_distill_default_is_false(self):
html = "<html><body><article><p>Content</p></article></body></html>"
result_default = extract_content_from_html(html)
result_explicit = extract_content_from_html(html, distill=False)
assert result_default == result_explicit
Loading