diff --git a/src/fetch/README.md b/src/fetch/README.md
index 2c3e048927..b995d6018c 100644
--- a/src/fetch/README.md
+++ b/src/fetch/README.md
@@ -16,6 +16,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume
- `max_length` (integer, optional): Maximum number of characters to return (default: 5000)
- `start_index` (integer, optional): Start content from this character index (default: 0)
- `raw` (boolean, optional): Get raw content without markdown conversion (default: false)
+ - `distill` (boolean, optional): Aggressively clean HTML to minimize token usage. Removes scripts, styles, navigation, headers, footers, ads, and other non-essential content, typically reducing token count by 60-85%. Recommended for cost optimization when only the core content is needed. Has no effect when `raw` is true (default: false)
### Prompts
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..b07ed576e8 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -1,6 +1,8 @@
+import re
from typing import Annotated, Tuple
from urllib.parse import urlparse, urlunparse
+from bs4 import BeautifulSoup, Comment
import markdownify
import readabilipy.simple_json
from mcp.shared.exceptions import McpError
@@ -23,12 +25,73 @@
DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
+# Element names that distill_html removes outright, regardless of attributes:
+# scripting/styling, page chrome, embedded frames/graphics, and form controls.
+_NON_CONTENT_TAGS = frozenset((
+    "script", "style", "noscript",
+    "nav", "header", "footer", "aside",
+    "iframe", "svg",
+    "form", "button", "input", "select", "textarea",
+))
+
+# Case-insensitive keywords matched against an element's class names and id.
+# The \b anchors make each keyword match only as a whole word; note that "-"
+# is a word boundary, so a class such as "site-header" still matches "header".
+_NON_CONTENT_KEYWORDS = re.compile(
+    r'\b(ad|ads|advert|advertisement|banner|sidebar|menu|nav|navigation|'
+    r'header|footer|popup|modal|cookie|consent|social|share|sharing|'
+    r'widget|promo|promotional)\b',
+    flags=re.IGNORECASE,
+)
+
+
+def _decompose_outermost(tags) -> None:
+    """Decompose every tag in *tags* that has no ancestor also in *tags*.
+
+    Decomposing a tag destroys its whole subtree, so nested matches disappear
+    together with their outermost matching ancestor. The outermost set is
+    computed before anything is removed because a decomposed tag has lost its
+    attributes and must not be inspected or decomposed again.
+    """
+    doomed = {id(tag) for tag in tags}
+    outermost = [
+        tag for tag in tags
+        if not any(id(parent) in doomed for parent in tag.parents)
+    ]
+    for tag in outermost:
+        tag.decompose()
+
+
+def distill_html(html: str) -> str:
+    """Aggressively clean HTML to minimize token usage.
+
+    Parses the markup with BeautifulSoup (lxml parser) and strips, in order:
+
+    - HTML comments
+    - Non-content tags listed in ``_NON_CONTENT_TAGS``
+    - Elements whose class names or id match ``_NON_CONTENT_KEYWORDS``
+    - Elements left without any text, except ``br``/``hr``/``img``, table
+      cells, and wrappers that still contain an image
+
+    Args:
+        html: HTML content to clean (typically Readability output).
+
+    Returns:
+        The cleaned document serialized back to an HTML string.
+    """
+    soup = BeautifulSoup(html, "lxml")
+
+    # Remove HTML comments
+    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+        comment.extract()
+
+    # Remove non-content tags (a list is the documented multi-name filter form)
+    _decompose_outermost(soup.find_all(list(_NON_CONTENT_TAGS)))
+
+    # Remove elements with ad/navigation class names or IDs. Matches are
+    # collected first and removed afterwards: calling tag.get() on a child of
+    # an already-decomposed element raises AttributeError.
+    flagged = []
+    for tag in soup.find_all(True):
+        classes = tag.get("class") or []
+        if isinstance(classes, str):
+            # Some tree builders keep "class" as one string instead of a list.
+            classes = [classes]
+        tag_id = tag.get("id") or ""
+        if (_NON_CONTENT_KEYWORDS.search(" ".join(classes))
+                or _NON_CONTENT_KEYWORDS.search(tag_id)):
+            flagged.append(tag)
+    _decompose_outermost(flagged)
+
+    # Remove empty elements (no text anywhere inside). Void elements carry
+    # meaning without text, empty table cells keep columns aligned, and a
+    # text-less wrapper around an <img> must survive or the image is lost.
+    preserved = ('br', 'hr', 'img', 'td', 'th')
+    empty = [
+        tag for tag in soup.find_all(True)
+        if tag.name not in preserved
+        and not tag.get_text(strip=True)
+        and tag.find('img') is None
+    ]
+    _decompose_outermost(empty)
+
+    return str(soup)
-def extract_content_from_html(html: str) -> str:
+
+
+def extract_content_from_html(html: str, distill: bool = False) -> str:
"""Extract and convert HTML content to Markdown format.
Args:
html: Raw HTML content to process
+ distill: If True, aggressively clean the Readability output to minimize tokens.
+ Has no effect when raw=True (HTML is returned as-is).
Returns:
Simplified markdown version of the content
@@ -38,8 +101,14 @@ def extract_content_from_html(html: str) -> str:
)
if not ret["content"]:
return "
Safe
" + result = distill_html(html) + assert "