77
88import json
99import logging
10+ import re
1011from typing import Any, Dict, List, Optional
1112
1213from langchain_core.messages import AIMessage, ToolMessage
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
205206 }
206207
207208
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # 1. HTML <title> tag. [^<]+ keeps the match inside the tag; DOTALL
    # lets the title text span multiple lines.
    html_title_match = re.search(
        r'<title[^>]*>([^<]+)</title>',
        content,
        re.IGNORECASE | re.DOTALL
    )
    if html_title_match:
        title = html_title_match.group(1).strip()
        if title:
            return title[:max_length]

    # 2. Markdown h1: a single '#' followed by whitespace. '## x' cannot
    # match because its second '#' is not whitespace.
    md_h1_match = re.search(
        r'^#\s+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_h1_match:
        title = md_h1_match.group(1).strip()
        if title:
            return title[:max_length]

    # 3. Any lower-level Markdown heading (h2-h6).
    md_heading_match = re.search(
        r'^#{2,6}\s+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_heading_match:
        title = md_heading_match.group(1).strip()
        if title:
            return title[:max_length]

    # 4. JSON/YAML "title" field. \b on both sides so keys such as
    # "subtitle" or "title_case" are NOT mistaken for a title field
    # (the unanchored pattern matched the 'title' inside 'subtitle').
    json_title_match = re.search(
        r'["\']?\btitle\b["\']?\s*:\s*["\']?([^"\'\n]+)["\']?',
        content,
        re.IGNORECASE
    )
    if json_title_match:
        title = json_title_match.group(1).strip()
        # Require a few characters so stray "title:" fragments are ignored.
        if title and len(title) > 3:
            return title[:max_length]

    # 5. First substantial non-empty line.
    for line in content.split('\n'):
        line = line.strip()
        # Skip short lines, code fences, horizontal rules, list items,
        # and headings (headings were already handled above).
        if (line and
                len(line) > 10 and
                not line.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return line[:max_length]

    return "Untitled"
285+
286+
208287def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
209288 """
210289 Extract citation from crawl tool result.
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
224303
225304 content = data.get("crawled_content", "")
226305
227- # Try to extract title from content (first h1 or first line)
228- title = "Untitled"
229- if content:
230- lines = content.strip().split("\n")
231- for line in lines:
232- line = line.strip()
233- if line.startswith("# "):
234- title = line[2:].strip()
235- break
236- elif line and not line.startswith("#"):
237- title = line[:100]
238- break
306+ # Extract title using intelligent extraction function
307+ title = extract_title_from_content(content)
239308
240309 return {
241310 "url": url,
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
248317 }
249318
250319
251- def _extract_domain(url: str) -> str:
252- """Extract domain from URL."""
320+ def _extract_domain(url: Optional[str]) -> str:
321+ """
322+ Extract domain from URL using urllib with regex fallback.
323+
324+ Handles:
325+ - Standard URLs: https://www.example.com/path
326+ - Short URLs: example.com
327+ - Invalid URLs: graceful fallback
328+
329+ Args:
330+ url: The URL string to extract domain from (can be None)
331+
332+ Returns:
333+ The domain netloc (including port if present), or empty string if extraction fails
334+ """
335+ if not url:
336+ return ""
337+
338+ # Approach 1: Try urllib first (fast path for standard URLs)
253339 try:
254340 from urllib.parse import urlparse
255-
341+
256342 parsed = urlparse(url)
257- return parsed.netloc
258- except Exception:
259- return ""
343+ if parsed.netloc:
344+ return parsed.netloc
345+ except Exception as e:
346+ logger.debug(f"URL parsing failed for {url}: {e}")
347+
348+ # Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
349+ # Matches: domain[:port] where domain is a valid hostname
350+ # Pattern breakdown:
351+ # ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
352+ # - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
353+ # (?::\d+)? - optional port
354+ pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
355+
356+ match = re.match(pattern, url)
357+ if match:
358+ return match.group(1)
359+
360+ logger.warning(f"Could not extract domain from URL: {url}")
361+ return ""
260362
261363
262364def merge_citations(
0 commit comments