Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/citations/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(self):
self._citations: Dict[str, CitationMetadata] = {} # url -> metadata
self._citation_order: List[str] = [] # ordered list of URLs
self._used_citations: set[str] = set() # URLs that are actually cited
self._url_to_index: Dict[str, int] = {} # url -> index of _citation_order (O(1) lookup)

def add_from_search_results(
self, results: List[Dict[str, Any]], query: str = ""
Expand Down Expand Up @@ -58,6 +59,7 @@ def add_from_search_results(
if url not in self._citations:
self._citations[url] = metadata
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
added.append(metadata)
logger.debug(f"Added citation: {metadata.title} ({url})")
else:
Expand Down Expand Up @@ -104,6 +106,7 @@ def add_from_crawl_result(
)
self._citations[url] = metadata
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1

return metadata

Expand All @@ -124,18 +127,16 @@ def mark_used(self, url: str) -> Optional[int]:

def get_number(self, url: str) -> Optional[int]:
    """
    Get the citation number for a URL (O(1) time complexity).

    Uses the _url_to_index reverse mapping maintained alongside
    _citation_order, instead of a linear scan of the ordered list.

    Args:
        url: The URL to look up

    Returns:
        The citation number (1-indexed) or None if not found
    """
    # NOTE: compare against None explicitly — index 0 is a valid entry
    # (citation number 1) and is falsy.
    index = self._url_to_index.get(url)
    if index is None:
        return None
    return index + 1

def get_metadata(self, url: str) -> Optional[CitationMetadata]:
"""
Expand Down Expand Up @@ -215,7 +216,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "CitationCollector":
for citation_data in data.get("citations", []):
citation = Citation.from_dict(citation_data)
collector._citations[citation.url] = citation.metadata
index = len(collector._citation_order)
collector._citation_order.append(citation.url)
collector._url_to_index[citation.url] = index
collector._used_citations = set(data.get("used_urls", []))
return collector

Expand All @@ -230,6 +233,7 @@ def merge_with(self, other: "CitationCollector") -> None:
if url not in self._citations:
self._citations[url] = other._citations[url]
self._citation_order.append(url)
self._url_to_index[url] = len(self._citation_order) - 1
self._used_citations.update(other._used_citations)

@property
Expand All @@ -247,6 +251,7 @@ def clear(self) -> None:
self._citations.clear()
self._citation_order.clear()
self._used_citations.clear()
self._url_to_index.clear()


def extract_urls_from_text(text: str) -> List[str]:
Expand Down
138 changes: 120 additions & 18 deletions src/citations/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import json
import logging
import re
from typing import Any, Dict, List, Optional

from langchain_core.messages import AIMessage, ToolMessage
Expand Down Expand Up @@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
}


def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Priority:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # 1. Try HTML title tag (DOTALL: titles may span lines).
    html_title_match = re.search(
        r'<title[^>]*>([^<]+)</title>',
        content,
        re.IGNORECASE | re.DOTALL
    )
    if html_title_match:
        title = html_title_match.group(1).strip()
        if title:
            return title[:max_length]

    # 2. Try Markdown h1 (exactly one #).
    # Use [ \t]+ rather than \s+: with re.MULTILINE, \s can consume the
    # newline after a bare "#" and mistake the NEXT line for a heading.
    md_h1_match = re.search(
        r'^#[ \t]+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_h1_match:
        title = md_h1_match.group(1).strip()
        if title:
            return title[:max_length]

    # 3. Try any Markdown heading (h2-h6), same [ \t]+ rationale as above.
    md_heading_match = re.search(
        r'^#{2,6}[ \t]+(.+?)$',
        content,
        re.MULTILINE
    )
    if md_heading_match:
        title = md_heading_match.group(1).strip()
        if title:
            return title[:max_length]

    # 4. Try JSON/YAML title field.
    # (?<![\w-]) guards against false positives on "subtitle:",
    # "page-title:", etc., which an unanchored search would match.
    json_title_match = re.search(
        r'(?<![\w-])"?title"?\s*:\s*["\']?([^"\'\n]+)["\']?',
        content,
        re.IGNORECASE
    )
    if json_title_match:
        title = json_title_match.group(1).strip()
        if title and len(title) > 3:
            return title[:max_length]

    # 5. First substantial non-empty line
    for line in content.split('\n'):
        line = line.strip()
        # Skip short lines, code blocks, list items, and separators
        if (line and
                len(line) > 10 and
                not line.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return line[:max_length]

    return "Untitled"


def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
"""
Extract citation from crawl tool result.
Expand All @@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:

content = data.get("crawled_content", "")

# Try to extract title from content (first h1 or first line)
title = "Untitled"
if content:
lines = content.strip().split("\n")
for line in lines:
line = line.strip()
if line.startswith("# "):
title = line[2:].strip()
break
elif line and not line.startswith("#"):
title = line[:100]
break
# Extract title using intelligent extraction function
title = extract_title_from_content(content)

return {
"url": url,
Expand All @@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
}


def _extract_domain(url: str) -> str:
"""Extract domain from URL."""
def _extract_domain(url: Optional[str]) -> str:
"""
Extract domain from URL using urllib with regex fallback.

Handles:
- Standard URLs: https://www.example.com/path
- Short URLs: example.com
- Invalid URLs: graceful fallback

Args:
url: The URL string to extract domain from (can be None)

Returns:
The domain netloc (including port if present), or empty string if extraction fails
"""
if not url:
return ""

# Approach 1: Try urllib first (fast path for standard URLs)
try:
from urllib.parse import urlparse

parsed = urlparse(url)
return parsed.netloc
except Exception:
return ""
if parsed.netloc:
return parsed.netloc
except Exception as e:
logger.debug(f"URL parsing failed for {url}: {e}")

# Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
# Matches: domain[:port] where domain is a valid hostname
# Pattern breakdown:
# ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
# - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
# (?::\d+)? - optional port
pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'

match = re.match(pattern, url)
if match:
return match.group(1)

logger.warning(f"Could not extract domain from URL: {url}")
return ""


def merge_citations(
Expand Down
Loading