Skip to content

Commit 9a34e32

Browse files
xunliu and WillemJiang authored
chore : Improved citation system (bytedance#834)
* improve: Improved citation system * fix --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
1 parent 31624b6 commit 9a34e32

File tree

8 files changed

+1730
-60
lines changed

8 files changed

+1730
-60
lines changed

src/citations/collector.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(self):
2828
self._citations: Dict[str, CitationMetadata] = {} # url -> metadata
2929
self._citation_order: List[str] = [] # ordered list of URLs
3030
self._used_citations: set[str] = set() # URLs that are actually cited
31+
self._url_to_index: Dict[str, int] = {} # url -> index of _citation_order (O(1) lookup)
3132

3233
def add_from_search_results(
3334
self, results: List[Dict[str, Any]], query: str = ""
@@ -58,6 +59,7 @@ def add_from_search_results(
5859
if url not in self._citations:
5960
self._citations[url] = metadata
6061
self._citation_order.append(url)
62+
self._url_to_index[url] = len(self._citation_order) - 1
6163
added.append(metadata)
6264
logger.debug(f"Added citation: {metadata.title} ({url})")
6365
else:
@@ -104,6 +106,7 @@ def add_from_crawl_result(
104106
)
105107
self._citations[url] = metadata
106108
self._citation_order.append(url)
109+
self._url_to_index[url] = len(self._citation_order) - 1
107110

108111
return metadata
109112

@@ -124,18 +127,16 @@ def mark_used(self, url: str) -> Optional[int]:
124127

125128
def get_number(self, url: str) -> Optional[int]:
    """
    Get the citation number for a URL (O(1) time complexity).

    Args:
        url: The URL to look up

    Returns:
        The citation number (1-indexed) or None if not found
    """
    # The index map mirrors _citation_order, so a single dict lookup
    # replaces a linear scan of the ordered list.
    position = self._url_to_index.get(url)
    if position is None:
        return None
    return position + 1
139140

140141
def get_metadata(self, url: str) -> Optional[CitationMetadata]:
141142
"""
@@ -215,7 +216,9 @@ def from_dict(cls, data: Dict[str, Any]) -> "CitationCollector":
215216
for citation_data in data.get("citations", []):
216217
citation = Citation.from_dict(citation_data)
217218
collector._citations[citation.url] = citation.metadata
219+
index = len(collector._citation_order)
218220
collector._citation_order.append(citation.url)
221+
collector._url_to_index[citation.url] = index
219222
collector._used_citations = set(data.get("used_urls", []))
220223
return collector
221224

@@ -230,6 +233,7 @@ def merge_with(self, other: "CitationCollector") -> None:
230233
if url not in self._citations:
231234
self._citations[url] = other._citations[url]
232235
self._citation_order.append(url)
236+
self._url_to_index[url] = len(self._citation_order) - 1
233237
self._used_citations.update(other._used_citations)
234238

235239
@property
@@ -247,6 +251,7 @@ def clear(self) -> None:
247251
self._citations.clear()
248252
self._citation_order.clear()
249253
self._used_citations.clear()
254+
self._url_to_index.clear()
250255

251256

252257
def extract_urls_from_text(text: str) -> List[str]:

src/citations/extractor.py

Lines changed: 120 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import json
99
import logging
10+
import re
1011
from typing import Any, Dict, List, Optional
1112

1213
from langchain_core.messages import AIMessage, ToolMessage
@@ -205,6 +206,84 @@ def _result_to_citation(result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
205206
}
206207

207208

209+
def extract_title_from_content(content: Optional[str], max_length: int = 200) -> str:
    """
    Intelligent title extraction supporting multiple formats.

    Candidates are tried in priority order:
    1. HTML <title> tag
    2. Markdown h1 (# Title)
    3. Markdown h2-h6 (## Title, etc.)
    4. JSON/YAML title field
    5. First substantial non-empty line
    6. "Untitled" as fallback

    Args:
        content: The content to extract title from (can be None)
        max_length: Maximum title length (default: 200)

    Returns:
        Extracted title or "Untitled"
    """
    if not content:
        return "Untitled"

    # (pattern, flags, minimum length) in priority order. The JSON/YAML
    # candidate requires > 3 chars to skip trivially short field values;
    # the others only require a non-empty match after stripping.
    candidates = (
        (r'<title[^>]*>([^<]+)</title>', re.IGNORECASE | re.DOTALL, 0),
        (r'^#{1}\s+(.+?)$', re.MULTILINE, 0),
        (r'^#{2,6}\s+(.+?)$', re.MULTILINE, 0),
        (r'"?title"?\s*:\s*["\']?([^"\'\n]+)["\']?', re.IGNORECASE, 3),
    )
    for pattern, flags, min_len in candidates:
        found = re.search(pattern, content, flags)
        if found is None:
            continue
        candidate = found.group(1).strip()
        if candidate and len(candidate) > min_len:
            return candidate[:max_length]

    # Last resort: the first substantial line, ignoring code fences,
    # horizontal rules, list items, and headings.
    for raw_line in content.split('\n'):
        stripped = raw_line.strip()
        if (len(stripped) > 10 and
                not stripped.startswith(('```', '---', '***', '- ', '* ', '+ ', '#'))):
            return stripped[:max_length]

    return "Untitled"
285+
286+
208287
def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
209288
"""
210289
Extract citation from crawl tool result.
@@ -224,18 +303,8 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
224303

225304
content = data.get("crawled_content", "")
226305

227-
# Try to extract title from content (first h1 or first line)
228-
title = "Untitled"
229-
if content:
230-
lines = content.strip().split("\n")
231-
for line in lines:
232-
line = line.strip()
233-
if line.startswith("# "):
234-
title = line[2:].strip()
235-
break
236-
elif line and not line.startswith("#"):
237-
title = line[:100]
238-
break
306+
# Extract title using intelligent extraction function
307+
title = extract_title_from_content(content)
239308

240309
return {
241310
"url": url,
@@ -248,15 +317,48 @@ def _extract_from_crawl_result(data: Any) -> Optional[Dict[str, Any]]:
248317
}
249318

250319

251-
def _extract_domain(url: str) -> str:
252-
"""Extract domain from URL."""
320+
def _extract_domain(url: Optional[str]) -> str:
321+
"""
322+
Extract domain from URL using urllib with regex fallback.
323+
324+
Handles:
325+
- Standard URLs: https://www.example.com/path
326+
- Short URLs: example.com
327+
- Invalid URLs: graceful fallback
328+
329+
Args:
330+
url: The URL string to extract domain from (can be None)
331+
332+
Returns:
333+
The domain netloc (including port if present), or empty string if extraction fails
334+
"""
335+
if not url:
336+
return ""
337+
338+
# Approach 1: Try urllib first (fast path for standard URLs)
253339
try:
254340
from urllib.parse import urlparse
255-
341+
256342
parsed = urlparse(url)
257-
return parsed.netloc
258-
except Exception:
259-
return ""
343+
if parsed.netloc:
344+
return parsed.netloc
345+
except Exception as e:
346+
logger.debug(f"URL parsing failed for {url}: {e}")
347+
348+
# Approach 2: Regex fallback (for non-standard or bare URLs without scheme)
349+
# Matches: domain[:port] where domain is a valid hostname
350+
# Pattern breakdown:
351+
# ([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)
352+
# - domain labels separated by dots, each 1-63 chars, starting/ending with alphanumeric
353+
# (?::\d+)? - optional port
354+
pattern = r'^([a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*(?::\d+)?)(?:[/?#]|$)'
355+
356+
match = re.match(pattern, url)
357+
if match:
358+
return match.group(1)
359+
360+
logger.warning(f"Could not extract domain from URL: {url}")
361+
return ""
260362

261363

262364
def merge_citations(

0 commit comments

Comments
 (0)