Skip to content

Commit 8d9d767

Browse files
authored
feat(eval): add report quality evaluation module and UI integration (bytedance#776)
* feat(eval): add report quality evaluation module Addresses issue bytedance#773 - How to evaluate generated report quality objectively. This module provides two evaluation approaches: 1. Automated metrics (no LLM required): - Citation count and source diversity - Word count compliance per report style - Section structure validation - Image inclusion tracking 2. LLM-as-Judge evaluation: - Factual accuracy scoring - Completeness assessment - Coherence evaluation - Relevance and citation quality checks The combined evaluator provides a final score (1-10) and letter grade (A+ to F). Files added: - src/eval/__init__.py - src/eval/metrics.py - src/eval/llm_judge.py - src/eval/evaluator.py - tests/unit/eval/test_metrics.py - tests/unit/eval/test_evaluator.py * feat(eval): integrate report evaluation with web UI This commit adds the web UI integration for the evaluation module: Backend: - Add EvaluateReportRequest/Response models in src/server/eval_request.py - Add /api/report/evaluate endpoint to src/server/app.py Frontend: - Add evaluateReport API function in web/src/core/api/evaluate.ts - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation - Add evaluation button (graduation cap icon) to research-block.tsx toolbar - Add i18n translations for English and Chinese The evaluation UI allows users to: 1. View quick metrics-only evaluation (instant) 2. Optionally run deep LLM-based evaluation for detailed analysis 3. 
See grade (A+ to F), score (1-10), and metric breakdown * feat(eval): improve evaluation reliability and add LLM judge tests - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM) - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria - Add researchQueries store map to reliably associate queries with research - Add getResearchQuery helper to retrieve query by researchId - Remove unused imports in test_metrics.py * fix(eval): use resolveServiceURL for evaluate API endpoint The evaluateReport function was using a relative URL '/api/report/evaluate' which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL() consistent with other API functions. * fix: improve type accuracy and React hooks in evaluation components - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations - Add aria-label to GradeBadge for screen reader accessibility
1 parent 84a7f78 commit 8d9d767

File tree

17 files changed

+2103
-2
lines changed

17 files changed

+2103
-2
lines changed

src/eval/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2+
# SPDX-License-Identifier: MIT
3+
4+
"""
5+
Report Quality Evaluation Module for DeerFlow.
6+
7+
This module provides objective methods to evaluate generated report quality,
8+
including automated metrics and LLM-based evaluation.
9+
"""
10+
11+
from .evaluator import ReportEvaluator
12+
from .metrics import ReportMetrics, compute_metrics
13+
from .llm_judge import LLMJudge, evaluate_with_llm
14+
15+
__all__ = [
16+
"ReportEvaluator",
17+
"ReportMetrics",
18+
"compute_metrics",
19+
"LLMJudge",
20+
"evaluate_with_llm",
21+
]

src/eval/evaluator.py

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2+
# SPDX-License-Identifier: MIT
3+
4+
"""
5+
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
6+
"""
7+
8+
import logging
9+
from dataclasses import dataclass
10+
from typing import Any, Dict, Optional
11+
12+
from .llm_judge import EvaluationResult, LLMJudge
13+
from .metrics import ReportMetrics, compute_metrics, get_word_count_target
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge.

    Aggregates the deterministic automated metrics with the optional
    LLM-as-Judge result into a single final score, letter grade, and
    human-readable summary.
    """

    # Deterministic, automated measurements of the report.
    metrics: ReportMetrics
    # LLM-as-Judge result; None when LLM evaluation is disabled or failed.
    llm_evaluation: Optional[EvaluationResult]
    # Combined score on a 0-10 scale (rounded to 2 decimals by the evaluator).
    final_score: float
    # Letter grade derived from final_score (A+ through F).
    grade: str
    # Human-readable summary text (markdown-style formatting).
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (e.g. for JSON serialization)."""
        return {
            "metrics": self.metrics.to_dict(),
            # Preserve None so consumers can distinguish "no LLM run".
            "llm_evaluation": (
                self.llm_evaluation.to_dict() if self.llm_evaluation else None
            ),
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }
39+
40+
41+
def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade.

    Scans a descending threshold table and returns the grade for the
    first threshold the score meets or exceeds; anything below 4.0 is F.
    """
    grade_scale = (
        (9.0, "A+"),
        (8.5, "A"),
        (8.0, "A-"),
        (7.5, "B+"),
        (7.0, "B"),
        (6.5, "B-"),
        (6.0, "C+"),
        (5.5, "C"),
        (5.0, "C-"),
        (4.0, "D"),
    )
    for threshold, grade in grade_scale:
        if score >= threshold:
            return grade
    return "F"
65+
66+
67+
class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        # Only build the judge when LLM evaluation is enabled; evaluate()
        # re-checks both flags before calling it.
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%

        Args:
            metrics: Automated measurements produced by compute_metrics()
            report_style: Style key used to look up the word-count target

        Returns:
            Weighted score on a 0-10 scale, rounded to 2 decimals.
        """
        score = 0.0

        # Section coverage ratio (0-1) scaled to 0-10.
        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        # 10 or more citations earn full marks.
        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

        # Word count: full marks inside the target band; short reports scale
        # linearly (capped at 8); overlong reports lose 5 points per 100%
        # excess but never drop below 5.
        target = get_word_count_target(report_style)
        if target:  # defensive: skip the component entirely if no target
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20

        # 5 or more unique sources earn full marks.
        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        # 3 or more images earn full marks.
        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable (markdown-formatted) evaluation summary.

        Args:
            metrics: Automated metrics to list in the summary
            llm_eval: Optional LLM result; its section is omitted when None
            final_score: Combined score shown in the header line
            grade: Letter grade shown in the header line

        Returns:
            Newline-joined summary string.
        """
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )

        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            # Cap strengths/weaknesses at three each to keep the summary short.
            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")

            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                # Degrade gracefully: a failed LLM call must not block the
                # metrics-only result. Lazy %-style args avoid string
                # formatting when the warning level is disabled.
                logger.warning("LLM evaluation failed, using metrics only: %s", e)

        # Blend 40/60 in favor of the LLM judge; fall back to metrics alone
        # when the LLM result is missing or degenerate (overall_score <= 0).
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score

        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)

        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE(review): asyncio.run() raises RuntimeError when invoked from a
        running event loop; callers already inside async code should await
        evaluate() directly instead of using this wrapper.
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with keys "metrics", "score", and "grade"
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }

0 commit comments

Comments
 (0)