Skip to content

Commit 8d9d767

Browse files
authored
feat(eval): add report quality evaluation module and UI integration (bytedance#776)
* feat(eval): add report quality evaluation module Addresses issue bytedance#773 - How to evaluate generated report quality objectively. This module provides two evaluation approaches: 1. Automated metrics (no LLM required): - Citation count and source diversity - Word count compliance per report style - Section structure validation - Image inclusion tracking 2. LLM-as-Judge evaluation: - Factual accuracy scoring - Completeness assessment - Coherence evaluation - Relevance and citation quality checks The combined evaluator provides a final score (1-10) and letter grade (A+ to F). Files added: - src/eval/__init__.py - src/eval/metrics.py - src/eval/llm_judge.py - src/eval/evaluator.py - tests/unit/eval/test_metrics.py - tests/unit/eval/test_evaluator.py * feat(eval): integrate report evaluation with web UI This commit adds the web UI integration for the evaluation module: Backend: - Add EvaluateReportRequest/Response models in src/server/eval_request.py - Add /api/report/evaluate endpoint to src/server/app.py Frontend: - Add evaluateReport API function in web/src/core/api/evaluate.ts - Create EvaluationDialog component with grade badge, metrics display, and optional LLM deep evaluation - Add evaluation button (graduation cap icon) to research-block.tsx toolbar - Add i18n translations for English and Chinese The evaluation UI allows users to: 1. View quick metrics-only evaluation (instant) 2. Optionally run deep LLM-based evaluation for detailed analysis 3. 
See grade (A+ to F), score (1-10), and metric breakdown * feat(eval): improve evaluation reliability and add LLM judge tests - Extract MAX_REPORT_LENGTH constant in llm_judge.py for maintainability - Add comprehensive unit tests for LLMJudge class (parse_response, calculate_weighted_score, evaluate with mocked LLM) - Pass reportStyle prop to EvaluationDialog for accurate evaluation criteria - Add researchQueries store map to reliably associate queries with research - Add getResearchQuery helper to retrieve query by researchId - Remove unused imports in test_metrics.py * fix(eval): use resolveServiceURL for evaluate API endpoint The evaluateReport function was using a relative URL '/api/report/evaluate' which sent requests to the Next.js server instead of the FastAPI backend. Changed to use resolveServiceURL() consistent with other API functions. * fix: improve type accuracy and React hooks in evaluation components - Fix get_word_count_target return type from Optional[Dict] to Dict since it always returns a value via default fallback - Fix useEffect dependency issue in EvaluationDialog using useRef to prevent unwanted re-evaluations - Add aria-label to GradeBadge for screen reader accessibility
1 parent 84a7f78 commit 8d9d767

File tree

17 files changed

+2103
-2
lines changed

17 files changed

+2103
-2
lines changed

src/eval/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2+
# SPDX-License-Identifier: MIT
3+
4+
"""
5+
Report Quality Evaluation Module for DeerFlow.
6+
7+
This module provides objective methods to evaluate generated report quality,
8+
including automated metrics and LLM-based evaluation.
9+
"""
10+
11+
from .evaluator import ReportEvaluator
12+
from .metrics import ReportMetrics, compute_metrics
13+
from .llm_judge import LLMJudge, evaluate_with_llm
14+
15+
__all__ = [
16+
"ReportEvaluator",
17+
"ReportMetrics",
18+
"compute_metrics",
19+
"LLMJudge",
20+
"evaluate_with_llm",
21+
]

src/eval/evaluator.py

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
2+
# SPDX-License-Identifier: MIT
3+
4+
"""
5+
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
6+
"""
7+
8+
import logging
9+
from dataclasses import dataclass
10+
from typing import Any, Dict, Optional
11+
12+
from .llm_judge import EvaluationResult, LLMJudge
13+
from .metrics import ReportMetrics, compute_metrics, get_word_count_target
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge.

    Aggregates the deterministic automated metrics with the optional
    LLM-as-Judge result into a single final score, letter grade, and
    human-readable summary.
    """

    # Deterministic, automated measurements of the report.
    metrics: ReportMetrics
    # LLM-as-Judge result; None when LLM evaluation is disabled or failed.
    llm_evaluation: Optional[EvaluationResult]
    # Combined score on a 0-10 scale (rounded to 2 decimals by the evaluator).
    final_score: float
    # Letter grade derived from final_score (A+ through F).
    grade: str
    # Human-readable summary text (markdown-style formatting).
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format (e.g. for JSON serialization)."""
        return {
            "metrics": self.metrics.to_dict(),
            # Preserve None so consumers can distinguish "no LLM run".
            "llm_evaluation": (
                self.llm_evaluation.to_dict() if self.llm_evaluation else None
            ),
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }
39+
40+
41+
def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade.

    Scans a descending threshold table and returns the grade for the
    first threshold the score meets or exceeds; anything below 4.0 is F.
    """
    grade_scale = (
        (9.0, "A+"),
        (8.5, "A"),
        (8.0, "A-"),
        (7.5, "B+"),
        (7.0, "B"),
        (6.5, "B-"),
        (6.0, "C+"),
        (5.5, "C"),
        (5.0, "C-"),
        (4.0, "D"),
    )
    for threshold, grade in grade_scale:
        if score >= threshold:
            return grade
    return "F"
65+
66+
67+
class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        # Only build the judge when LLM evaluation is enabled; evaluate()
        # re-checks both flags before calling it.
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%

        Args:
            metrics: Automated measurements produced by compute_metrics()
            report_style: Style key used to look up the word-count target

        Returns:
            Weighted score on a 0-10 scale, rounded to 2 decimals.
        """
        score = 0.0

        # Section coverage ratio (0-1) scaled to 0-10.
        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        # 10 or more citations earn full marks.
        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

        # Word count: full marks inside the target band; short reports scale
        # linearly (capped at 8); overlong reports lose 5 points per 100%
        # excess but never drop below 5.
        target = get_word_count_target(report_style)
        if target:  # defensive: skip the component entirely if no target
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20

        # 5 or more unique sources earn full marks.
        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        # 3 or more images earn full marks.
        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable (markdown-formatted) evaluation summary.

        Args:
            metrics: Automated metrics to list in the summary
            llm_eval: Optional LLM result; its section is omitted when None
            final_score: Combined score shown in the header line
            grade: Letter grade shown in the header line

        Returns:
            Newline-joined summary string.
        """
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )

        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            # Cap strengths/weaknesses at three each to keep the summary short.
            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")

            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                # Degrade gracefully: a failed LLM call must not block the
                # metrics-only result. Lazy %-style args avoid string
                # formatting when the warning level is disabled.
                logger.warning("LLM evaluation failed, using metrics only: %s", e)

        # Blend 40/60 in favor of the LLM judge; fall back to metrics alone
        # when the LLM result is missing or degenerate (overall_score <= 0).
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score

        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)

        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate.

        NOTE(review): asyncio.run() raises RuntimeError when invoked from a
        running event loop; callers already inside async code should await
        evaluate() directly instead of using this wrapper.
        """
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with keys "metrics", "score", and "grade"
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }

0 commit comments

Comments
 (0)