// evals/error_report.go
package main
import (
"context"
"fmt"
"sort"
)
// FailureBreakdown summarizes the failing cases that share one primary
// failure mode.
type FailureBreakdown struct {
// Mode is the failure mode this entry aggregates.
Mode FailureMode `json:"mode"`
// Count is the number of failing cases attributed to Mode.
Count int `json:"count"`
// Percentage is Count as a share of all failing cases, on a 0-100 scale.
Percentage float64 `json:"percentage"`
// AvgLLMScore is the mean LLM score across the cases counted in Count.
AvgLLMScore float64 `json:"avg_llm_score"`
Examples []string `json:"examples"` // queries of the first 3 example cases
}
// ErrorReport is the complete error-analysis report produced by AnalyzeAll.
type ErrorReport struct {
// TotalAnalyzed is the number of results handed to AnalyzeAll, including
// the ones that passed and were therefore not analyzed further.
TotalAnalyzed int `json:"total_analyzed"`
// FailingCases is the number of results that fell below the score thresholds.
FailingCases int `json:"failing_cases"`
// FailRate is FailingCases divided by TotalAnalyzed, as a fraction (not a
// percentage); PrintReport multiplies it by 100 for display.
FailRate float64 `json:"fail_rate"`
// Breakdown holds one entry per failure mode, sorted by Count descending.
Breakdown []FailureBreakdown `json:"breakdown"`
// TopIssues holds formatted one-line summaries of at most the first three
// Breakdown entries.
TopIssues []string `json:"top_issues"`
// Recommendations holds the suggested follow-up actions derived from the
// failure modes present in Breakdown.
Recommendations []string `json:"recommendations"`
}
// AnalyzeAll runs analyzer over every low-scoring result and aggregates the
// outcomes into an ErrorReport.
//
// A result counts as failing when its LLMScore is below 3.5 or its
// PrecisionAt5 is below 0.4; all other results are skipped and contribute only
// to TotalAnalyzed. Failing results are grouped by the analysis's
// PrimaryFailure, and the report's Breakdown is sorted by case count in
// descending order. Modes with equal counts keep the order in which they first
// appeared in results, so the report is deterministic for a given input.
//
// For an empty results slice the returned report has FailRate 0 (rather than
// NaN, which encoding/json cannot marshal).
func AnalyzeAll(ctx context.Context, results []EvalResult, analyzer *ErrorAnalyzer) *ErrorReport {
	report := &ErrorReport{TotalAnalyzed: len(results)}

	// Group analyses by primary failure mode. modeOrder remembers first-seen
	// order so the breakdown does not depend on Go's randomized map iteration.
	byMode := make(map[FailureMode][]ErrorAnalysis)
	var modeOrder []FailureMode
	for _, result := range results {
		// Only low-scoring cases are analyzed (LLMScore < 3.5 or PrecisionAt5 < 0.4).
		if result.LLMScore >= 3.5 && result.PrecisionAt5 >= 0.4 {
			continue
		}
		analysis := analyzer.Analyze(ctx, result)
		mode := analysis.PrimaryFailure
		if _, seen := byMode[mode]; !seen {
			modeOrder = append(modeOrder, mode)
		}
		byMode[mode] = append(byMode[mode], analysis)
		report.FailingCases++
	}

	// Guard the division: 0/0 in float64 is NaN, not a panic, and would
	// silently poison the report.
	if len(results) > 0 {
		report.FailRate = float64(report.FailingCases) / float64(len(results))
	}

	// Build one breakdown entry per failure mode. FailingCases is necessarily
	// non-zero whenever modeOrder has entries, so the percentage is well defined.
	for _, mode := range modeOrder {
		list := byMode[mode]
		var totalScore float64
		var examples []string
		for i, a := range list {
			totalScore += a.LLMScore
			if i < 3 {
				examples = append(examples, a.Query)
			}
		}
		report.Breakdown = append(report.Breakdown, FailureBreakdown{
			Mode:        mode,
			Count:       len(list),
			Percentage:  float64(len(list)) / float64(report.FailingCases) * 100,
			AvgLLMScore: totalScore / float64(len(list)),
			Examples:    examples,
		})
	}

	// Sort by count, descending; the stable sort preserves first-seen order
	// among ties.
	sort.SliceStable(report.Breakdown, func(i, j int) bool {
		return report.Breakdown[i].Count > report.Breakdown[j].Count
	})

	// Derive the top issues and the recommendations from the sorted breakdown.
	report.TopIssues = generateTopIssues(report.Breakdown)
	report.Recommendations = generateRecommendations(report.Breakdown)
	return report
}
// generateTopIssues formats a one-line summary for each of the first three
// entries of breakdown (fewer if breakdown is shorter). Each summary contains
// the failure mode, its case count, its percentage and its average LLM score.
// It returns nil when breakdown is empty.
func generateTopIssues(breakdown []FailureBreakdown) []string {
	// Trim to the leading entries up front instead of breaking out of the loop.
	top := breakdown
	if len(top) > 3 {
		top = top[:3]
	}

	var issues []string
	for _, entry := range top {
		line := fmt.Sprintf(
			"%s:%d个case(%.1f%%),平均LLM评分%.2f",
			entry.Mode, entry.Count, entry.Percentage, entry.AvgLLMScore,
		)
		issues = append(issues, line)
	}
	return issues
}
// generateRecommendations maps each failure mode in breakdown to a fixed
// recommendation string, preserving the order of breakdown. Modes without a
// known recommendation are skipped. It returns nil when nothing matches.
func generateRecommendations(breakdown []FailureBreakdown) []string {
	var recs []string
	for _, entry := range breakdown {
		var advice string
		switch entry.Mode {
		case FailureNoRelevantChunk:
			advice = "优先级:HIGH - 扩充文档库或检查是否有内容缺失"
		case FailureChunkRankedLow:
			advice = "优先级:HIGH - 尝试hybrid search或更好的embedding模型"
		case FailureHallucination:
			advice = "优先级:MEDIUM - 强化prompt中的grounding指令"
		case FailureIncomplete:
			advice = "优先级:LOW - 优化prompt中的输出格式要求"
		default:
			// No recommendation is defined for this mode.
			continue
		}
		recs = append(recs, advice)
	}
	return recs
}
// PrintReport writes a human-readable rendering of report to standard output:
// the totals, the per-mode breakdown with its example queries, the numbered
// top issues, and the recommendations.
func PrintReport(report *ErrorReport) {
	fmt.Println("\n========== 错误分析报告 ==========")
	fmt.Printf("分析总数:%d\n", report.TotalAnalyzed)
	// FailRate is stored as a fraction; scale it to a percentage for display.
	failPercent := report.FailRate * 100
	fmt.Printf("失败案例:%d(%.1f%%)\n", report.FailingCases, failPercent)

	fmt.Println("\n---------- 错误分布 ----------")
	for i := range report.Breakdown {
		entry := report.Breakdown[i]
		fmt.Printf(" %-40s %3d个 %.1f%% LLM均分=%.2f\n",
			entry.Mode, entry.Count, entry.Percentage, entry.AvgLLMScore)
		for j := range entry.Examples {
			fmt.Printf(" - %s\n", entry.Examples[j])
		}
	}

	fmt.Println("\n---------- 主要问题 ----------")
	for idx := 0; idx < len(report.TopIssues); idx++ {
		// Issues are numbered starting from 1.
		fmt.Printf(" %d. %s\n", idx+1, report.TopIssues[idx])
	}

	fmt.Println("\n---------- 改进建议 ----------")
	for idx := 0; idx < len(report.Recommendations); idx++ {
		fmt.Printf(" * %s\n", report.Recommendations[idx])
	}
}