评估与优化 #
概述 #
评估是 RAG 应用开发中至关重要的一环。本章介绍如何评估检索质量、响应质量和整体系统性能。
text
┌─────────────────────────────────────────────────────────────┐
│ RAG 评估维度 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 检索评估 │
│ ├── 召回率(Recall) │
│ ├── 精确率(Precision) │
│ ├── MRR(平均倒数排名) │
│ └── NDCG(归一化折损累积增益) │
│ │
│ 响应评估 │
│ ├── 相关性(Relevancy) │
│ ├── 忠实度(Faithfulness) │
│ ├── 正确性(Correctness) │
│ └── 语义相似度 │
│ │
│ 端到端评估 │
│ ├── 上下文精确度 │
│ ├── 上下文召回率 │
│ ├── 答案相关性 │
│ └── 答案忠实度 │
│ │
└─────────────────────────────────────────────────────────────┘
内置评估器 #
相关性评估 #
python
# Relevancy evaluation: LLM-as-judge check that the response addresses the query.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.evaluation import RelevancyEvaluator

# Build a query engine over the local ./data directory.
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Uses the global Settings.llm as the judge model by default.
evaluator = RelevancyEvaluator()

response = query_engine.query("你的问题")
eval_result = evaluator.evaluate_response(
    query="你的问题",
    response=response,
)

print(f"是否相关: {eval_result.passing}")
print(f"分数: {eval_result.score}")
print(f"反馈: {eval_result.feedback}")
忠实度评估 #
python
# Faithfulness evaluation: judges whether the answer is supported by the
# retrieved context, so only the response (with its source nodes) is passed.
from llama_index.core.evaluation import FaithfulnessEvaluator

evaluator = FaithfulnessEvaluator()

# NOTE(review): `query_engine` is assumed to come from the previous snippet.
response = query_engine.query("你的问题")
eval_result = evaluator.evaluate_response(
    response=response,
)

print(f"是否忠实: {eval_result.passing}")
print(f"分数: {eval_result.score}")
正确性评估 #
python
# Correctness evaluation: compares the generated answer to a reference answer.
from llama_index.core.evaluation import CorrectnessEvaluator

evaluator = CorrectnessEvaluator()

# NOTE(review): `query_engine` is assumed to come from an earlier snippet.
response = query_engine.query("你的问题")
eval_result = evaluator.evaluate_response(
    query="你的问题",
    response=response,
    reference="参考答案",
)

print(f"是否正确: {eval_result.passing}")
print(f"分数: {eval_result.score}")
语义相似度评估 #
python
# Semantic similarity: no judge LLM — compares the embedding of the response
# against the embedding of the reference answer.
from llama_index.core.evaluation import SemanticSimilarityEvaluator
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding()
evaluator = SemanticSimilarityEvaluator(
    embed_model=embed_model,
)

# NOTE(review): `response` is assumed to come from an earlier query call.
eval_result = evaluator.evaluate_response(
    query="你的问题",
    response=response,
    reference="参考答案",
)
print(f"相似度分数: {eval_result.score}")
批量评估 #
评估数据集 #
python
# Manual batch evaluation: run three evaluators over a small question set
# and report the pass rate per metric.
from llama_index.core.evaluation import (
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    CorrectnessEvaluator,
)
from llama_index.core import VectorStoreIndex

# NOTE(review): `documents` is assumed to be loaded by an earlier snippet.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Questions are paired index-by-index with reference answers.
eval_questions = [
    "问题1",
    "问题2",
    "问题3",
]
eval_answers = [
    "参考答案1",
    "参考答案2",
    "参考答案3",
]

relevancy_evaluator = RelevancyEvaluator()
faithfulness_evaluator = FaithfulnessEvaluator()
correctness_evaluator = CorrectnessEvaluator()

# One pass/fail flag per question, per metric.
results = {
    "relevancy": [],
    "faithfulness": [],
    "correctness": [],
}

for question, answer in zip(eval_questions, eval_answers):
    response = query_engine.query(question)

    # Relevancy: does the response address the query?
    relevancy_result = relevancy_evaluator.evaluate_response(
        query=question,
        response=response,
    )
    results["relevancy"].append(relevancy_result.passing)

    # Faithfulness: is the response grounded in the retrieved context?
    faithfulness_result = faithfulness_evaluator.evaluate_response(
        response=response,
    )
    results["faithfulness"].append(faithfulness_result.passing)

    # Correctness: does the response match the reference answer?
    correctness_result = correctness_evaluator.evaluate_response(
        query=question,
        response=response,
        reference=answer,
    )
    results["correctness"].append(correctness_result.passing)

print(f"相关性通过率: {sum(results['relevancy']) / len(results['relevancy']):.2%}")
print(f"忠实度通过率: {sum(results['faithfulness']) / len(results['faithfulness']):.2%}")
print(f"正确性通过率: {sum(results['correctness']) / len(results['correctness']):.2%}")
使用 BatchEvalRunner #
python
# BatchEvalRunner: run several evaluators concurrently over a query set.
from llama_index.core.evaluation import BatchEvalRunner

# NOTE(review): the evaluators and `query_engine`/`eval_questions` are
# assumed to come from the previous snippet.
runner = BatchEvalRunner(
    evaluators={
        "relevancy": relevancy_evaluator,
        "faithfulness": faithfulness_evaluator,
    },
    show_progress=True,
)

# NOTE: top-level `await` works in notebooks; in a plain script, wrap this
# in an async function and call it with asyncio.run().
eval_results = await runner.aevaluate_queries(
    query_engine,
    queries=eval_questions,
)

for name, results in eval_results.items():
    passing = sum(1 for r in results if r.passing)
    print(f"{name}: {passing}/{len(results)} 通过")
RAGAS 评估 #
安装 RAGAS #
bash
pip install ragas
使用 RAGAS #
python
# RAGAS evaluation over a hand-built dataset of (question, answer,
# contexts, ground_truth) samples.
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    # NOTE(review): `context_relevancy` was renamed (context_precision) in
    # newer ragas releases — verify against the installed version.
    context_relevancy,
    context_recall,
)
from datasets import Dataset

# Lists are aligned index-by-index; one entry per evaluation sample.
questions = ["问题1", "问题2"]
answers = ["回答1", "回答2"]
contexts = [["上下文1"], ["上下文2"]]  # each sample carries a list of context strings
ground_truths = ["参考答案1", "参考答案2"]

data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}
dataset = Dataset.from_dict(data)

results = evaluate(
    dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_relevancy,
        context_recall,
    ],
)
print(results)
LlamaIndex + RAGAS #
python
# RAGAS + LlamaIndex integration: the integration queries the engine
# itself, so only the questions need to be supplied.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from ragas.integrations.llama_index import evaluate

# NOTE(review): `documents` is assumed to be loaded by an earlier snippet.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

questions = ["问题1", "问题2", "问题3"]

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,  # NOTE(review): renamed in newer ragas — verify version
)

results = evaluate(
    query_engine,
    questions,
    metrics=[faithfulness, answer_relevancy, context_relevancy],
)
print(results)
TruLens 评估 #
安装 TruLens #
bash
pip install trulens-eval
基本用法 #
python
# TruLens instrumentation of a LlamaIndex query engine with three feedback
# functions: groundedness, answer relevance, and context relevance.
#
# Fixes over the original snippet:
#   * `provider` was used but never defined — an OpenAI feedback provider
#     is now created explicitly.
#   * `np` was used (np.mean) but never imported.
#   * `grounded` was bound to `Groundedness.groundedness_measure` (an
#     attribute access, not an instance) and then dereferenced further;
#     it must be a Groundedness instance built from a provider.
#   * `TruChain` wraps LangChain apps; a LlamaIndex engine needs `TruLlama`.
import numpy as np

from trulens_eval import Feedback, Tru, TruLlama
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI
from llama_index.core import VectorStoreIndex

tru = Tru()

# NOTE(review): `documents` is assumed to be loaded by an earlier snippet.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

from trulens_eval.app import App

# Selector pointing at the retrieved context nodes of the query engine.
context = App.select_context(query_engine)

# LLM provider that scores all three feedback functions.
provider = OpenAI()

# Groundedness: is every statement in the answer supported by the context?
grounded = Groundedness(groundedness_provider=provider)
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons)
    .on(context.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: does the output address the input question?
f_qa_relevance = (
    Feedback(provider.relevance_with_cot_reasons)
    .on_input_output()
)

# Context relevance: is each retrieved chunk relevant to the question?
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

tru_query_engine = TruLlama(
    query_engine,
    app_id="LlamaIndex_App",
    feedbacks=[f_groundedness, f_qa_relevance, f_context_relevance],
)

response = tru_query_engine.query("你的问题")
tru.run_dashboard()
检索评估 #
Hit Rate #
python
def calculate_hit_rate(retrieved_ids, relevant_ids, k=5):
    """Hit@k: 1 if any of the first *k* retrieved ids is relevant, else 0."""
    relevant = set(relevant_ids)
    return int(any(doc_id in relevant for doc_id in retrieved_ids[:k]))


def evaluate_retriever(retriever, questions, relevant_docs):
    """Average Hit@k of *retriever* over paired (question, relevant-ids) data.

    Raises ZeroDivisionError on an empty question set, matching the
    original behavior.
    """
    total = 0
    count = 0
    for query, gold_ids in zip(questions, relevant_docs):
        hits = retriever.retrieve(query)
        retrieved = [hit.node.node_id for hit in hits]
        total += calculate_hit_rate(retrieved, gold_ids)
        count += 1
    return total / count
MRR(Mean Reciprocal Rank) #
python
def calculate_mrr(retrieved_ids, relevant_ids):
    """Reciprocal rank (1-based) of the first relevant id; 0 if none match."""
    for rank, doc_id in enumerate(retrieved_ids, start=1):
        if doc_id in relevant_ids:
            return 1 / rank
    return 0


def evaluate_mrr(retriever, questions, relevant_docs):
    """Mean reciprocal rank of *retriever* over the question set.

    Raises ZeroDivisionError on an empty question set, matching the
    original behavior.
    """
    scores = [
        calculate_mrr(
            [hit.node.node_id for hit in retriever.retrieve(query)],
            gold_ids,
        )
        for query, gold_ids in zip(questions, relevant_docs)
    ]
    return sum(scores) / len(scores)
性能监控 #
回复时间监控 #
python
# Latency monitoring: wall-clock time per query (retrieval + generation).
import time
from llama_index.core import VectorStoreIndex

# NOTE(review): `documents` is assumed to be loaded by an earlier snippet.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

questions = ["问题1", "问题2", "问题3"]

response_times = []
for question in questions:
    start_time = time.time()
    response = query_engine.query(question)
    end_time = time.time()

    response_time = end_time - start_time
    response_times.append(response_time)
    print(f"问题: {question}")
    print(f"回复时间: {response_time:.2f}s")

print(f"\n平均回复时间: {sum(response_times) / len(response_times):.2f}s")
Token 使用监控 #
python
# Token-usage monitoring via a global callback handler.
#
# Fixes over the original snippet:
#   * `tiktoken` was used without being imported.
#   * The "LLM 提示 Token" (prompt tokens) line printed
#     `total_llm_token_count`, which is prompt + completion tokens;
#     the matching attribute is `prompt_llm_token_count`.
import tiktoken

from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core import Settings

# Count tokens for every LLM / embedding call routed through Settings.
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4o").encode
)
Settings.callback_manager = CallbackManager([token_counter])

# NOTE(review): `query_engine` is assumed to come from an earlier snippet.
response = query_engine.query("你的问题")

print(f"嵌入 Token: {token_counter.total_embedding_token_count}")
print(f"LLM 提示 Token: {token_counter.prompt_llm_token_count}")
print(f"总 Token: {token_counter.total_token_count}")
完整评估示例 #
python
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.evaluation import (
RelevancyEvaluator,
FaithfulnessEvaluator,
BatchEvalRunner,
)
os.environ["OPENAI_API_KEY"] = "sk-your-key"
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
eval_questions = [
"文档的主要内容是什么?",
"有哪些核心概念?",
"如何使用索引?",
]
relevancy_evaluator = RelevancyEvaluator()
faithfulness_evaluator = FaithfulnessEvaluator()
runner = BatchEvalRunner(
evaluators={
"relevancy": relevancy_evaluator,
"faithfulness": faithfulness_evaluator,
},
show_progress=True,
)
eval_results = await runner.aevaluate_queries(
query_engine,
queries=eval_questions,
)
print("\n=== 评估结果 ===\n")
for name, results in eval_results.items():
passing = sum(1 for r in results if r.passing)
avg_score = sum(r.score for r in results if r.score) / len(results)
print(f"{name}:")
print(f" 通过率: {passing}/{len(results)} ({passing/len(results):.1%})")
print(f" 平均分数: {avg_score:.2f}")
print()
下一步 #
掌握评估方法后,接下来学习 存储与持久化 了解如何持久化存储索引和数据!
最后更新:2026-03-30