Evaluation
Measuring RAG pipeline quality along three axes: retrieval recall (did the retriever fetch the relevant context?), answer faithfulness (is the answer grounded in that context?), and answer relevance (does the answer actually address the question?).
Syntax

```text
faithfulness = overlap(answer, context)
relevance = similarity(answer, question)
```
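
To make the pseudocode above concrete, here is a minimal from-scratch sketch using plain token overlap. The `toy_*` functions are hypothetical stand-ins, not how RAGAS computes these scores; real frameworks replace the token overlap with LLM-based claim verification and embedding similarity.

```python
# Toy approximations of the three metrics via bag-of-words overlap.
# Illustrative heuristics only, not the RAGAS implementations.

def _tokens(text: str) -> set[str]:
    return set(text.lower().split())

def toy_faithfulness(answer: str, contexts: list[str]) -> float:
    """Fraction of answer tokens that appear in the retrieved context."""
    context_tokens = set().union(*(_tokens(c) for c in contexts))
    answer_tokens = _tokens(answer)
    return len(answer_tokens & context_tokens) / max(len(answer_tokens), 1)

def toy_relevance(answer: str, question: str) -> float:
    """Jaccard similarity between answer tokens and question tokens."""
    a, q = _tokens(answer), _tokens(question)
    return len(a & q) / max(len(a | q), 1)

def toy_context_recall(contexts: list[str], ground_truth: str) -> float:
    """Fraction of ground-truth tokens recovered by the retrieved context."""
    context_tokens = set().union(*(_tokens(c) for c in contexts))
    truth_tokens = _tokens(ground_truth)
    return len(truth_tokens & context_tokens) / max(len(truth_tokens), 1)

print(toy_faithfulness("Paris", ["Paris is the capital of France."]))  # 1.0
```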
Example

```python
# RAGAS evaluation (ragas 0.1.x schema; needs an LLM API key such as OPENAI_API_KEY)
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall

# evaluate() expects a Hugging Face Dataset, one row per question;
# "contexts" holds the list of retrieved chunks for each row.
dataset = Dataset.from_dict({
    "question": ["What is the capital of France?"],
    "answer": ["Paris"],
    "contexts": [["Paris is the capital of France..."]],
    "ground_truth": ["Paris"],
})

results = evaluate(dataset, metrics=[faithfulness, answer_relevancy, context_recall])
print(results)  # per-metric scores in [0, 1]
```
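
All three metrics are scored in [0, 1], higher is better. In recent ragas releases the result object returned by `evaluate` also exposes `to_pandas()` for per-row scores, though the exact API varies across versions.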