from ezvals import eval, EvalContext
async def run_agent(ctx: EvalContext):
    """Run the RAG agent on the case input and record its answer.

    Stores the agent's response on ``ctx.output`` and the retrieved
    source documents under ``ctx.metadata["source_docs"]`` so the
    downstream evaluator can judge grounding against them.
    """
    agent_result = await run_rag_agent(ctx.input)
    ctx.output = agent_result.response
    # Surface the retrieval context for the hallucination judge.
    ctx.metadata["source_docs"] = agent_result.metadata["source_docs"]
@eval(
    target=run_agent,
    dataset="rag_qa",
    cases=[
        {"input": "What is our refund policy?", "reference": "30-day money-back guarantee"},
        {"input": "How do I reset my password?", "reference": "Click 'Forgot Password' on the login page"},
        {"input": "What payment methods do you accept?", "reference": "Visa, Mastercard, and PayPal"},
        {"input": "How long does shipping take?", "reference": "3-5 business days for standard shipping"},
        {"input": "Can I change my order after placing it?", "reference": "Within 1 hour of placing the order"},
        # ... hundreds more rows
    ],
)
async def test_rag_agent(ctx: EvalContext):
    """Evaluate a RAG Agent for pass and hallucinations"""
    # First judge: is the answer grounded in the retrieved documents?
    halluc_score, halluc_notes = await hallucination_judge(
        answer=ctx.output,
        sources=ctx.metadata["source_docs"],
    )
    ctx.store(scores={"value": halluc_score, "key": "hallucination", "notes": halluc_notes})

    # Second judge: does the answer match the case's reference answer?
    verdict = await pass_judge(
        answer=ctx.output,
        reference=ctx.reference,
    )
    # A failed judgement fails the case; the explanation becomes the message.
    assert verdict.is_correct, verdict.explanation