A minimal but working eval harness in Python. It's not a framework — it's the simplest thing that's actually useful, designed to be copied into your repo and adapted. Module 5 of the course explains the why; this resource is the how.
evals/
├── datasets/
│ └── support_triage.jsonl # eval inputs + expected behavior
├── prompts/
│ └── support_triage.md # the prompt under test (versioned)
├── graders/
│ ├── __init__.py
│ ├── exact.py
│ ├── regex.py
│ └── llm_judge.py
├── runner.py # the CLI: python -m evals.runner ...
├── results/
│ └── <run_id>.json # one file per run, gitignored
└── baseline.json # the score we ship against
Each line of the JSONL dataset file is one example. The fields you need depend on what you're testing.
{"id": "trg_001", "input": "I was charged twice for May.", "must_classify_as": "billing", "must_not_contain": ["technical"]}
{"id": "trg_002", "input": "The settings page won't load.", "must_classify_as": "technical"}
{"id": "trg_003", "input": "How do I reset my password?", "must_classify_as": "account"}
{"id": "trg_004", "input": "You guys are awesome!", "must_classify_as": "feedback"}
{"id": "trg_005", "input": "asdfasdf", "must_classify_as": "other"}
Every grader is a function that takes (example, output) and returns {"passed": bool, "reasons": [str]}. That's the whole protocol.
# graders/exact.py
def grade_exact(example, output):
    """Pass if output exactly equals the expected value.

    Both sides are compared after stripping surrounding whitespace.

    Args:
        example: Dataset record; its ``expected_output`` string is the target.
        output: Raw model output.

    Returns:
        ``{"passed": bool, "reasons": [str]}``. A record without an
        ``expected_output`` field always fails, with a reason saying so.
    """
    if "expected_output" not in example:
        # Defaulting to "" here would let a record with no expectation
        # silently pass whenever the model returned an empty string.
        return {
            "passed": False,
            "reasons": ["Example has no 'expected_output' field"],
        }
    expected = example["expected_output"].strip()
    actual = output.strip()
    if actual == expected:
        return {"passed": True, "reasons": []}
    return {
        "passed": False,
        "reasons": [f"Expected '{expected}', got '{actual}'"],
    }
# graders/regex.py
import re
def _as_list(value):
    """Wrap a bare string in a list so it is checked whole, not per character."""
    return [value] if isinstance(value, str) else value


def grade_property_checks(example, output):
    """Apply must_*/must_not_* property checks.

    Supported fields on ``example`` (all optional):
        must_classify_as: output must equal this label (case-insensitive,
            surrounding whitespace ignored).
        must_contain: substrings that must appear (case-insensitive).
        must_not_contain: substrings that must not appear (case-insensitive).
        must_match: regex patterns that must match somewhere in the output
            (``re.search``, case-sensitive).

    The three list-valued fields also accept a single string.

    Returns:
        ``{"passed": bool, "reasons": [str]}`` with one reason per failed check.
    """
    reasons = []
    # Lowercase once; every case-insensitive check below reuses it.
    lowered = output.lower()

    if "must_classify_as" in example:
        expected = example["must_classify_as"].lower().strip()
        actual = lowered.strip()
        if actual != expected:
            reasons.append(f"Classification: expected '{expected}', got '{actual}'")

    for substring in _as_list(example.get("must_contain", [])):
        if substring.lower() not in lowered:
            reasons.append(f"Missing required substring: {substring!r}")

    for substring in _as_list(example.get("must_not_contain", [])):
        if substring.lower() in lowered:
            reasons.append(f"Contains forbidden substring: {substring!r}")

    for pattern in _as_list(example.get("must_match", [])):
        if not re.search(pattern, output):
            reasons.append(f"Does not match pattern: {pattern!r}")

    return {"passed": not reasons, "reasons": reasons}
# graders/llm_judge.py
import json
from anthropic import Anthropic
# Shared API client used by grade_with_judge.
# NOTE(review): constructed with no arguments -- presumably it picks up
# credentials from the environment (the CI workflow exports
# ANTHROPIC_API_KEY); confirm against the SDK docs.
client = Anthropic()
# Rubric prompt for the judge model. It is filled in with str.format(), which
# is why the JSON example on the last line uses doubled braces: they render as
# literal "{" / "}" while {question} and {response} are substituted.
# NOTE(review): the closing instruction asks for prose analysis *before* the
# JSON and also for "JSON only", and the JSON itself has an "analysis" field.
# grade_with_judge extracts the JSON from the end of the reply, so leading
# prose is tolerated -- but consider rewording to remove the ambiguity.
JUDGE_PROMPT = """You are evaluating an AI response.
Question:
{question}
Response:
{response}
Score the response on each dimension using the rubric below.
ACCURACY (1-5)
5 = All factual claims are correct and verifiable.
3 = Mostly correct with one minor unverifiable claim.
1 = Contains a clear factual error.
HELPFULNESS (1-5)
5 = Directly and completely answers the question.
3 = Partially answers; missing important detail.
1 = Doesn't answer the question.
Write 2-3 sentences of analysis first, then output JSON only:
{{"accuracy": N, "helpfulness": N, "analysis": "..."}}"""
def _parse_judge_scores(text):
    """Return the last JSON object in ``text`` with numeric scores, or None."""
    decoder = json.JSONDecoder()
    # Walk backwards over every "{": the last one is not necessarily the
    # start of the object (the analysis string may itself contain braces).
    start = text.rfind("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and all(
            isinstance(candidate.get(key), (int, float))
            and not isinstance(candidate.get(key), bool)
            for key in ("accuracy", "helpfulness")
        ):
            return candidate
        start = text.rfind("{", 0, start)
    return None


def grade_with_judge(example, output, threshold=4, model="claude-3-5-sonnet-latest"):
    """Grade ``output`` by asking a judge model to score it against a rubric.

    Args:
        example: Dataset record; ``example["input"]`` is shown to the judge as
            the question.
        output: The response being graded.
        threshold: Minimum score required on BOTH accuracy and helpfulness.
        model: Judge model name passed to the API.

    Returns:
        ``{"passed": bool, "reasons": [str], "raw_scores": dict}``. When the
        judge's reply contains no JSON object with numeric ``accuracy`` and
        ``helpfulness``, the example fails with a reason quoting the reply and
        ``raw_scores`` is ``{}``, rather than raising and aborting the run.
    """
    prompt = JUDGE_PROMPT.format(
        question=example["input"], response=output
    )
    resp = client.messages.create(
        model=model,
        max_tokens=400,
        messages=[{"role": "user", "content": prompt}],
    )
    text = resp.content[0].text

    scores = _parse_judge_scores(text)
    if scores is None:
        return {
            "passed": False,
            "reasons": [f"Judge reply had no parseable scores: {text[:200]!r}"],
            "raw_scores": {},
        }

    passed = (
        scores["accuracy"] >= threshold
        and scores["helpfulness"] >= threshold
    )
    reasons = []
    if not passed:
        reasons.append(
            f"accuracy={scores['accuracy']}, helpfulness={scores['helpfulness']}: "
            f"{scores.get('analysis', '')}"
        )
    return {"passed": passed, "reasons": reasons, "raw_scores": scores}
# runner.py
"""Run an eval. Usage:
python -m evals.runner --prompt support_triage --dataset support_triage --baseline
"""
import argparse
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from anthropic import Anthropic
# `python -m evals.runner` (the documented entry point) puts the current
# working directory on sys.path, not evals/, so a top-level `graders` import
# cannot resolve there; import relative to this package instead. The fallback
# keeps `python runner.py` from inside evals/ working.
try:
    from .graders.regex import grade_property_checks
except ImportError:
    from graders.regex import grade_property_checks

# Shared API client reused by every run_one call.
client = Anthropic()
# Directory containing this file; datasets/, prompts/, results/ and
# baseline.json are all resolved relative to it.
ROOT = Path(__file__).parent
def load_dataset(name):
    """Load ``datasets/<name>.jsonl`` into a list of example dicts.

    Blank lines are skipped.

    Raises:
        FileNotFoundError: if the dataset file does not exist.
        json.JSONDecodeError: if a line is not valid JSON; the message is
            prefixed with the file path and 1-based line number.
    """
    path = ROOT / "datasets" / f"{name}.jsonl"
    examples = []
    # Encoding is pinned so the result does not depend on the machine's locale.
    # Iterating the file (rather than str.splitlines) splits only on real
    # newlines, so characters such as U+2028 inside a JSON string stay intact.
    with path.open(encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                examples.append(json.loads(line))
            except json.JSONDecodeError as err:
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return examples
def load_prompt(name):
    """Return the text of ``prompts/<name>.md``.

    Raises:
        FileNotFoundError: if the prompt file does not exist.
    """
    # Encoding is pinned so the prompt reads the same on every platform.
    return (ROOT / "prompts" / f"{name}.md").read_text(encoding="utf-8")
def run_one(example, prompt_template, model):
    """Run one example through the model and grade the reply.

    The literal ``{user_message}`` placeholder in ``prompt_template`` is
    replaced with ``example["input"]``, the prompt is sent as a single user
    message, and the stripped reply is graded with ``grade_property_checks``.

    Returns:
        A dict with keys ``id``, ``input``, ``output``, ``passed``, ``reasons``.
    """
    user_text = example["input"]
    rendered = prompt_template.replace("{user_message}", user_text)

    response = client.messages.create(
        messages=[{"role": "user", "content": rendered}],
        max_tokens=200,
        model=model,
    )
    completion = response.content[0].text.strip()

    verdict = grade_property_checks(example, completion)
    record = {"id": example["id"], "input": user_text, "output": completion}
    record["passed"] = verdict["passed"]
    record["reasons"] = verdict["reasons"]
    return record
def main():
    """CLI entry point: run a dataset against a prompt and report the score.

    Every example is sent to the model (five at a time), graded, and the full
    summary is written to ``results/<prompt>_<unix-time>.json``. Without
    ``--baseline`` the run is diffed against ``baseline.json`` when that file
    exists; with ``--baseline`` the run overwrites ``baseline.json`` instead.

    Exits with a usage error if the dataset contains no examples.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", required=True)
    parser.add_argument("--dataset", required=True)
    parser.add_argument("--model", default="claude-3-5-sonnet-latest")
    parser.add_argument("--baseline", action="store_true",
                        help="Save this run as the new baseline")
    args = parser.parse_args()

    dataset = load_dataset(args.dataset)
    if not dataset:
        # An empty dataset would otherwise surface as a ZeroDivisionError
        # when the score is computed below.
        parser.error(f"dataset {args.dataset!r} contains no examples")
    prompt = load_prompt(args.prompt)

    start = time.time()
    with ThreadPoolExecutor(max_workers=5) as pool:
        results = list(pool.map(
            lambda ex: run_one(ex, prompt, args.model), dataset
        ))
    elapsed = time.time() - start

    passed = sum(1 for r in results if r["passed"])
    total = len(results)
    summary = {
        "prompt": args.prompt,
        "dataset": args.dataset,
        "model": args.model,
        "passed": passed,
        "total": total,
        "score": round(passed / total, 4),
        "elapsed_seconds": round(elapsed, 2),
        "results": results,
    }

    # Save run. The directory holds only untracked files, so it will not
    # exist on a fresh checkout (e.g. in CI) and must be created first.
    results_dir = ROOT / "results"
    results_dir.mkdir(parents=True, exist_ok=True)
    run_id = f"{args.prompt}_{int(time.time())}"
    (results_dir / f"{run_id}.json").write_text(json.dumps(summary, indent=2))

    # Compare to baseline
    baseline_path = ROOT / "baseline.json"
    if baseline_path.exists() and not args.baseline:
        baseline = json.loads(baseline_path.read_text())
        diff_summary(baseline, summary)

    if args.baseline:
        baseline_path.write_text(json.dumps(summary, indent=2))
        print(f"\nSaved as new baseline: {summary['score']:.2%}")
        return

    print(f"\nScore: {passed}/{total} = {summary['score']:.2%}")
def diff_summary(baseline, current):
    """Print a score comparison and the examples whose pass/fail state flipped.

    Only ids present in both runs are compared. At most five newly passing and
    ten newly failing examples are listed; failing ones include their reasons.

    Args:
        baseline: Summary dict of the baseline run (``score``, ``results``).
        current: Summary dict of the run being compared.
    """
    before = {row["id"]: row for row in baseline["results"]}
    after = {row["id"]: row for row in current["results"]}

    # Ids that exist on both sides, in the current run's order.
    shared = [key for key in after if key in before]
    newly_passing = [
        key for key in shared
        if after[key]["passed"] and not before[key]["passed"]
    ]
    newly_failing = [
        key for key in shared
        if not after[key]["passed"] and before[key]["passed"]
    ]

    print(f"\nBaseline: {baseline['score']:.2%}")
    print(f"Current: {current['score']:.2%}")
    print(f"Δ: {(current['score'] - baseline['score']) * 100:+.2f} pp")

    if newly_passing:
        print(f"\nNewly passing ({len(newly_passing)}):")
        for key in newly_passing[:5]:
            print(f" + {key}: {after[key]['input'][:60]}")

    if newly_failing:
        print(f"\n⚠️ Newly FAILING ({len(newly_failing)}):")
        for key in newly_failing[:10]:
            row = after[key]
            print(f" - {key}: {row['input'][:60]}")
            for why in row["reasons"]:
                print(f" {why}")
# Run the CLI only when this module is executed directly (for example with
# `python -m evals.runner`), not when it is imported.
if __name__ == "__main__":
    main()
Run the eval on every pull request that touches a prompt or grader. Block merge if any baseline-passing example regresses.
# .github/workflows/eval.yml
name: Eval
on:
  pull_request:
    paths:
      - "evals/prompts/**"
      - "evals/graders/**"
      - "evals/runner.py"
jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install anthropic
      - name: Run eval
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        # Setting `shell: bash` explicitly makes Actions run the script with
        # `-o pipefail`. Without it the pipe's exit status is tee's, so a
        # crashing runner would leave this step green and the regression
        # check below would pass on an empty log.
        shell: bash
        run: |
          python -m evals.runner \
            --prompt support_triage \
            --dataset support_triage \
            | tee eval_output.txt
      - name: Fail on regression
        run: |
          if grep -q "Newly FAILING" eval_output.txt; then
            echo "::error::Eval regression detected"
            exit 1
          fi
Run with --baseline to lock the current score in as the new baseline.