Eval Starter Kit — Prompt Engineering Mastery

What you get

A dataset format (JSONL) for eval inputs and expected behavior
A grader interface with three reference implementations: exact match, regex, LLM-as-judge
A runner that scores a prompt against the dataset and emits a diff against a baseline
A CI hook so prompt changes are eval'd on every PR

File layout

evals/
├── datasets/
│   └── support_triage.jsonl       # eval inputs + expected behavior
├── prompts/
│   └── support_triage.md          # the prompt under test (versioned)
├── graders/
│   ├── __init__.py
│   ├── exact.py
│   ├── regex.py
│   └── llm_judge.py
├── runner.py                       # the CLI: python -m evals.runner ...
├── results/
│   └── <run_id>.json              # one file per run, gitignored
└── baseline.json                   # the score we ship against

Dataset format

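Each line is one example. The fields you need depend on what you're testing: the property-check grader reads must_classify_as, must_contain, must_not_contain, and must_match, while the exact-match grader reads expected_output.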

{"id": "trg_001", "input": "I was charged twice for May.", "must_classify_as": "billing", "must_not_contain": ["technical"]}
{"id": "trg_002", "input": "The settings page won't load.", "must_classify_as": "technical"}
{"id": "trg_003", "input": "How do I reset my password?", "must_classify_as": "account"}
{"id": "trg_004", "input": "You guys are awesome!", "must_classify_as": "feedback"}
{"id": "trg_005", "input": "asdfasdf", "must_classify_as": "other"}

The grader interface

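Every grader is a function that takes (example, output) → {"passed": bool, "reasons": [str]}. That's the whole protocol; a grader may attach extra keys (the LLM judge also returns "raw_scores"), but callers should rely only on these two.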

# graders/exact.py
def grade_exact(example, output):
    """Grade by strict string equality, ignoring surrounding whitespace.

    ``output`` is compared with ``example["expected_output"]`` after both
    have been stripped; a missing ``expected_output`` is treated as an empty
    string.

    Returns:
        ``{"passed": bool, "reasons": [str]}`` — ``reasons`` holds a single
        expected-vs-actual message on mismatch and is empty on success.
    """
    want = example.get("expected_output", "").strip()
    got = output.strip()
    reasons = [] if got == want else [f"Expected '{want}', got '{got}'"]
    return {"passed": not reasons, "reasons": reasons}


# graders/regex.py
import re

def grade_property_checks(example, output):
    """Check a model output against the example's ``must_*`` properties.

    Recognised keys on ``example`` (all optional):

    * ``must_classify_as`` — the stripped, lower-cased output must equal this
      label (also stripped and lower-cased).
    * ``must_contain`` / ``must_not_contain`` — case-insensitive substrings
      that must / must not occur in the output.
    * ``must_match`` — regular expressions, each of which must be found
      somewhere in the raw (case-preserved) output via ``re.search``.

    Returns:
        ``{"passed": bool, "reasons": [str]}`` with one reason per violated
        property, in the order listed above.
    """
    lowered = output.lower()
    failures = []

    if "must_classify_as" in example:
        want = example["must_classify_as"].lower().strip()
        got = lowered.strip()
        if got != want:
            failures.append(f"Classification: expected '{want}', got '{got}'")

    failures.extend(
        f"Missing required substring: {needle!r}"
        for needle in example.get("must_contain", [])
        if needle.lower() not in lowered
    )
    failures.extend(
        f"Contains forbidden substring: {needle!r}"
        for needle in example.get("must_not_contain", [])
        if needle.lower() in lowered
    )
    failures.extend(
        f"Does not match pattern: {regex!r}"
        for regex in example.get("must_match", [])
        if re.search(regex, output) is None
    )

    return {"passed": not failures, "reasons": failures}

LLM-as-judge grader

# graders/llm_judge.py
import json
from anthropic import Anthropic

# Module-level client shared by every judge call. It is constructed with no
# explicit arguments, so credentials presumably come from the environment
# (the CI workflow exports ANTHROPIC_API_KEY) — TODO confirm.
client = Anthropic()

# Template for the judge request. `{question}` and `{response}` are filled in
# with str.format by grade_with_judge; the doubled braces on the last line are
# format escapes that render as a literal JSON skeleton.
# NOTE(review): the closing instruction asks for analysis *before* the JSON and
# also for "JSON only", while the JSON itself carries an "analysis" field.
# These requests conflict, so replies may contain prose around the object.
JUDGE_PROMPT = """You are evaluating an AI response.

Question:
{question}

Response:
{response}

Score the response on each dimension using the rubric below.

ACCURACY (1-5)
5 = All factual claims are correct and verifiable.
3 = Mostly correct with one minor unverifiable claim.
1 = Contains a clear factual error.

HELPFULNESS (1-5)
5 = Directly and completely answers the question.
3 = Partially answers; missing important detail.
1 = Doesn't answer the question.

Write 2-3 sentences of analysis first, then output JSON only:
{{"accuracy": N, "helpfulness": N, "analysis": "..."}}"""


def _extract_scores(text):
    """Return the last JSON object in ``text`` holding both scores, else None.

    Candidates are tried from the last ``{`` backwards, so braces inside the
    analysis string, or prose after the object, do not derail parsing.
    """
    decoder = json.JSONDecoder()
    start = text.rfind("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and all(
            isinstance(candidate.get(key), (int, float))
            and not isinstance(candidate.get(key), bool)
            for key in ("accuracy", "helpfulness")
        ):
            return candidate
        start = text.rfind("{", 0, start)
    return None


def grade_with_judge(example, output, threshold=4):
    """Grade ``output`` by asking a judge model to score it against a rubric.

    Args:
        example: Dataset row; ``example["input"]`` is shown to the judge as
            the question.
        output: The model response under evaluation.
        threshold: Minimum score required on *both* accuracy and helpfulness
            for the example to pass.

    Returns:
        ``{"passed": bool, "reasons": [str], "raw_scores": dict | None}``.
        When the judge's reply contains no usable score object the example is
        reported as failed (with the reply tail in ``reasons`` and
        ``raw_scores`` set to ``None``) instead of raising, so one malformed
        reply cannot abort an entire eval run.
    """
    prompt = JUDGE_PROMPT.format(
        question=example["input"], response=output
    )
    resp = client.messages.create(
        model="claude-3-5-sonnet-latest",
        max_tokens=400,
        messages=[{"role": "user", "content": prompt}],
    )
    # Guard against a reply with no content blocks.
    text = resp.content[0].text if resp.content else ""

    scores = _extract_scores(text)
    if scores is None:
        return {
            "passed": False,
            "reasons": [
                f"Judge reply had no parseable score object: {text[-200:]!r}"
            ],
            "raw_scores": None,
        }

    passed = (
        scores["accuracy"] >= threshold
        and scores["helpfulness"] >= threshold
    )
    reasons = []
    if not passed:
        reasons.append(
            f"accuracy={scores['accuracy']}, helpfulness={scores['helpfulness']}: "
            f"{scores.get('analysis', '')}"
        )
    return {"passed": passed, "reasons": reasons, "raw_scores": scores}

The runner

# runner.py
"""Run an eval. Usage:
  python -m evals.runner --prompt support_triage --dataset support_triage --baseline
"""
import argparse
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from anthropic import Anthropic

try:
    # Documented invocation is `python -m evals.runner` from the repository
    # root, where the graders are only importable under the `evals` package.
    from evals.graders.regex import grade_property_checks
except ModuleNotFoundError:
    # Fallback for running from inside evals/ (e.g. `python runner.py`),
    # where `graders` is a top-level package on sys.path.
    from graders.regex import grade_property_checks

# Shared API client, created once at import time and used by run_one.
client = Anthropic()
# Directory containing this file; datasets/, prompts/, results/ and
# baseline.json are all resolved relative to it.
ROOT = Path(__file__).parent


def load_dataset(name):
    """Load ``datasets/<name>.jsonl`` and return its examples as a list of dicts.

    Blank lines are skipped. Raises ``FileNotFoundError`` if the dataset does
    not exist and ``json.JSONDecodeError`` if a line is not valid JSON.
    """
    path = ROOT / "datasets" / f"{name}.jsonl"
    # Decode as UTF-8 explicitly rather than relying on the platform locale.
    text = path.read_text(encoding="utf-8")
    # Split on "\n" only: str.splitlines() also breaks on U+2028/U+2029/U+0085,
    # which may appear unescaped inside a JSON string and would cut a record
    # in half. A trailing "\r" from CRLF files is whitespace to json.loads.
    return [json.loads(line) for line in text.split("\n") if line.strip()]


def load_prompt(name):
    """Return the text of ``prompts/<name>.md``.

    The file is decoded as UTF-8 explicitly, so prompts containing non-ASCII
    characters load identically regardless of the platform's locale encoding.
    Raises ``FileNotFoundError`` if the prompt file does not exist.
    """
    return (ROOT / "prompts" / f"{name}.md").read_text(encoding="utf-8")


def run_one(example, prompt_template, model):
    """Run a single example through the prompt under test and grade the reply.

    The literal placeholder ``{user_message}`` in ``prompt_template`` is
    replaced with ``example["input"]``, the result is sent to ``model`` as one
    user message, and the stripped text of the first content block is graded
    with the property checks.

    Returns:
        A result row with keys ``id``, ``input``, ``output``, ``passed`` and
        ``reasons``.
    """
    user_text = example["input"]
    rendered = prompt_template.replace("{user_message}", user_text)

    response = client.messages.create(
        model=model,
        max_tokens=200,
        messages=[{"role": "user", "content": rendered}],
    )
    completion = response.content[0].text.strip()
    verdict = grade_property_checks(example, completion)

    record = {"id": example["id"], "input": user_text, "output": completion}
    record["passed"] = verdict["passed"]
    record["reasons"] = verdict["reasons"]
    return record


def main():
    """CLI entry point: run the eval, save the run, and compare to the baseline.

    Steps:
      1. Parse ``--prompt``, ``--dataset``, ``--model`` and ``--baseline``.
      2. Run every dataset example through the prompt on a small thread pool.
      3. Write a summary to ``results/<prompt>_<unix time>.json``.
      4. Without ``--baseline``: print a diff against ``baseline.json`` when
         it exists, then the score. With ``--baseline``: overwrite
         ``baseline.json`` with this run instead.

    Exits with a usage error if the dataset contains no examples.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", required=True)
    parser.add_argument("--dataset", required=True)
    parser.add_argument("--model", default="claude-3-5-sonnet-latest")
    parser.add_argument("--baseline", action="store_true",
                        help="Save this run as the new baseline")
    args = parser.parse_args()

    dataset = load_dataset(args.dataset)
    if not dataset:
        # An empty eval is meaningless and would otherwise crash on the
        # score division below; fail loudly instead.
        parser.error(f"dataset {args.dataset!r} contains no examples")
    prompt = load_prompt(args.prompt)

    start = time.time()
    with ThreadPoolExecutor(max_workers=5) as pool:
        results = list(pool.map(
            lambda ex: run_one(ex, prompt, args.model), dataset
        ))
    elapsed = time.time() - start

    passed = sum(1 for r in results if r["passed"])
    total = len(results)

    summary = {
        "prompt": args.prompt,
        "dataset": args.dataset,
        "model": args.model,
        "passed": passed,
        "total": total,
        "score": round(passed / total, 4),
        "elapsed_seconds": round(elapsed, 2),
        "results": results,
    }

    # Save run. results/ is gitignored, so it does not exist on a fresh
    # checkout (e.g. in CI) and must be created before writing.
    run_id = f"{args.prompt}_{int(time.time())}"
    results_dir = ROOT / "results"
    results_dir.mkdir(parents=True, exist_ok=True)
    (results_dir / f"{run_id}.json").write_text(json.dumps(summary, indent=2))

    # Compare to baseline
    baseline_path = ROOT / "baseline.json"
    if baseline_path.exists() and not args.baseline:
        baseline = json.loads(baseline_path.read_text())
        diff_summary(baseline, summary)

    if args.baseline:
        baseline_path.write_text(json.dumps(summary, indent=2))
        print(f"\nSaved as new baseline: {summary['score']:.2%}")
        return

    print(f"\nScore: {passed}/{total} = {summary['score']:.2%}")


def diff_summary(baseline, current):
    """Print how ``current`` differs from ``baseline``.

    Both arguments are run summaries with a ``score`` and a ``results`` list
    of rows keyed by ``id``. Only ids present in both runs are compared.
    Output is the two scores, their difference in percentage points, up to
    five newly passing examples, and up to ten newly failing examples with
    their failure reasons. Nothing is returned.
    """
    before = {row["id"]: row for row in baseline["results"]}
    after = {row["id"]: row for row in current["results"]}

    fixed, regressed = [], []
    for example_id, row in after.items():
        if example_id in before:
            was_passing = before[example_id]["passed"]
            if row["passed"] and not was_passing:
                fixed.append(example_id)
            elif not row["passed"] and was_passing:
                regressed.append(example_id)

    print(f"\nBaseline: {baseline['score']:.2%}")
    print(f"Current:  {current['score']:.2%}")
    delta_pp = (current["score"] - baseline["score"]) * 100
    print(f"Δ: {delta_pp:+.2f} pp")

    if fixed:
        print(f"\nNewly passing ({len(fixed)}):")
        for example_id in fixed[:5]:
            snippet = after[example_id]["input"][:60]
            print(f"  + {example_id}: {snippet}")

    if regressed:
        print(f"\n⚠️  Newly FAILING ({len(regressed)}):")
        for example_id in regressed[:10]:
            row = after[example_id]
            print(f"  - {example_id}: {row['input'][:60]}")
            for why in row["reasons"]:
                print(f"      {why}")


if __name__ == "__main__":
    main()

CI integration

Run the eval on every pull request that touches a prompt or grader. Block merge if any baseline-passing example regresses.

# .github/workflows/eval.yml
name: Eval

on:
  pull_request:
    paths:
      - "evals/prompts/**"
      - "evals/graders/**"
      - "evals/runner.py"

jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install anthropic
      - name: Run eval
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        # pipefail makes the step fail when the runner itself crashes.
        # Without it the pipeline's status is tee's (always 0), the output
        # file has no "Newly FAILING" line, and the job passes by mistake.
        run: |
          set -o pipefail
          python -m evals.runner \
            --prompt support_triage \
            --dataset support_triage \
            | tee eval_output.txt
      - name: Fail on regression
        run: |
          if grep -q "Newly FAILING" eval_output.txt; then
            echo "::error::Eval regression detected"
            exit 1
          fi

Workflow

Build your dataset (start with 30-50 examples; grow it as you find production failures)
Write a baseline prompt; run --baseline to lock it in
Iterate: change the prompt → run the eval → see the diff
If newly-failing > 0, decide: fix the prompt, fix the dataset, or accept the trade-off explicitly
When you ship a new version, update the baseline

Don't over-engineer this. The starter kit is ~150 lines of code. Resist the urge to bring in a framework before you've used a homegrown version long enough to know what you actually need.