paperclip/evals/promptfoo/promptfooconfig.yaml

37 lines
1,019 B
YAML
Raw Normal View History

# Paperclip Agent Evals - Phase 0: Promptfoo Bootstrap
#
# Tests narrow heartbeat behaviors across models with deterministic assertions.
# Test cases are organized by category in tests/*.yaml files.
# See doc/plans/2026-03-13-agent-evals-framework.md for the full framework plan.
#
# Usage:
# cd evals/promptfoo && promptfoo eval
# promptfoo view # open results in browser
#
# Validate config before committing:
# promptfoo validate
#
# Requires OPENROUTER_API_KEY or individual provider keys.
description: "Paperclip heartbeat behavior evals"
prompts:
- file://prompts/heartbeat-system.txt
providers:
- id: openrouter:anthropic/claude-sonnet-4-20250514
label: claude-sonnet-4
- id: openrouter:openai/gpt-4.1
label: gpt-4.1
- id: openrouter:openai/codex-5.4
label: codex-5.4
- id: openrouter:google/gemini-2.5-pro
label: gemini-2.5-pro
defaultTest:
options:
transformVars: "{ ...vars, apiUrl: 'http://localhost:18080', runId: 'run-eval-001' }"
tests:
- file://tests/*.yaml