#!/usr/bin/env bash
# Load 10K eval cases to Postgres and run the eval runners (optional limits).
#
# Usage:
#   LIMIT=<n> <this script>   # pass `--limit <n>` to every runner
#   <this script>             # run every runner without a limit
#
# Environment:
#   LIMIT  Optional. When non-empty it is forwarded to each runner as
#          `--limit "$LIMIT"`. May also be provided through "$ROOT/.env".
#
# NOTE(review): this header used to say "all five eval runners", but only the
# four runners listed below are invoked here — confirm whether a fifth runner
# is missing from the list.
set -euo pipefail

# Repository root is the parent of the directory containing this script; all
# relative paths below are resolved from there.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

# Export every variable assigned by .env (set -a) so the Python child
# processes inherit them.
if [[ -f "$ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$ROOT/.env"
  set +a
fi

# Build the optional --limit argument as an array so it expands to zero words
# when no limit was requested.
LIMIT="${LIMIT:-}"
LIMIT_ARG=()
run_scope="(full datasets)"
if [[ -n "${LIMIT}" ]]; then
  LIMIT_ARG=(--limit "${LIMIT}")
  run_scope="${LIMIT_ARG[*]}"
fi

echo "=== Loading 10,000 eval cases to Postgres ==="
python3 scripts/load_eval_pack_to_db.py
echo ""
echo "=== Eval pack status ==="
python3 scripts/load_eval_pack_to_db.py --verify-only
echo ""
echo "=== Running eval runners ${run_scope} ==="

# Runners are executed sequentially, in this order; set -e stops at the first
# one that exits non-zero.
runners=(
  eval/runners/run_golden_memory_eval.py
  eval/runners/run_adversarial_memory_eval.py
  eval/runners/run_governance_policy_eval.py
  eval/runners/run_retrieval_stress_eval.py
)
for runner in "${runners[@]}"; do
  # The ${arr[@]+...} guard keeps `set -u` from aborting on an empty array in
  # bash releases older than 4.4 (e.g. the bash 3.2 shipped with macOS).
  python3 "$runner" ${LIMIT_ARG[@]+"${LIMIT_ARG[@]}"}
done

echo ""
echo "Done. Summaries in eval/dashboards/"