{
"run_id": "r_8f3a91b04c",
"project": "support-triage",
"env": "prod",
"started_at": "2026-05-12T14:18:54.211Z",
"duration_ms": 3184,
"status": "failed",
"failure": {
"test": "draft_reply.no_hallucinated_refund",
"step": 3,
"expected": "$24.99",
"actual": "$249.00",
"severity": "high"
},
"model": {
"name": "gpt-5.1-mini",
"temperature": 0.2,
"prev": "gpt-5.0"
},
"trigger": "deploy",
"deploy": {
"sha": "a4f7c2e",
"by": "marin.n@safeship.dev"
},
"steps": [
{
"i": 1,
"tool": "classify_intent",
"kind": "llm",
"ok": true,
"ms": 142,
"in": "Hi, I was double charged…",
"out": { "intent": "refund_request", "confidence": 0.94 }
},
{
"i": 2,
"tool": "lookup_order",
"kind": "tool",
"ok": true,
"ms": 87,
"in": { "user_id": "u_77a2" },
"out": {
"order_id": "4821",
"total": 24.99,
"items": 2,
"status": "shipped"
}
},
{
"i": 3,
"tool": "draft_reply",
"kind": "llm",
"ok": false,
"ms": 1847,
"assertion": "output.contains($24.99)",
"matched": false,
"in": { "_note": "… see expanded view" },
"out": "…issued a refund of $249.00…",
"diff_against": "r_18b720c5d3 · gpt-5.0"
},
{
"i": 4,
"tool": "policy_check",
"kind": "tool",
"ok": true,
"ms": 61,
"out": { "passed": true, "flags": [] }
},
{
"i": 5,
"tool": "queue_for_send",
"kind": "tool",
"ok": true,
"ms": 23,
"out": { "queued": true }
}
],
"tokens": { "in": 412, "out": 98, "cost_usd": 0.0021 },
"tags": ["refund", "numeric", "hallucination", "regression"],
"parent_test": "draft_reply.suite",
"replay_url": "https://safeship.dev/r/8f3a91b04c"
}