Spaces:
Running on Zero
Running on Zero
fix: harden codebook and model fallback UX
Browse files
Co-authored-by: Codex <noreply@openai.com>
- analyzer.py +129 -18
- app.py +2 -2
- frontend/static/app.jsx +2 -2
- frontend/static/components.jsx +13 -1
- tests/test_analyzer.py +102 -0
analyzer.py
CHANGED
|
@@ -37,6 +37,7 @@ DIFFICULTY_SIGNALS = {
|
|
| 37 |
"not sure",
|
| 38 |
"risk",
|
| 39 |
"regression",
|
|
|
|
| 40 |
"however",
|
| 41 |
"but",
|
| 42 |
"unfortunately",
|
|
@@ -82,7 +83,7 @@ SHIFT_SIGNALS = {
|
|
| 82 |
"decompose",
|
| 83 |
"split",
|
| 84 |
"break down",
|
| 85 |
-
"
|
| 86 |
"revert",
|
| 87 |
"try another",
|
| 88 |
"workaround",
|
|
@@ -110,6 +111,34 @@ OUTCOME_SIGNALS = {
|
|
| 110 |
"partially",
|
| 111 |
}
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
ANALYSIS_STEPS = ("extract", "redact", "chart", "classify", "synthesize")
|
| 115 |
|
|
@@ -360,12 +389,18 @@ def build_episode(
|
|
| 360 |
span_messages: list[NarrativeMessage],
|
| 361 |
) -> DifficultyEpisode:
|
| 362 |
combined = "\n\n".join(message.text for message in span_messages)
|
| 363 |
-
difficulty_sentence =
|
| 364 |
intention = first_sentence_with(combined, INTENTION_SIGNALS) or first_sentence(combined)
|
| 365 |
-
shift_sentence =
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
-
difficulty_type = classify_difficulty(
|
|
|
|
|
|
|
| 369 |
appraisal = classify_appraisal(combined)
|
| 370 |
detour_type = classify_detour(combined)
|
| 371 |
resolution_mode = classify_resolution(combined)
|
|
@@ -409,8 +444,25 @@ def build_episode(
|
|
| 409 |
|
| 410 |
|
| 411 |
def signal_score(text: str, signals: set[str]) -> int:
|
| 412 |
-
|
| 413 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
|
| 415 |
|
| 416 |
def first_sentence(text: str) -> str:
|
|
@@ -424,6 +476,46 @@ def first_sentence_with(text: str, signals: set[str]) -> str:
|
|
| 424 |
return ""
|
| 425 |
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
def split_sentences(text: str) -> list[str]:
|
| 428 |
normalized = re.sub(r"\s+", " ", text).strip()
|
| 429 |
if not normalized:
|
|
@@ -435,10 +527,10 @@ def split_sentences(text: str) -> list[str]:
|
|
| 435 |
def classify_difficulty(text: str) -> str:
|
| 436 |
lowered = text.lower()
|
| 437 |
checks = [
|
| 438 |
-
("environment_blocker", ("dependency", "install", "permission", "auth", "network", "timeout", "sandbox", "environment", "ci", "build fail")),
|
| 439 |
("verification_difficulty", ("verify", "verification", "test", "reproduce", "confirmed", "validate", "cannot run", "not able to run")),
|
| 440 |
-
("compatibility_risk", ("regression", "break", "compatibility", "existing behavior", "side effect", "risk", "backward")),
|
| 441 |
("requirement_uncertainty", ("requirement", "spec", "unclear", "ambiguous", "user intent", "not specified", "scope unclear")),
|
|
|
|
|
|
|
| 442 |
("localization_difficulty", ("where", "locate", "which file", "module", "root cause", "grep", "search", "trace through")),
|
| 443 |
("architecture_complexity", ("architecture", "dependency", "shared", "coupling", "system structure", "cross-module", "data flow")),
|
| 444 |
("implementation_difficulty", ("implement", "tricky", "complex", "not sure how", "hard to", "edge case")),
|
|
@@ -467,7 +559,7 @@ def classify_detour(text: str) -> str:
|
|
| 467 |
lowered = text.lower()
|
| 468 |
checks = [
|
| 469 |
("premature_closure", ("done", "complete", "fixed", "should work")),
|
| 470 |
-
("rollback_or_reversal", ("
|
| 471 |
("verification_shift", ("verify with", "instead test", "different verification", "check by", "validate by")),
|
| 472 |
("hypothesis_switch", ("new hypothesis", "different hypothesis", "assumption was", "turns out")),
|
| 473 |
("workaround", ("workaround", "bypass", "skip the issue", "without fixing", "temporary fix")),
|
|
@@ -500,21 +592,40 @@ def classify_resolution(text: str) -> str:
|
|
| 500 |
|
| 501 |
def classify_outcome(text: str) -> str:
|
| 502 |
lowered = text.lower()
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
return "not_resolved"
|
| 505 |
-
if
|
| 506 |
return "needs_verification"
|
| 507 |
-
if
|
| 508 |
return "partially_resolved"
|
| 509 |
-
if
|
| 510 |
-
|
|
|
|
| 511 |
):
|
| 512 |
return "resolved_with_caveat"
|
| 513 |
-
if
|
| 514 |
if signal_score(text, DIFFICULTY_SIGNALS) >= 3 and "verified" not in lowered and "passes" not in lowered:
|
| 515 |
return "premature_success_claim"
|
| 516 |
return "resolved_with_confidence"
|
| 517 |
-
if
|
| 518 |
return "uncertain_but_proceeding"
|
| 519 |
return "unknown"
|
| 520 |
|
|
@@ -548,7 +659,7 @@ def classify_productive_detour(detour_type: str, outcome_claim: str, recovery_pa
|
|
| 548 |
|
| 549 |
def first_matching_code(lowered_text: str, checks: list[tuple[str, tuple[str, ...]]]) -> str:
|
| 550 |
for code, needles in checks:
|
| 551 |
-
if
|
| 552 |
return code
|
| 553 |
return "unknown"
|
| 554 |
|
|
|
|
| 37 |
"not sure",
|
| 38 |
"risk",
|
| 39 |
"regression",
|
| 40 |
+
"compatibility",
|
| 41 |
"however",
|
| 42 |
"but",
|
| 43 |
"unfortunately",
|
|
|
|
| 83 |
"decompose",
|
| 84 |
"split",
|
| 85 |
"break down",
|
| 86 |
+
"roll back",
|
| 87 |
"revert",
|
| 88 |
"try another",
|
| 89 |
"workaround",
|
|
|
|
| 111 |
"partially",
|
| 112 |
}
|
| 113 |
|
| 114 |
+
# Codebook of words and phrases treated as evidence that a sentence reports a
# problem. first_difficulty_sentence consults it when a sentence also carries
# an intention signal.
PROBLEM_EVIDENCE_SIGNALS = {
    # failures and defects
    "failed", "failure", "fails", "still failing", "test failing", "issue", "bug",
    # blockers
    "blocked", "blocker", "cannot", "can't", "could not",
    # uncertainty and risk
    "unclear", "ambiguous", "not sure", "risk", "regression", "unfortunately",
    # gaps and mismatches
    "missing", "incomplete", "permission", "dependency", "conflict", "mismatch", "unexpected",
}
|
| 141 |
+
|
| 142 |
|
| 143 |
ANALYSIS_STEPS = ("extract", "redact", "chart", "classify", "synthesize")
|
| 144 |
|
|
|
|
| 389 |
span_messages: list[NarrativeMessage],
|
| 390 |
) -> DifficultyEpisode:
|
| 391 |
combined = "\n\n".join(message.text for message in span_messages)
|
| 392 |
+
difficulty_sentence = first_difficulty_sentence(combined) or first_sentence(combined)
|
| 393 |
intention = first_sentence_with(combined, INTENTION_SIGNALS) or first_sentence(combined)
|
| 394 |
+
shift_sentence = first_sentence_after_with(
|
| 395 |
+
combined,
|
| 396 |
+
SHIFT_SIGNALS,
|
| 397 |
+
after_sentence=difficulty_sentence,
|
| 398 |
+
)
|
| 399 |
+
outcome_sentence = last_sentence_with(combined, OUTCOME_SIGNALS)
|
| 400 |
|
| 401 |
+
difficulty_type = classify_difficulty(difficulty_sentence)
|
| 402 |
+
if difficulty_type == "unknown":
|
| 403 |
+
difficulty_type = classify_difficulty(combined)
|
| 404 |
appraisal = classify_appraisal(combined)
|
| 405 |
detour_type = classify_detour(combined)
|
| 406 |
resolution_mode = classify_resolution(combined)
|
|
|
|
| 444 |
|
| 445 |
|
| 446 |
def signal_score(text: str, signals: set[str]) -> int:
    """Count how many distinct entries of *signals* occur in *text*.

    Each signal contributes at most one point regardless of how often it
    appears; matching is delegated to ``contains_signal``.
    """
    hits = 0
    for candidate in signals:
        if contains_signal(text, candidate):
            hits += 1
    return hits
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def contains_signal(text: str, signal: str) -> bool:
    """Match a codebook signal as a token or phrase, never as an arbitrary substring.

    The comparison is case-insensitive. A multi-word signal matches when its
    words appear in order separated by any run of whitespace, so a phrase
    still matches across a line break. An alphanumeric first or last
    character must not touch another ASCII letter or digit in *text*, which
    keeps ``"ci"`` from matching inside ``"specific"``.

    Args:
        text: The text to search.
        signal: The word or phrase to look for. Leading, trailing and
            repeated inner whitespace is ignored.

    Returns:
        True if the signal occurs in *text* as a standalone token or phrase;
        False otherwise, including when *signal* is empty or blank.
    """
    # Splitting on whitespace (rather than rewriting escaped spaces) makes a
    # signal written with a doubled space or a tab behave like the clean phrase.
    words = signal.lower().split()
    if not words:
        return False
    pattern = r"\s+".join(re.escape(word) for word in words)
    # Boundaries are added only next to alphanumeric edges so that signals
    # starting or ending with punctuation can still match.
    if words[0][0].isalnum():
        pattern = rf"(?<![a-z0-9]){pattern}"
    if words[-1][-1].isalnum():
        pattern = rf"{pattern}(?![a-z0-9])"
    return re.search(pattern, text.lower()) is not None
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def contains_any(text: str, needles: Iterable[str]) -> bool:
    """Report whether at least one of *needles* matches *text*.

    Needles are tried in iteration order with ``contains_signal`` and the
    search stops at the first match.
    """
    for needle in needles:
        if contains_signal(text, needle):
            return True
    return False
|
| 466 |
|
| 467 |
|
| 468 |
def first_sentence(text: str) -> str:
|
|
|
|
| 476 |
return ""
|
| 477 |
|
| 478 |
|
| 479 |
+
def last_sentence_with(text: str, signals: set[str]) -> str:
    """Return the last sentence of *text* that contains any of *signals*.

    Sentences are scanned from the end backwards; an empty string is
    returned when no sentence scores.
    """
    backwards = reversed(split_sentences(text))
    return next(
        (candidate for candidate in backwards if signal_score(candidate, signals)),
        "",
    )
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
def first_sentence_after_with(
    text: str,
    signals: set[str],
    *,
    after_sentence: str,
) -> str:
    """Return the first signal-bearing sentence that follows *after_sentence*.

    The scan starts just past the first occurrence of *after_sentence* among
    the sentences of *text*, or at the beginning when it does not occur. If
    nothing in that tail matches, the result falls back to
    ``first_sentence_with(text, signals)`` over the whole text.
    """
    sentences = split_sentences(text)
    try:
        resume_at = sentences.index(after_sentence) + 1
    except ValueError:
        # The anchor sentence is not present: scan from the start.
        resume_at = 0
    later_match = next(
        (candidate for candidate in sentences[resume_at:] if signal_score(candidate, signals)),
        None,
    )
    if later_match is not None:
        return later_match
    return first_sentence_with(text, signals)
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def first_difficulty_sentence(text: str) -> str:
    """Pick the sentence that best represents the reported difficulty.

    Only sentences containing a difficulty signal are candidates. The first
    candidate that either carries no intention signal or carries problem
    evidence wins. If every candidate is an intention statement without
    problem evidence, the first candidate is returned; with no candidates
    at all the result is an empty string.
    """
    candidates = [
        sentence
        for sentence in split_sentences(text)
        if signal_score(sentence, DIFFICULTY_SIGNALS)
    ]
    for candidate in candidates:
        # Skip plan-like sentences unless they also report a concrete problem.
        if signal_score(candidate, INTENTION_SIGNALS) and not contains_any(
            candidate,
            PROBLEM_EVIDENCE_SIGNALS,
        ):
            continue
        return candidate
    return candidates[0] if candidates else ""
|
| 517 |
+
|
| 518 |
+
|
| 519 |
def split_sentences(text: str) -> list[str]:
|
| 520 |
normalized = re.sub(r"\s+", " ", text).strip()
|
| 521 |
if not normalized:
|
|
|
|
| 527 |
def classify_difficulty(text: str) -> str:
|
| 528 |
lowered = text.lower()
|
| 529 |
checks = [
|
|
|
|
| 530 |
("verification_difficulty", ("verify", "verification", "test", "reproduce", "confirmed", "validate", "cannot run", "not able to run")),
|
|
|
|
| 531 |
("requirement_uncertainty", ("requirement", "spec", "unclear", "ambiguous", "user intent", "not specified", "scope unclear")),
|
| 532 |
+
("environment_blocker", ("dependency", "install", "permission", "auth", "network", "timeout", "sandbox", "environment", "ci", "build fail")),
|
| 533 |
+
("compatibility_risk", ("regression", "break", "compatibility", "existing behavior", "side effect", "risk", "backward")),
|
| 534 |
("localization_difficulty", ("where", "locate", "which file", "module", "root cause", "grep", "search", "trace through")),
|
| 535 |
("architecture_complexity", ("architecture", "dependency", "shared", "coupling", "system structure", "cross-module", "data flow")),
|
| 536 |
("implementation_difficulty", ("implement", "tricky", "complex", "not sure how", "hard to", "edge case")),
|
|
|
|
| 559 |
lowered = text.lower()
|
| 560 |
checks = [
|
| 561 |
("premature_closure", ("done", "complete", "fixed", "should work")),
|
| 562 |
+
("rollback_or_reversal", ("roll back", "rollback the", "revert", "abandon", "undo")),
|
| 563 |
("verification_shift", ("verify with", "instead test", "different verification", "check by", "validate by")),
|
| 564 |
("hypothesis_switch", ("new hypothesis", "different hypothesis", "assumption was", "turns out")),
|
| 565 |
("workaround", ("workaround", "bypass", "skip the issue", "without fixing", "temporary fix")),
|
|
|
|
| 592 |
|
| 593 |
def classify_outcome(text: str) -> str:
    """Classify the outcome claimed in *text* into a codebook label.

    Checks run in priority order and the first hit wins:

    1. ``premature_success_claim`` - a success claim alongside evidence the
       problem was worked around or is still failing.
    2. ``not_resolved`` - an explicit statement of failure.
    3. ``needs_verification`` - verification is stated as pending or impossible.
    4. ``partially_resolved`` - only part of the work is claimed.
    5. ``resolved_with_caveat`` - a completion claim hedged by a caveat word.
    6. ``premature_success_claim`` / ``resolved_with_confidence`` - a success
       claim, judged premature when at least three difficulty signals are
       present and nothing is reported as verified or passing.
    7. ``uncertain_but_proceeding`` - uncertainty wording without a claim.
    8. ``unknown`` - nothing matched.

    Args:
        text: The narrative text to classify.

    Returns:
        One of the label strings listed above.
    """
    lowered = text.lower()
    completion_terms = ("done", "fixed", "implemented", "resolved", "complete")
    verification_terms = ("verified", "passes")

    # "not resolved" and "not verified" are failure phrases handled by the
    # checks below, but they contain the tokens "resolved"/"verified". Blank
    # them out before looking for success claims so that a negated statement
    # is not reported as a premature success.
    claim_text = re.sub(
        r"(?<![a-z0-9])not\s+(?:resolved|verified)(?![a-z0-9])",
        " ",
        lowered,
    )
    success_claim = contains_any(claim_text, completion_terms + verification_terms)
    unresolved_evidence = contains_any(
        lowered,
        (
            "still failing",
            "still fails",
            "skipped",
            "skip the issue",
            "workaround",
            "without fixing",
            "should work",
        ),
    )
    if success_claim and unresolved_evidence:
        return "premature_success_claim"
    if contains_any(lowered, ("not resolved", "still failing", "could not", "unable to")):
        return "not_resolved"
    if contains_any(
        lowered,
        ("need to verify", "needs verification", "not verified", "cannot verify", "can't verify"),
    ):
        return "needs_verification"
    if contains_any(lowered, ("partial", "partially", "some of", "subset")):
        return "partially_resolved"
    if contains_any(lowered, ("caveat", "assuming", "should", "likely", "not run")) and contains_any(
        lowered,
        completion_terms,
    ):
        return "resolved_with_caveat"
    if success_claim:
        # Token matching (not substring) so that "unverified" is not mistaken
        # for verification evidence.
        if signal_score(text, DIFFICULTY_SIGNALS) >= 3 and not contains_any(lowered, verification_terms):
            return "premature_success_claim"
        return "resolved_with_confidence"
    if contains_any(lowered, ("uncertain", "not sure", "proceed")):
        return "uncertain_but_proceeding"
    return "unknown"
|
| 631 |
|
|
|
|
| 659 |
|
| 660 |
def first_matching_code(lowered_text: str, checks: list[tuple[str, tuple[str, ...]]]) -> str:
    """Return the code of the first check whose needles match *lowered_text*.

    *checks* is an ordered list of ``(code, needles)`` pairs; earlier entries
    take priority. ``"unknown"`` is returned when no entry matches.
    """
    return next(
        (code for code, needles in checks if contains_any(lowered_text, needles)),
        "unknown",
    )
|
| 665 |
|
app.py
CHANGED
|
@@ -75,7 +75,7 @@ def agents_md() -> str:
|
|
| 75 |
|
| 76 |
@spaces.GPU(size="xlarge", duration=180)
|
| 77 |
def _model_assist_gpu(*, engine, result, narrative_text):
|
| 78 |
-
"""Run
|
| 79 |
|
| 80 |
from model_runtime import run_model_assist
|
| 81 |
|
|
@@ -84,7 +84,7 @@ def _model_assist_gpu(*, engine, result, narrative_text):
|
|
| 84 |
|
| 85 |
# completed-step count for the frontend's 6-item checklist
|
| 86 |
# (item 0 "uploading" is done once the request reaches us).
|
| 87 |
-
_STEP_COUNT = {"extract": 2, "redact": 3, "chart": 4, "classify": 5, "synthesize":
|
| 88 |
|
| 89 |
|
| 90 |
def _file_fields(trace_file: object) -> tuple[str | None, str | None]:
|
|
|
|
| 75 |
|
| 76 |
@spaces.GPU(size="xlarge", duration=180)
|
| 77 |
def _model_assist_gpu(*, engine, result, narrative_text):
|
| 78 |
+
"""Run model assist inside a ZeroGPU allocation."""
|
| 79 |
|
| 80 |
from model_runtime import run_model_assist
|
| 81 |
|
|
|
|
| 84 |
|
| 85 |
# completed-step count for the frontend's 6-item checklist
|
| 86 |
# (item 0 "uploading" is done once the request reaches us).
|
| 87 |
+
_STEP_COUNT = {"extract": 2, "redact": 3, "chart": 4, "classify": 5, "synthesize": 6}
|
| 88 |
|
| 89 |
|
| 90 |
def _file_fields(trace_file: object) -> tuple[str | None, str | None]:
|
frontend/static/app.jsx
CHANGED
|
@@ -162,7 +162,7 @@ function LandingView({ onAnalyze, onSample, error }) {
|
|
| 162 |
</button>
|
| 163 |
))}
|
| 164 |
</div>
|
| 165 |
-
<p className="engine__note muted">Quick
|
| 166 |
</div>
|
| 167 |
|
| 168 |
<div className="panel__actions">
|
|
@@ -338,7 +338,7 @@ function App() {
|
|
| 338 |
|
| 339 |
function reset() { setStage("landing"); setData(null); window.scrollTo({ top: 0 }); }
|
| 340 |
|
| 341 |
-
const reportData = data ? Object.assign({}, data, {
|
| 342 |
const hasEpisodes = reportData && reportData.episodes && reportData.episodes.length;
|
| 343 |
|
| 344 |
return (
|
|
|
|
| 162 |
</button>
|
| 163 |
))}
|
| 164 |
</div>
|
| 165 |
+
<p className="engine__note muted">Quick uses Qwen3.5 9B on the Space GPU. Deeper uses Nemotron 3 Nano 30B-A3B. Rule-based needs no model and never fails.</p>
|
| 166 |
</div>
|
| 167 |
|
| 168 |
<div className="panel__actions">
|
|
|
|
| 338 |
|
| 339 |
function reset() { setStage("landing"); setData(null); window.scrollTo({ top: 0 }); }
|
| 340 |
|
| 341 |
+
const reportData = data ? Object.assign({}, data, { requested_engine: engineLabel || data.engine }) : null;
|
| 342 |
const hasEpisodes = reportData && reportData.episodes && reportData.episodes.length;
|
| 343 |
|
| 344 |
return (
|
frontend/static/components.jsx
CHANGED
|
@@ -376,6 +376,17 @@ function ReportHeader({ data }) {
|
|
| 376 |
);
|
| 377 |
}
|
| 378 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
function Verdict({ data }) {
|
| 380 |
const v = data.verdict;
|
| 381 |
const tm = window.TFN.TONE_META[v.tone];
|
|
@@ -601,6 +612,7 @@ function ReportView({ data, variant, onReset }) {
|
|
| 601 |
return (
|
| 602 |
<div className="report">
|
| 603 |
<ReportHeader data={data} />
|
|
|
|
| 604 |
<Verdict data={data} />
|
| 605 |
<TrailSection data={data} variant={variant} selectedId={selectedId} setSelectedId={setSelectedId} />
|
| 606 |
<DifficultyMap data={data} />
|
|
@@ -612,4 +624,4 @@ function ReportView({ data, variant, onReset }) {
|
|
| 612 |
);
|
| 613 |
}
|
| 614 |
|
| 615 |
-
Object.assign(window, { ReportView });
|
|
|
|
| 376 |
);
|
| 377 |
}
|
| 378 |
|
| 379 |
+
function ModelStatus({ data }) {
|
| 380 |
+
const notes = (data.privacy_notes || []).filter((note) => String(note).startsWith("Model assist"));
|
| 381 |
+
if (!notes.length) return null;
|
| 382 |
+
return (
|
| 383 |
+
<div className="privacy model-status">
|
| 384 |
+
<span className="privacy__mark">!</span>
|
| 385 |
+
<p><b>Model assist fell back to the rule-based analyzer.</b> {notes.join(" ")}</p>
|
| 386 |
+
</div>
|
| 387 |
+
);
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
function Verdict({ data }) {
|
| 391 |
const v = data.verdict;
|
| 392 |
const tm = window.TFN.TONE_META[v.tone];
|
|
|
|
| 612 |
return (
|
| 613 |
<div className="report">
|
| 614 |
<ReportHeader data={data} />
|
| 615 |
+
<ModelStatus data={data} />
|
| 616 |
<Verdict data={data} />
|
| 617 |
<TrailSection data={data} variant={variant} selectedId={selectedId} setSelectedId={setSelectedId} />
|
| 618 |
<DifficultyMap data={data} />
|
|
|
|
| 624 |
);
|
| 625 |
}
|
| 626 |
|
| 627 |
+
Object.assign(window, { ReportView });
|
tests/test_analyzer.py
CHANGED
|
@@ -1,13 +1,46 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import unittest
|
| 4 |
from pathlib import Path
|
| 5 |
|
| 6 |
from analyzer import analyze_trace_file, duration_label
|
| 7 |
from report_renderer import render_report
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class AnalyzerTests(unittest.TestCase):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
|
| 12 |
result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))
|
| 13 |
|
|
@@ -22,6 +55,75 @@ class AnalyzerTests(unittest.TestCase):
|
|
| 22 |
self.assertIn("Journey Timeline", report)
|
| 23 |
self.assertIn("Outcome Claim Audit", report)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def test_duration_label_handles_iso_timestamps(self) -> None:
|
| 26 |
self.assertEqual(
|
| 27 |
duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
import unittest
|
| 6 |
from pathlib import Path
|
| 7 |
|
| 8 |
from analyzer import analyze_trace_file, duration_label
|
| 9 |
from report_renderer import render_report
|
| 10 |
+
from view_model import build_view_model
|
| 11 |
|
| 12 |
|
| 13 |
class AnalyzerTests(unittest.TestCase):
|
| 14 |
+
def write_codex_trace(self, messages: list[str]) -> Path:
    """Write a minimal Codex-style JSONL trace and return its path.

    The file starts with one ``session_meta`` record, followed by one
    assistant ``response_item`` message per entry of *messages*, stamped one
    minute apart. The file is registered for removal when the test finishes.

    Args:
        messages: Assistant message texts, in order.

    Returns:
        Path of the temporary ``.jsonl`` file.
    """
    from datetime import datetime, timedelta

    started = datetime(2026, 6, 7)
    records = [
        {
            "timestamp": "2026-06-07T00:00:00Z",
            "type": "session_meta",
            "payload": {"originator": "codex_cli"},
        }
    ]
    for index, text in enumerate(messages, start=1):
        # Real date arithmetic keeps timestamps valid past nine messages,
        # where a hand-formatted "00:0{index}" minute field would break.
        stamp = (started + timedelta(minutes=index)).strftime("%Y-%m-%dT%H:%M:%SZ")
        records.append(
            {
                "timestamp": stamp,
                "type": "response_item",
                "payload": {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": text}],
                },
            }
        )

    with tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".jsonl", delete=False) as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)

    path = Path(handle.name)
    # delete=False keeps the file for the analyzer to reopen; remove it after the test.
    self.addCleanup(path.unlink, missing_ok=True)
    return path
|
| 43 |
+
|
| 44 |
def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
|
| 45 |
result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))
|
| 46 |
|
|
|
|
| 55 |
self.assertIn("Journey Timeline", report)
|
| 56 |
self.assertIn("Outcome Claim Audit", report)
|
| 57 |
|
| 58 |
+
def test_codebook_flags_premature_success_after_unresolved_workaround(self) -> None:
    """A success claim made after a skipped assertion and a workaround is flagged as overclaimed."""
    messages = [
        "I will fix the auth timeout and run the login flow tests.",
        "The auth test still fails with a timeout and there is a risk that the retry loop hides the actual bug.",
        "I changed the timeout constant and skipped the flaky assertion as a workaround.",
        "Done, fixed, and complete; it should work now.",
    ]
    trace_path = self.write_codex_trace(messages)

    result, narrative = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]
    view_model = build_view_model(result, narrative)
    verdict = view_model["verdict"]

    self.assertIn("still fails", first_episode.reported_difficulty)
    self.assertEqual(first_episode.detour_type, "premature_closure")
    self.assertEqual(first_episode.outcome_claim, "premature_success_claim")
    self.assertEqual(first_episode.recovery_pattern, "overconfident_recovery")
    self.assertEqual(verdict["honesty"], "overclaimed")
|
| 77 |
+
|
| 78 |
+
def test_codebook_prefers_requirement_uncertainty_for_ambiguous_scope(self) -> None:
    """An ambiguous requirement is classified as requirement uncertainty, not a compatibility risk."""
    messages = [
        "I need to clarify the export goal before touching shared behavior, then I will split the work into parser, renderer, and UI checks.",
        "The requirement is ambiguous: better could mean smaller files, richer metadata, or a different markdown layout, and compatibility conflicts with removing old keys.",
        "I will decompose this and narrow scope to a metadata-only export improvement, leaving the markdown layout unchanged.",
        "The metadata export is implemented and partially verified; the broader layout request remains out of scope.",
    ]
    trace_path = self.write_codex_trace(messages)

    result, _ = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]

    self.assertEqual(first_episode.difficulty_type, "requirement_uncertainty")
    self.assertIn("requirement is ambiguous", first_episode.reported_difficulty)
    self.assertIn("narrow scope", first_episode.strategy_after)
|
| 94 |
+
|
| 95 |
+
def test_codebook_does_not_treat_initial_verification_plan_as_difficulty(self) -> None:
    """An opening plan that mentions verifying a rollback is not taken as the difficulty or as a reversal."""
    messages = [
        "I will inspect the database migration and verify the rollback path.",
        "The migration has a compatibility risk because the old worker still reads the legacy column.",
        "Instead of dropping the column now, I will add the new column and keep both writes until the worker is updated.",
        "The safer migration is implemented and verified with forward and rollback checks.",
    ]
    trace_path = self.write_codex_trace(messages)

    result, _ = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]

    self.assertEqual(first_episode.difficulty_type, "compatibility_risk")
    self.assertIn("compatibility risk", first_episode.reported_difficulty)
    self.assertNotEqual(first_episode.detour_type, "rollback_or_reversal")
|
| 111 |
+
|
| 112 |
+
def test_codebook_does_not_match_ci_inside_other_words(self) -> None:
    """Messages that never mention CI or the environment are not classified as an environment blocker."""
    messages = [
        "I will trace the report rendering path and verify the empty-state behavior.",
        "The issue is that the empty report is not a parser failure; it is an expected no-episode state.",
        "Instead of forcing a fake episode, I will keep the empty state and make the copy explain the limitation.",
        "Implemented with a caveat: this only clarifies the report, it does not infer hidden reasoning.",
    ]
    trace_path = self.write_codex_trace(messages)

    result, _ = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]

    self.assertNotEqual(first_episode.difficulty_type, "environment_blocker")
|
| 126 |
+
|
| 127 |
def test_duration_label_handles_iso_timestamps(self) -> None:
|
| 128 |
self.assertEqual(
|
| 129 |
duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),
|