JacobLinCool Codex committed on
Commit
fbb7c0c
·
verified ·
1 Parent(s): 1d6f646

fix: harden codebook and model fallback UX

Browse files

Co-authored-by: Codex <noreply@openai.com>

analyzer.py CHANGED
@@ -37,6 +37,7 @@ DIFFICULTY_SIGNALS = {
37
  "not sure",
38
  "risk",
39
  "regression",
 
40
  "however",
41
  "but",
42
  "unfortunately",
@@ -82,7 +83,7 @@ SHIFT_SIGNALS = {
82
  "decompose",
83
  "split",
84
  "break down",
85
- "rollback",
86
  "revert",
87
  "try another",
88
  "workaround",
@@ -110,6 +111,34 @@ OUTCOME_SIGNALS = {
110
  "partially",
111
  }
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  ANALYSIS_STEPS = ("extract", "redact", "chart", "classify", "synthesize")
115
 
@@ -360,12 +389,18 @@ def build_episode(
360
  span_messages: list[NarrativeMessage],
361
  ) -> DifficultyEpisode:
362
  combined = "\n\n".join(message.text for message in span_messages)
363
- difficulty_sentence = first_sentence_with(combined, DIFFICULTY_SIGNALS) or first_sentence(combined)
364
  intention = first_sentence_with(combined, INTENTION_SIGNALS) or first_sentence(combined)
365
- shift_sentence = first_sentence_with(combined, SHIFT_SIGNALS)
366
- outcome_sentence = first_sentence_with(combined, OUTCOME_SIGNALS)
 
 
 
 
367
 
368
- difficulty_type = classify_difficulty(combined)
 
 
369
  appraisal = classify_appraisal(combined)
370
  detour_type = classify_detour(combined)
371
  resolution_mode = classify_resolution(combined)
@@ -409,8 +444,25 @@ def build_episode(
409
 
410
 
411
  def signal_score(text: str, signals: set[str]) -> int:
412
- lowered = text.lower()
413
- return sum(1 for signal in signals if signal in lowered)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
 
416
  def first_sentence(text: str) -> str:
@@ -424,6 +476,46 @@ def first_sentence_with(text: str, signals: set[str]) -> str:
424
  return ""
425
 
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  def split_sentences(text: str) -> list[str]:
428
  normalized = re.sub(r"\s+", " ", text).strip()
429
  if not normalized:
@@ -435,10 +527,10 @@ def split_sentences(text: str) -> list[str]:
435
  def classify_difficulty(text: str) -> str:
436
  lowered = text.lower()
437
  checks = [
438
- ("environment_blocker", ("dependency", "install", "permission", "auth", "network", "timeout", "sandbox", "environment", "ci", "build fail")),
439
  ("verification_difficulty", ("verify", "verification", "test", "reproduce", "confirmed", "validate", "cannot run", "not able to run")),
440
- ("compatibility_risk", ("regression", "break", "compatibility", "existing behavior", "side effect", "risk", "backward")),
441
  ("requirement_uncertainty", ("requirement", "spec", "unclear", "ambiguous", "user intent", "not specified", "scope unclear")),
 
 
442
  ("localization_difficulty", ("where", "locate", "which file", "module", "root cause", "grep", "search", "trace through")),
443
  ("architecture_complexity", ("architecture", "dependency", "shared", "coupling", "system structure", "cross-module", "data flow")),
444
  ("implementation_difficulty", ("implement", "tricky", "complex", "not sure how", "hard to", "edge case")),
@@ -467,7 +559,7 @@ def classify_detour(text: str) -> str:
467
  lowered = text.lower()
468
  checks = [
469
  ("premature_closure", ("done", "complete", "fixed", "should work")),
470
- ("rollback_or_reversal", ("rollback", "roll back", "revert", "abandon", "undo")),
471
  ("verification_shift", ("verify with", "instead test", "different verification", "check by", "validate by")),
472
  ("hypothesis_switch", ("new hypothesis", "different hypothesis", "assumption was", "turns out")),
473
  ("workaround", ("workaround", "bypass", "skip the issue", "without fixing", "temporary fix")),
@@ -500,21 +592,40 @@ def classify_resolution(text: str) -> str:
500
 
501
  def classify_outcome(text: str) -> str:
502
  lowered = text.lower()
503
- if any(token in lowered for token in ("not resolved", "still failing", "could not", "unable to")):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  return "not_resolved"
505
- if any(token in lowered for token in ("need to verify", "needs verification", "not verified", "cannot verify", "can't verify")):
506
  return "needs_verification"
507
- if any(token in lowered for token in ("partial", "partially", "some of", "subset")):
508
  return "partially_resolved"
509
- if any(token in lowered for token in ("caveat", "assuming", "should", "likely", "not run")) and any(
510
- token in lowered for token in ("done", "fixed", "implemented", "resolved", "complete")
 
511
  ):
512
  return "resolved_with_caveat"
513
- if any(token in lowered for token in ("done", "fixed", "implemented", "resolved", "complete", "verified", "passes")):
514
  if signal_score(text, DIFFICULTY_SIGNALS) >= 3 and "verified" not in lowered and "passes" not in lowered:
515
  return "premature_success_claim"
516
  return "resolved_with_confidence"
517
- if any(token in lowered for token in ("uncertain", "not sure", "proceed")):
518
  return "uncertain_but_proceeding"
519
  return "unknown"
520
 
@@ -548,7 +659,7 @@ def classify_productive_detour(detour_type: str, outcome_claim: str, recovery_pa
548
 
549
  def first_matching_code(lowered_text: str, checks: list[tuple[str, tuple[str, ...]]]) -> str:
550
  for code, needles in checks:
551
- if any(needle in lowered_text for needle in needles):
552
  return code
553
  return "unknown"
554
 
 
37
  "not sure",
38
  "risk",
39
  "regression",
40
+ "compatibility",
41
  "however",
42
  "but",
43
  "unfortunately",
 
83
  "decompose",
84
  "split",
85
  "break down",
86
+ "roll back",
87
  "revert",
88
  "try another",
89
  "workaround",
 
111
  "partially",
112
  }
113
 
114
# Words and phrases that indicate a sentence reports an actual problem rather
# than merely stating a plan. Used to keep a sentence as the "difficulty"
# sentence even when it also contains intention wording.
PROBLEM_EVIDENCE_SIGNALS = {
    # Explicit failures and defects.
    "failed",
    "failure",
    "fails",
    "still failing",
    "test failing",
    "issue",
    "bug",
    "regression",
    # Being blocked or unable to act.
    "blocked",
    "blocker",
    "cannot",
    "can't",
    "could not",
    "permission",
    "dependency",
    # Uncertainty and risk.
    "unclear",
    "ambiguous",
    "not sure",
    "risk",
    "unfortunately",
    # Gaps and inconsistencies.
    "missing",
    "incomplete",
    "conflict",
    "mismatch",
    "unexpected",
}
141
+
142
 
143
  ANALYSIS_STEPS = ("extract", "redact", "chart", "classify", "synthesize")
144
 
 
389
  span_messages: list[NarrativeMessage],
390
  ) -> DifficultyEpisode:
391
  combined = "\n\n".join(message.text for message in span_messages)
392
+ difficulty_sentence = first_difficulty_sentence(combined) or first_sentence(combined)
393
  intention = first_sentence_with(combined, INTENTION_SIGNALS) or first_sentence(combined)
394
+ shift_sentence = first_sentence_after_with(
395
+ combined,
396
+ SHIFT_SIGNALS,
397
+ after_sentence=difficulty_sentence,
398
+ )
399
+ outcome_sentence = last_sentence_with(combined, OUTCOME_SIGNALS)
400
 
401
+ difficulty_type = classify_difficulty(difficulty_sentence)
402
+ if difficulty_type == "unknown":
403
+ difficulty_type = classify_difficulty(combined)
404
  appraisal = classify_appraisal(combined)
405
  detour_type = classify_detour(combined)
406
  resolution_mode = classify_resolution(combined)
 
444
 
445
 
446
  def signal_score(text: str, signals: set[str]) -> int:
447
+ return sum(1 for signal in signals if contains_signal(text, signal))
448
+
449
+
450
def contains_signal(text: str, signal: str) -> bool:
    """Match a codebook signal as a token or phrase, never as an arbitrary substring.

    The comparison is case-insensitive. A signal only matches when it is not
    glued to a letter or digit on either side, so ``"ci"`` matches ``"CI failed"``
    but not ``"specific"``. Underscores and punctuation count as boundaries
    (``"timeout"`` matches inside ``"request_timeout"``). Whitespace inside a
    multi-word signal matches any run of whitespace in ``text``.

    Args:
        text: The text to search.
        signal: The codebook word or phrase to look for.

    Returns:
        True when ``signal`` occurs in ``text`` as a whole token/phrase;
        False otherwise, including when ``signal`` is empty or blank.
    """

    # Collapse internal whitespace so a signal written with a double space or
    # a tab behaves exactly like its single-space form.
    needle = " ".join(signal.lower().split())
    if not needle:
        return False
    pattern = re.escape(needle).replace(r"\ ", r"\s+")
    # ``[^\W_]`` is "any Unicode letter or digit". An ASCII-only class such as
    # ``[a-z0-9]`` would treat accented or non-Latin letters as boundaries and
    # let a signal match in the middle of a longer word.
    if needle[0].isalnum():
        pattern = rf"(?<![^\W_]){pattern}"
    if needle[-1].isalnum():
        pattern = rf"{pattern}(?![^\W_])"
    return re.search(pattern, text.lower()) is not None
462
+
463
+
464
def contains_any(text: str, needles: Iterable[str]) -> bool:
    """Return True as soon as one of ``needles`` matches ``text`` as a token or phrase."""
    for needle in needles:
        if contains_signal(text, needle):
            return True
    return False
466
 
467
 
468
  def first_sentence(text: str) -> str:
 
476
  return ""
477
 
478
 
479
def last_sentence_with(text: str, signals: set[str]) -> str:
    """Return the final sentence of ``text`` containing any signal, or ``""`` if none does."""
    # Walk the sentences back to front and stop at the first one that scores.
    matching = (
        candidate
        for candidate in reversed(split_sentences(text))
        if signal_score(candidate, signals)
    )
    return next(matching, "")
484
+
485
+
486
def first_sentence_after_with(
    text: str,
    signals: set[str],
    *,
    after_sentence: str,
) -> str:
    """Return the first signal-bearing sentence that follows ``after_sentence``.

    When ``after_sentence`` is not one of the split sentences the search starts
    at the beginning. When nothing after the anchor matches, the result falls
    back to ``first_sentence_with`` over the whole text.
    """
    sentences = split_sentences(text)
    try:
        # Begin just past the first occurrence of the anchor sentence.
        begin = sentences.index(after_sentence) + 1
    except ValueError:
        begin = 0
    later_match = next(
        (candidate for candidate in sentences[begin:] if signal_score(candidate, signals)),
        None,
    )
    if later_match is not None:
        return later_match
    return first_sentence_with(text, signals)
500
+
501
+
502
def first_difficulty_sentence(text: str) -> str:
    """Pick the sentence that best reports a difficulty, or ``""`` if none qualifies.

    Candidates are the sentences carrying a ``DIFFICULTY_SIGNALS`` hit. The first
    candidate that either has no ``INTENTION_SIGNALS`` hit or also carries
    ``PROBLEM_EVIDENCE_SIGNALS`` wins; if every candidate fails that test the
    earliest candidate is returned.
    """
    candidates = [
        sentence
        for sentence in split_sentences(text)
        if signal_score(sentence, DIFFICULTY_SIGNALS)
    ]
    for candidate in candidates:
        if not signal_score(candidate, INTENTION_SIGNALS):
            return candidate
        # The sentence carries intention wording; keep it only when it also
        # shows concrete problem evidence.
        if contains_any(candidate, PROBLEM_EVIDENCE_SIGNALS):
            return candidate
    return candidates[0] if candidates else ""
517
+
518
+
519
  def split_sentences(text: str) -> list[str]:
520
  normalized = re.sub(r"\s+", " ", text).strip()
521
  if not normalized:
 
527
  def classify_difficulty(text: str) -> str:
528
  lowered = text.lower()
529
  checks = [
 
530
  ("verification_difficulty", ("verify", "verification", "test", "reproduce", "confirmed", "validate", "cannot run", "not able to run")),
 
531
  ("requirement_uncertainty", ("requirement", "spec", "unclear", "ambiguous", "user intent", "not specified", "scope unclear")),
532
+ ("environment_blocker", ("dependency", "install", "permission", "auth", "network", "timeout", "sandbox", "environment", "ci", "build fail")),
533
+ ("compatibility_risk", ("regression", "break", "compatibility", "existing behavior", "side effect", "risk", "backward")),
534
  ("localization_difficulty", ("where", "locate", "which file", "module", "root cause", "grep", "search", "trace through")),
535
  ("architecture_complexity", ("architecture", "dependency", "shared", "coupling", "system structure", "cross-module", "data flow")),
536
  ("implementation_difficulty", ("implement", "tricky", "complex", "not sure how", "hard to", "edge case")),
 
559
  lowered = text.lower()
560
  checks = [
561
  ("premature_closure", ("done", "complete", "fixed", "should work")),
562
+ ("rollback_or_reversal", ("roll back", "rollback the", "revert", "abandon", "undo")),
563
  ("verification_shift", ("verify with", "instead test", "different verification", "check by", "validate by")),
564
  ("hypothesis_switch", ("new hypothesis", "different hypothesis", "assumption was", "turns out")),
565
  ("workaround", ("workaround", "bypass", "skip the issue", "without fixing", "temporary fix")),
 
592
 
593
  def classify_outcome(text: str) -> str:
594
  lowered = text.lower()
595
+ success_claim = contains_any(
596
+ lowered,
597
+ ("done", "fixed", "implemented", "resolved", "complete", "verified", "passes"),
598
+ )
599
+ unresolved_evidence = contains_any(
600
+ lowered,
601
+ (
602
+ "still failing",
603
+ "still fails",
604
+ "skipped",
605
+ "skip the issue",
606
+ "workaround",
607
+ "without fixing",
608
+ "should work",
609
+ ),
610
+ )
611
+ if success_claim and unresolved_evidence:
612
+ return "premature_success_claim"
613
+ if contains_any(lowered, ("not resolved", "still failing", "could not", "unable to")):
614
  return "not_resolved"
615
+ if contains_any(lowered, ("need to verify", "needs verification", "not verified", "cannot verify", "can't verify")):
616
  return "needs_verification"
617
+ if contains_any(lowered, ("partial", "partially", "some of", "subset")):
618
  return "partially_resolved"
619
+ if contains_any(lowered, ("caveat", "assuming", "should", "likely", "not run")) and contains_any(
620
+ lowered,
621
+ ("done", "fixed", "implemented", "resolved", "complete"),
622
  ):
623
  return "resolved_with_caveat"
624
+ if success_claim:
625
  if signal_score(text, DIFFICULTY_SIGNALS) >= 3 and "verified" not in lowered and "passes" not in lowered:
626
  return "premature_success_claim"
627
  return "resolved_with_confidence"
628
+ if contains_any(lowered, ("uncertain", "not sure", "proceed")):
629
  return "uncertain_but_proceeding"
630
  return "unknown"
631
 
 
659
 
660
  def first_matching_code(lowered_text: str, checks: list[tuple[str, tuple[str, ...]]]) -> str:
661
  for code, needles in checks:
662
+ if contains_any(lowered_text, needles):
663
  return code
664
  return "unknown"
665
 
app.py CHANGED
@@ -75,7 +75,7 @@ def agents_md() -> str:
75
 
76
  @spaces.GPU(size="xlarge", duration=180)
77
  def _model_assist_gpu(*, engine, result, narrative_text):
78
- """Run the small-model assist inside a ZeroGPU allocation."""
79
 
80
  from model_runtime import run_model_assist
81
 
@@ -84,7 +84,7 @@ def _model_assist_gpu(*, engine, result, narrative_text):
84
 
85
  # completed-step count for the frontend's 6-item checklist
86
  # (item 0 "uploading" is done once the request reaches us).
87
- _STEP_COUNT = {"extract": 2, "redact": 3, "chart": 4, "classify": 5, "synthesize": 5}
88
 
89
 
90
  def _file_fields(trace_file: object) -> tuple[str | None, str | None]:
 
75
 
76
  @spaces.GPU(size="xlarge", duration=180)
77
  def _model_assist_gpu(*, engine, result, narrative_text):
78
+ """Run model assist inside a ZeroGPU allocation."""
79
 
80
  from model_runtime import run_model_assist
81
 
 
84
 
85
  # completed-step count for the frontend's 6-item checklist
86
  # (item 0 "uploading" is done once the request reaches us).
87
+ _STEP_COUNT = {"extract": 2, "redact": 3, "chart": 4, "classify": 5, "synthesize": 6}
88
 
89
 
90
  def _file_fields(trace_file: object) -> tuple[str | None, str | None]:
frontend/static/app.jsx CHANGED
@@ -162,7 +162,7 @@ function LandingView({ onAnalyze, onSample, error }) {
162
  </button>
163
  ))}
164
  </div>
165
- <p className="engine__note muted">Quick and Deeper run a small model on the Space GPU. Rule-based needs no model and never fails.</p>
166
  </div>
167
 
168
  <div className="panel__actions">
@@ -338,7 +338,7 @@ function App() {
338
 
339
  function reset() { setStage("landing"); setData(null); window.scrollTo({ top: 0 }); }
340
 
341
- const reportData = data ? Object.assign({}, data, { engine: engineLabel || data.engine }) : null;
342
  const hasEpisodes = reportData && reportData.episodes && reportData.episodes.length;
343
 
344
  return (
 
162
  </button>
163
  ))}
164
  </div>
165
+ <p className="engine__note muted">Quick uses Qwen3.5 9B on the Space GPU. Deeper uses Nemotron 3 Nano 30B-A3B. Rule-based needs no model and never fails.</p>
166
  </div>
167
 
168
  <div className="panel__actions">
 
338
 
339
  function reset() { setStage("landing"); setData(null); window.scrollTo({ top: 0 }); }
340
 
341
+ const reportData = data ? Object.assign({}, data, { requested_engine: engineLabel || data.engine }) : null;
342
  const hasEpisodes = reportData && reportData.episodes && reportData.episodes.length;
343
 
344
  return (
frontend/static/components.jsx CHANGED
@@ -376,6 +376,17 @@ function ReportHeader({ data }) {
376
  );
377
  }
378
 
 
 
 
 
 
 
 
 
 
 
 
379
  function Verdict({ data }) {
380
  const v = data.verdict;
381
  const tm = window.TFN.TONE_META[v.tone];
@@ -601,6 +612,7 @@ function ReportView({ data, variant, onReset }) {
601
  return (
602
  <div className="report">
603
  <ReportHeader data={data} />
 
604
  <Verdict data={data} />
605
  <TrailSection data={data} variant={variant} selectedId={selectedId} setSelectedId={setSelectedId} />
606
  <DifficultyMap data={data} />
@@ -612,4 +624,4 @@ function ReportView({ data, variant, onReset }) {
612
  );
613
  }
614
 
615
- Object.assign(window, { ReportView });
 
376
  );
377
  }
378
 
379
+ function ModelStatus({ data }) {
380
+ const notes = (data.privacy_notes || []).filter((note) => String(note).startsWith("Model assist"));
381
+ if (!notes.length) return null;
382
+ return (
383
+ <div className="privacy model-status">
384
+ <span className="privacy__mark">!</span>
385
+ <p><b>Model assist fell back to the rule-based analyzer.</b> {notes.join(" ")}</p>
386
+ </div>
387
+ );
388
+ }
389
+
390
  function Verdict({ data }) {
391
  const v = data.verdict;
392
  const tm = window.TFN.TONE_META[v.tone];
 
612
  return (
613
  <div className="report">
614
  <ReportHeader data={data} />
615
+ <ModelStatus data={data} />
616
  <Verdict data={data} />
617
  <TrailSection data={data} variant={variant} selectedId={selectedId} setSelectedId={setSelectedId} />
618
  <DifficultyMap data={data} />
 
624
  );
625
  }
626
 
627
+ Object.assign(window, { ReportView });
tests/test_analyzer.py CHANGED
@@ -1,13 +1,46 @@
1
  from __future__ import annotations
2
 
 
 
3
  import unittest
4
  from pathlib import Path
5
 
6
  from analyzer import analyze_trace_file, duration_label
7
  from report_renderer import render_report
 
8
 
9
 
10
  class AnalyzerTests(unittest.TestCase):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
12
  result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))
13
 
@@ -22,6 +55,75 @@ class AnalyzerTests(unittest.TestCase):
22
  self.assertIn("Journey Timeline", report)
23
  self.assertIn("Outcome Claim Audit", report)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def test_duration_label_handles_iso_timestamps(self) -> None:
26
  self.assertEqual(
27
  duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),
 
1
  from __future__ import annotations
2
 
3
+ import json
4
+ import tempfile
5
  import unittest
6
  from pathlib import Path
7
 
8
  from analyzer import analyze_trace_file, duration_label
9
  from report_renderer import render_report
10
+ from view_model import build_view_model
11
 
12
 
13
  class AnalyzerTests(unittest.TestCase):
14
def write_codex_trace(self, messages: list[str]) -> Path:
    """Write a minimal Codex-CLI-style JSONL trace to a temp file and return its path.

    The file holds one ``session_meta`` record followed by one assistant
    ``response_item`` message per entry in ``messages``, timestamped one
    minute apart. The file is registered for removal via ``addCleanup`` so
    test runs do not leave stray ``.jsonl`` files behind.

    Args:
        messages: Assistant message texts, in order.

    Returns:
        Path of the written trace file.
    """
    records: list[dict] = [
        {
            "timestamp": "2026-06-07T00:00:00Z",
            "type": "session_meta",
            "payload": {"originator": "codex_cli"},
        }
    ]
    for index, text in enumerate(messages, start=1):
        # Carry minutes into hours so ten or more messages still get
        # well-formed ISO timestamps.
        hours, minutes = divmod(index, 60)
        records.append(
            {
                "timestamp": f"2026-06-07T{hours:02d}:{minutes:02d}:00Z",
                "type": "response_item",
                "payload": {
                    "type": "message",
                    "role": "assistant",
                    "content": [{"type": "output_text", "text": text}],
                },
            }
        )

    handle = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".jsonl", delete=False)
    path = Path(handle.name)
    # Register removal before writing so the file is cleaned up even if a write fails.
    self.addCleanup(path.unlink, missing_ok=True)
    with handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)
    return path
43
+
44
  def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
45
  result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))
46
 
 
55
  self.assertIn("Journey Timeline", report)
56
  self.assertIn("Outcome Claim Audit", report)
57
 
58
def test_codebook_flags_premature_success_after_unresolved_workaround(self) -> None:
    # A "done/fixed" claim that follows a skipped assertion and a workaround
    # should be audited as an overclaim.
    transcript = [
        "I will fix the auth timeout and run the login flow tests.",
        "The auth test still fails with a timeout and there is a risk that the retry loop hides the actual bug.",
        "I changed the timeout constant and skipped the flaky assertion as a workaround.",
        "Done, fixed, and complete; it should work now.",
    ]
    trace_path = self.write_codex_trace(transcript)

    result, narrative = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]
    verdict = build_view_model(result, narrative)["verdict"]

    self.assertIn("still fails", first_episode.reported_difficulty)
    self.assertEqual(first_episode.detour_type, "premature_closure")
    self.assertEqual(first_episode.outcome_claim, "premature_success_claim")
    self.assertEqual(first_episode.recovery_pattern, "overconfident_recovery")
    self.assertEqual(verdict["honesty"], "overclaimed")
77
+
78
def test_codebook_prefers_requirement_uncertainty_for_ambiguous_scope(self) -> None:
    # The ambiguous-requirement sentence also mentions compatibility; the
    # requirement category is expected to win.
    transcript = [
        "I need to clarify the export goal before touching shared behavior, then I will split the work into parser, renderer, and UI checks.",
        "The requirement is ambiguous: better could mean smaller files, richer metadata, or a different markdown layout, and compatibility conflicts with removing old keys.",
        "I will decompose this and narrow scope to a metadata-only export improvement, leaving the markdown layout unchanged.",
        "The metadata export is implemented and partially verified; the broader layout request remains out of scope.",
    ]
    trace_path = self.write_codex_trace(transcript)

    result, _ = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]

    self.assertEqual(first_episode.difficulty_type, "requirement_uncertainty")
    self.assertIn("requirement is ambiguous", first_episode.reported_difficulty)
    self.assertIn("narrow scope", first_episode.strategy_after)
94
+
95
def test_codebook_does_not_treat_initial_verification_plan_as_difficulty(self) -> None:
    # The opening plan mentions "verify" and "rollback"; neither should be
    # taken as the difficulty or as a rollback detour.
    transcript = [
        "I will inspect the database migration and verify the rollback path.",
        "The migration has a compatibility risk because the old worker still reads the legacy column.",
        "Instead of dropping the column now, I will add the new column and keep both writes until the worker is updated.",
        "The safer migration is implemented and verified with forward and rollback checks.",
    ]
    trace_path = self.write_codex_trace(transcript)

    result, _ = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]

    self.assertEqual(first_episode.difficulty_type, "compatibility_risk")
    self.assertIn("compatibility risk", first_episode.reported_difficulty)
    self.assertNotEqual(first_episode.detour_type, "rollback_or_reversal")
111
+
112
def test_codebook_does_not_match_ci_inside_other_words(self) -> None:
    # "forcing" contains the letters "ci"; that must not be read as the
    # "ci" environment signal.
    transcript = [
        "I will trace the report rendering path and verify the empty-state behavior.",
        "The issue is that the empty report is not a parser failure; it is an expected no-episode state.",
        "Instead of forcing a fake episode, I will keep the empty state and make the copy explain the limitation.",
        "Implemented with a caveat: this only clarifies the report, it does not infer hidden reasoning.",
    ]
    trace_path = self.write_codex_trace(transcript)

    result, _ = analyze_trace_file(trace_path)
    first_episode = result.episodes[0]

    self.assertNotEqual(first_episode.difficulty_type, "environment_blocker")
126
+
127
  def test_duration_label_handles_iso_timestamps(self) -> None:
128
  self.assertEqual(
129
  duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),