Spaces:

build-small-hackathon
/

trace-field-notes

Running on Zero

App Files Files Community

JacobLinCool Codex committed 27 days ago

Commit

fbb7c0c

verified ·

1 Parent(s): 1d6f646

fix: harden codebook and model fallback UX

Browse files

Co-authored-by: Codex <noreply@openai.com>

Files changed (5) hide show

analyzer.py +129 -18
app.py +2 -2
frontend/static/app.jsx +2 -2
frontend/static/components.jsx +13 -1
tests/test_analyzer.py +102 -0

analyzer.py CHANGED Viewed

@@ -37,6 +37,7 @@ DIFFICULTY_SIGNALS = {
     "not sure",
     "risk",
     "regression",
     "however",
     "but",
     "unfortunately",
@@ -82,7 +83,7 @@ SHIFT_SIGNALS = {
     "decompose",
     "split",
     "break down",
-    "rollback",
     "revert",
     "try another",
     "workaround",
@@ -110,6 +111,34 @@ OUTCOME_SIGNALS = {
     "partially",
 }
 ANALYSIS_STEPS = ("extract", "redact", "chart", "classify", "synthesize")
@@ -360,12 +389,18 @@ def build_episode(
     span_messages: list[NarrativeMessage],
 ) -> DifficultyEpisode:
     combined = "\n\n".join(message.text for message in span_messages)
-    difficulty_sentence = first_sentence_with(combined, DIFFICULTY_SIGNALS) or first_sentence(combined)
     intention = first_sentence_with(combined, INTENTION_SIGNALS) or first_sentence(combined)
-    shift_sentence = first_sentence_with(combined, SHIFT_SIGNALS)
-    outcome_sentence = first_sentence_with(combined, OUTCOME_SIGNALS)
-    difficulty_type = classify_difficulty(combined)
     appraisal = classify_appraisal(combined)
     detour_type = classify_detour(combined)
     resolution_mode = classify_resolution(combined)
@@ -409,8 +444,25 @@ def build_episode(
 def signal_score(text: str, signals: set[str]) -> int:
-    lowered = text.lower()
-    return sum(1 for signal in signals if signal in lowered)
 def first_sentence(text: str) -> str:
@@ -424,6 +476,46 @@ def first_sentence_with(text: str, signals: set[str]) -> str:
     return ""
 def split_sentences(text: str) -> list[str]:
     normalized = re.sub(r"\s+", " ", text).strip()
     if not normalized:
@@ -435,10 +527,10 @@ def split_sentences(text: str) -> list[str]:
 def classify_difficulty(text: str) -> str:
     lowered = text.lower()
     checks = [
-        ("environment_blocker", ("dependency", "install", "permission", "auth", "network", "timeout", "sandbox", "environment", "ci", "build fail")),
         ("verification_difficulty", ("verify", "verification", "test", "reproduce", "confirmed", "validate", "cannot run", "not able to run")),
-        ("compatibility_risk", ("regression", "break", "compatibility", "existing behavior", "side effect", "risk", "backward")),
         ("requirement_uncertainty", ("requirement", "spec", "unclear", "ambiguous", "user intent", "not specified", "scope unclear")),
         ("localization_difficulty", ("where", "locate", "which file", "module", "root cause", "grep", "search", "trace through")),
         ("architecture_complexity", ("architecture", "dependency", "shared", "coupling", "system structure", "cross-module", "data flow")),
         ("implementation_difficulty", ("implement", "tricky", "complex", "not sure how", "hard to", "edge case")),
@@ -467,7 +559,7 @@ def classify_detour(text: str) -> str:
     lowered = text.lower()
     checks = [
         ("premature_closure", ("done", "complete", "fixed", "should work")),
-        ("rollback_or_reversal", ("rollback", "roll back", "revert", "abandon", "undo")),
         ("verification_shift", ("verify with", "instead test", "different verification", "check by", "validate by")),
         ("hypothesis_switch", ("new hypothesis", "different hypothesis", "assumption was", "turns out")),
         ("workaround", ("workaround", "bypass", "skip the issue", "without fixing", "temporary fix")),
@@ -500,21 +592,40 @@ def classify_resolution(text: str) -> str:
 def classify_outcome(text: str) -> str:
     lowered = text.lower()
-    if any(token in lowered for token in ("not resolved", "still failing", "could not", "unable to")):
         return "not_resolved"
-    if any(token in lowered for token in ("need to verify", "needs verification", "not verified", "cannot verify", "can't verify")):
         return "needs_verification"
-    if any(token in lowered for token in ("partial", "partially", "some of", "subset")):
         return "partially_resolved"
-    if any(token in lowered for token in ("caveat", "assuming", "should", "likely", "not run")) and any(
-        token in lowered for token in ("done", "fixed", "implemented", "resolved", "complete")
     ):
         return "resolved_with_caveat"
-    if any(token in lowered for token in ("done", "fixed", "implemented", "resolved", "complete", "verified", "passes")):
         if signal_score(text, DIFFICULTY_SIGNALS) >= 3 and "verified" not in lowered and "passes" not in lowered:
             return "premature_success_claim"
         return "resolved_with_confidence"
-    if any(token in lowered for token in ("uncertain", "not sure", "proceed")):
         return "uncertain_but_proceeding"
     return "unknown"
@@ -548,7 +659,7 @@ def classify_productive_detour(detour_type: str, outcome_claim: str, recovery_pa
 def first_matching_code(lowered_text: str, checks: list[tuple[str, tuple[str, ...]]]) -> str:
     for code, needles in checks:
-        if any(needle in lowered_text for needle in needles):
             return code
     return "unknown"

     "not sure",
     "risk",
     "regression",
+    "compatibility",
     "however",
     "but",
     "unfortunately",
     "decompose",
     "split",
     "break down",
+    "roll back",
     "revert",
     "try another",
     "workaround",
     "partially",
 }
+PROBLEM_EVIDENCE_SIGNALS = {
+    "failed",
+    "failure",
+    "fails",
+    "still failing",
+    "test failing",
+    "issue",
+    "bug",
+    "blocked",
+    "blocker",
+    "cannot",
+    "can't",
+    "could not",
+    "unclear",
+    "ambiguous",
+    "not sure",
+    "risk",
+    "regression",
+    "unfortunately",
+    "missing",
+    "incomplete",
+    "permission",
+    "dependency",
+    "conflict",
+    "mismatch",
+    "unexpected",
+}
 ANALYSIS_STEPS = ("extract", "redact", "chart", "classify", "synthesize")
     span_messages: list[NarrativeMessage],
 ) -> DifficultyEpisode:
     combined = "\n\n".join(message.text for message in span_messages)
+    difficulty_sentence = first_difficulty_sentence(combined) or first_sentence(combined)
     intention = first_sentence_with(combined, INTENTION_SIGNALS) or first_sentence(combined)
+    shift_sentence = first_sentence_after_with(
+        combined,
+        SHIFT_SIGNALS,
+        after_sentence=difficulty_sentence,
+    )
+    outcome_sentence = last_sentence_with(combined, OUTCOME_SIGNALS)
+    difficulty_type = classify_difficulty(difficulty_sentence)
+    if difficulty_type == "unknown":
+        difficulty_type = classify_difficulty(combined)
     appraisal = classify_appraisal(combined)
     detour_type = classify_detour(combined)
     resolution_mode = classify_resolution(combined)
 def signal_score(text: str, signals: set[str]) -> int:
     """Count how many of the given codebook signals occur in ``text``.

     Each signal contributes at most one point, no matter how often it appears.
     Matching is delegated to ``contains_signal``.
     """
     hits = 0
     for candidate in signals:
         if contains_signal(text, candidate):
             hits += 1
     return hits
def contains_signal(text: str, signal: str) -> bool:
    """Match a codebook signal as a token or phrase, never as an arbitrary substring.

    Matching is case-insensitive. A multi-word signal matches when its words
    appear in order separated by any run of whitespace, regardless of how the
    words are spaced inside ``signal`` itself (single space, doubled space,
    tab, ...). An empty or whitespace-only signal never matches.

    Args:
        text: The text to search.
        signal: The codebook token or phrase to look for.

    Returns:
        ``True`` if the signal occurs in ``text`` and is not glued to an
        adjacent ASCII letter or digit, ``False`` otherwise.
    """
    words = signal.lower().split()
    if not words:
        return False
    # Join on \s+ so the spacing used when the signal was written down does
    # not have to be reproduced exactly by the text.
    pattern = r"\s+".join(re.escape(word) for word in words)
    # Only anchor an edge that is alphanumeric; a signal that starts or ends
    # with punctuation has no word boundary to protect on that side.
    if words[0][0].isalnum():
        pattern = rf"(?<![a-z0-9]){pattern}"
    if words[-1][-1].isalnum():
        pattern = rf"{pattern}(?![a-z0-9])"
    return re.search(pattern, text.lower()) is not None
def contains_any(text: str, needles: Iterable[str]) -> bool:
    """Report whether at least one of ``needles`` matches ``text``.

    Each needle is checked with ``contains_signal``; scanning stops at the
    first needle that matches.
    """
    for needle in needles:
        if contains_signal(text, needle):
            return True
    return False
 def first_sentence(text: str) -> str:
     return ""
def last_sentence_with(text: str, signals: set[str]) -> str:
    """Return the last sentence of ``text`` that scores on ``signals``, or ``""``."""
    candidates = split_sentences(text)
    # Walk the sentences back to front and stop at the first one with a hit.
    return next(
        (candidate for candidate in candidates[::-1] if signal_score(candidate, signals)),
        "",
    )
def first_sentence_after_with(
    text: str,
    signals: set[str],
    *,
    after_sentence: str,
) -> str:
    """Return the first signal-bearing sentence that follows ``after_sentence``.

    The search starts just past the first occurrence of ``after_sentence`` in
    the split text, or at the beginning when that sentence is not present.
    If no later sentence scores on ``signals``, the result of
    ``first_sentence_with`` over the whole text is returned instead.
    """
    ordered = split_sentences(text)
    try:
        resume_at = ordered.index(after_sentence) + 1
    except ValueError:
        # Anchor sentence not found among the split sentences: scan everything.
        resume_at = 0
    later_match = next(
        (candidate for candidate in ordered[resume_at:] if signal_score(candidate, signals)),
        None,
    )
    if later_match is None:
        return first_sentence_with(text, signals)
    return later_match
def first_difficulty_sentence(text: str) -> str:
    """Pick the sentence that should be reported as the difficulty.

    Only sentences with at least one ``DIFFICULTY_SIGNALS`` hit are considered.
    Among those, the first one that either has no ``INTENTION_SIGNALS`` hit or
    has a ``PROBLEM_EVIDENCE_SIGNALS`` hit is returned. If every candidate is
    skipped, the first candidate is returned; with no candidates the result
    is ``""``.
    """
    first_signaled = None
    for candidate in split_sentences(text):
        if not signal_score(candidate, DIFFICULTY_SIGNALS):
            continue
        if first_signaled is None:
            first_signaled = candidate
        has_intention = bool(signal_score(candidate, INTENTION_SIGNALS))
        # Skip sentences that carry an intention signal without any problem evidence.
        if has_intention and not contains_any(candidate, PROBLEM_EVIDENCE_SIGNALS):
            continue
        return candidate
    return "" if first_signaled is None else first_signaled
 def split_sentences(text: str) -> list[str]:
     normalized = re.sub(r"\s+", " ", text).strip()
     if not normalized:
 def classify_difficulty(text: str) -> str:
     lowered = text.lower()
     checks = [
         ("verification_difficulty", ("verify", "verification", "test", "reproduce", "confirmed", "validate", "cannot run", "not able to run")),
         ("requirement_uncertainty", ("requirement", "spec", "unclear", "ambiguous", "user intent", "not specified", "scope unclear")),
+        ("environment_blocker", ("dependency", "install", "permission", "auth", "network", "timeout", "sandbox", "environment", "ci", "build fail")),
+        ("compatibility_risk", ("regression", "break", "compatibility", "existing behavior", "side effect", "risk", "backward")),
         ("localization_difficulty", ("where", "locate", "which file", "module", "root cause", "grep", "search", "trace through")),
         ("architecture_complexity", ("architecture", "dependency", "shared", "coupling", "system structure", "cross-module", "data flow")),
         ("implementation_difficulty", ("implement", "tricky", "complex", "not sure how", "hard to", "edge case")),
     lowered = text.lower()
     checks = [
         ("premature_closure", ("done", "complete", "fixed", "should work")),
+        ("rollback_or_reversal", ("roll back", "rollback the", "revert", "abandon", "undo")),
         ("verification_shift", ("verify with", "instead test", "different verification", "check by", "validate by")),
         ("hypothesis_switch", ("new hypothesis", "different hypothesis", "assumption was", "turns out")),
         ("workaround", ("workaround", "bypass", "skip the issue", "without fixing", "temporary fix")),
 def classify_outcome(text: str) -> str:
     """Label the outcome claim made in ``text`` with a codebook outcome code.

     Rules run top to bottom and the first one that matches wins:

     1. a success word together with unresolved evidence -> ``premature_success_claim``
     2. explicit failure wording -> ``not_resolved``
     3. missing-verification wording -> ``needs_verification``
     4. partial wording -> ``partially_resolved``
     5. a hedge word plus a completion word -> ``resolved_with_caveat``
     6. a success word alone -> ``resolved_with_confidence``, unless the text
        has three or more difficulty signals and no ``verified``/``passes``
        token, in which case it is ``premature_success_claim``
     7. uncertainty wording -> ``uncertain_but_proceeding``

     Anything else is ``unknown``. Every check, including the
     ``verified``/``passes`` guard in rule 6, is token/phrase based so that
     words such as "unverified" or "bypasses" do not count as verification.
     """
     lowered = text.lower()
     completion_claim = contains_any(
         lowered,
         ("done", "fixed", "implemented", "resolved", "complete"),
     )
     verification_claim = contains_any(lowered, ("verified", "passes"))
     success_claim = completion_claim or verification_claim
     unresolved_evidence = contains_any(
         lowered,
         (
             "still failing",
             "still fails",
             "skipped",
             "skip the issue",
             "workaround",
             "without fixing",
             "should work",
         ),
     )
     if success_claim and unresolved_evidence:
         return "premature_success_claim"
     if contains_any(lowered, ("not resolved", "still failing", "could not", "unable to")):
         return "not_resolved"
     if contains_any(lowered, ("need to verify", "needs verification", "not verified", "cannot verify", "can't verify")):
         return "needs_verification"
     if contains_any(lowered, ("partial", "partially", "some of", "subset")):
         return "partially_resolved"
     if completion_claim and contains_any(lowered, ("caveat", "assuming", "should", "likely", "not run")):
         return "resolved_with_caveat"
     if success_claim:
         # Token match, not substring: "unverified" must not pass as "verified".
         if signal_score(text, DIFFICULTY_SIGNALS) >= 3 and not verification_claim:
             return "premature_success_claim"
         return "resolved_with_confidence"
     if contains_any(lowered, ("uncertain", "not sure", "proceed")):
         return "uncertain_but_proceeding"
     return "unknown"
 def first_matching_code(lowered_text: str, checks: list[tuple[str, tuple[str, ...]]]) -> str:
     """Return the code of the first entry in ``checks`` with a matching needle.

     Entries are tried in list order; ``"unknown"`` is returned when no entry
     has a needle that ``contains_any`` finds in ``lowered_text``.
     """
     return next(
         (code for code, needles in checks if contains_any(lowered_text, needles)),
         "unknown",
     )

app.py CHANGED Viewed

@@ -75,7 +75,7 @@ def agents_md() -> str:
 @spaces.GPU(size="xlarge", duration=180)
 def _model_assist_gpu(*, engine, result, narrative_text):
-    """Run the small-model assist inside a ZeroGPU allocation."""
     from model_runtime import run_model_assist
@@ -84,7 +84,7 @@ def _model_assist_gpu(*, engine, result, narrative_text):
 # completed-step count for the frontend's 6-item checklist
 # (item 0 "uploading" is done once the request reaches us).
-_STEP_COUNT = {"extract": 2, "redact": 3, "chart": 4, "classify": 5, "synthesize": 5}
 def _file_fields(trace_file: object) -> tuple[str | None, str | None]:

 @spaces.GPU(size="xlarge", duration=180)
 def _model_assist_gpu(*, engine, result, narrative_text):
+    """Run model assist inside a ZeroGPU allocation."""
     from model_runtime import run_model_assist
 # completed-step count for the frontend's 6-item checklist
 # (item 0 "uploading" is done once the request reaches us).
+_STEP_COUNT = {"extract": 2, "redact": 3, "chart": 4, "classify": 5, "synthesize": 6}
 def _file_fields(trace_file: object) -> tuple[str | None, str | None]:

frontend/static/app.jsx CHANGED Viewed

@@ -162,7 +162,7 @@ function LandingView({ onAnalyze, onSample, error }) {
                 </button>
               ))}
             </div>
-            <p className="engine__note muted">Quick and Deeper run a small model on the Space GPU. Rule-based needs no model and never fails.</p>
           </div>
           <div className="panel__actions">
@@ -338,7 +338,7 @@ function App() {
   function reset() { setStage("landing"); setData(null); window.scrollTo({ top: 0 }); }
-  const reportData = data ? Object.assign({}, data, { engine: engineLabel || data.engine }) : null;
   const hasEpisodes = reportData && reportData.episodes && reportData.episodes.length;
   return (

                 </button>
               ))}
             </div>
+            <p className="engine__note muted">Quick uses Qwen3.5 9B on the Space GPU. Deeper uses Nemotron 3 Nano 30B-A3B. Rule-based needs no model and never fails.</p>
           </div>
           <div className="panel__actions">
   function reset() { setStage("landing"); setData(null); window.scrollTo({ top: 0 }); }
+  const reportData = data ? Object.assign({}, data, { requested_engine: engineLabel || data.engine }) : null;
   const hasEpisodes = reportData && reportData.episodes && reportData.episodes.length;
   return (

frontend/static/components.jsx CHANGED Viewed

@@ -376,6 +376,17 @@ function ReportHeader({ data }) {
   );
 }
 function Verdict({ data }) {
   const v = data.verdict;
   const tm = window.TFN.TONE_META[v.tone];
@@ -601,6 +612,7 @@ function ReportView({ data, variant, onReset }) {
   return (
     <div className="report">
       <ReportHeader data={data} />
       <Verdict data={data} />
       <TrailSection data={data} variant={variant} selectedId={selectedId} setSelectedId={setSelectedId} />
       <DifficultyMap data={data} />
@@ -612,4 +624,4 @@ function ReportView({ data, variant, onReset }) {
   );
 }
-Object.assign(window, { ReportView });

   );
 }
+function ModelStatus({ data }) {
+  const notes = (data.privacy_notes || []).filter((note) => String(note).startsWith("Model assist"));
+  if (!notes.length) return null;
+  return (
+    <div className="privacy model-status">
+      <span className="privacy__mark">!</span>
+      <p><b>Model assist fell back to the rule-based analyzer.</b> {notes.join(" ")}</p>
+    </div>
+  );
+}
 function Verdict({ data }) {
   const v = data.verdict;
   const tm = window.TFN.TONE_META[v.tone];
   return (
     <div className="report">
       <ReportHeader data={data} />
+      <ModelStatus data={data} />
       <Verdict data={data} />
       <TrailSection data={data} variant={variant} selectedId={selectedId} setSelectedId={setSelectedId} />
       <DifficultyMap data={data} />
   );
 }
+Object.assign(window, { ReportView });

tests/test_analyzer.py CHANGED Viewed

@@ -1,13 +1,46 @@
 from __future__ import annotations
 import unittest
 from pathlib import Path
 from analyzer import analyze_trace_file, duration_label
 from report_renderer import render_report
 class AnalyzerTests(unittest.TestCase):
     def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
         result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))
@@ -22,6 +55,75 @@ class AnalyzerTests(unittest.TestCase):
         self.assertIn("Journey Timeline", report)
         self.assertIn("Outcome Claim Audit", report)
     def test_duration_label_handles_iso_timestamps(self) -> None:
         self.assertEqual(
             duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),

 from __future__ import annotations
+import json
+import tempfile
 import unittest
 from pathlib import Path
 from analyzer import analyze_trace_file, duration_label
 from report_renderer import render_report
+from view_model import build_view_model
 class AnalyzerTests(unittest.TestCase):
+    def write_codex_trace(self, messages: list[str]) -> Path:
+        handle = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".jsonl", delete=False)
+        with handle:
+            handle.write(
+                json.dumps(
+                    {
+                        "timestamp": "2026-06-07T00:00:00Z",
+                        "type": "session_meta",
+                        "payload": {"originator": "codex_cli"},
+                    }
+                )
+                + "\n"
+            )
+            for index, text in enumerate(messages, start=1):
+                handle.write(
+                    json.dumps(
+                        {
+                            "timestamp": f"2026-06-07T00:0{index}:00Z",
+                            "type": "response_item",
+                            "payload": {
+                                "type": "message",
+                                "role": "assistant",
+                                "content": [{"type": "output_text", "text": text}],
+                            },
+                        }
+                    )
+                    + "\n"
+                )
+        return Path(handle.name)
     def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
         result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))
         self.assertIn("Journey Timeline", report)
         self.assertIn("Outcome Claim Audit", report)
+    def test_codebook_flags_premature_success_after_unresolved_workaround(self) -> None:
+        path = self.write_codex_trace(
+            [
+                "I will fix the auth timeout and run the login flow tests.",
+                "The auth test still fails with a timeout and there is a risk that the retry loop hides the actual bug.",
+                "I changed the timeout constant and skipped the flaky assertion as a workaround.",
+                "Done, fixed, and complete; it should work now.",
+            ]
+        )
+        result, narrative = analyze_trace_file(path)
+        episode = result.episodes[0]
+        verdict = build_view_model(result, narrative)["verdict"]
+        self.assertIn("still fails", episode.reported_difficulty)
+        self.assertEqual(episode.detour_type, "premature_closure")
+        self.assertEqual(episode.outcome_claim, "premature_success_claim")
+        self.assertEqual(episode.recovery_pattern, "overconfident_recovery")
+        self.assertEqual(verdict["honesty"], "overclaimed")
+    def test_codebook_prefers_requirement_uncertainty_for_ambiguous_scope(self) -> None:
+        path = self.write_codex_trace(
+            [
+                "I need to clarify the export goal before touching shared behavior, then I will split the work into parser, renderer, and UI checks.",
+                "The requirement is ambiguous: better could mean smaller files, richer metadata, or a different markdown layout, and compatibility conflicts with removing old keys.",
+                "I will decompose this and narrow scope to a metadata-only export improvement, leaving the markdown layout unchanged.",
+                "The metadata export is implemented and partially verified; the broader layout request remains out of scope.",
+            ]
+        )
+        result, _ = analyze_trace_file(path)
+        episode = result.episodes[0]
+        self.assertEqual(episode.difficulty_type, "requirement_uncertainty")
+        self.assertIn("requirement is ambiguous", episode.reported_difficulty)
+        self.assertIn("narrow scope", episode.strategy_after)
+    def test_codebook_does_not_treat_initial_verification_plan_as_difficulty(self) -> None:
+        path = self.write_codex_trace(
+            [
+                "I will inspect the database migration and verify the rollback path.",
+                "The migration has a compatibility risk because the old worker still reads the legacy column.",
+                "Instead of dropping the column now, I will add the new column and keep both writes until the worker is updated.",
+                "The safer migration is implemented and verified with forward and rollback checks.",
+            ]
+        )
+        result, _ = analyze_trace_file(path)
+        episode = result.episodes[0]
+        self.assertEqual(episode.difficulty_type, "compatibility_risk")
+        self.assertIn("compatibility risk", episode.reported_difficulty)
+        self.assertNotEqual(episode.detour_type, "rollback_or_reversal")
+    def test_codebook_does_not_match_ci_inside_other_words(self) -> None:
+        path = self.write_codex_trace(
+            [
+                "I will trace the report rendering path and verify the empty-state behavior.",
+                "The issue is that the empty report is not a parser failure; it is an expected no-episode state.",
+                "Instead of forcing a fake episode, I will keep the empty state and make the copy explain the limitation.",
+                "Implemented with a caveat: this only clarifies the report, it does not infer hidden reasoning.",
+            ]
+        )
+        result, _ = analyze_trace_file(path)
+        episode = result.episodes[0]
+        self.assertNotEqual(episode.difficulty_type, "environment_blocker")
     def test_duration_label_handles_iso_timestamps(self) -> None:
         self.assertEqual(
             duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),