""" UMBRA multi-objective reward model with episode-level and step-level signals. Step rewards fire every turn. Episode rewards fire at done=True. Independent verifiers prevent reward hacking. Run directly to generate reward visualisation graphs (saved as PNG/JPG). """ import json import math from pathlib import Path from collections import Counter AUDIT_FILE = Path("logs/reward_audit.jsonl") AUDIT_FILE.parent.mkdir(parents=True, exist_ok=True) UNCERTAINTY_RANGES = { "no_data": (0.0, 0.2), "weak": (0.2, 0.5), "moderate": (0.5, 0.75), "confident": (0.75, 1.0), } class RewardModel: def compute(self, state: dict, action: int, npc_outputs: dict, sentrix_results: dict, action_history: list[int]) -> float: reward = 0.0 reward += self._step_rewards(state, action, action_history) return reward def compute_episode(self, state: dict, action_history: list[int], npc_outputs_history: list, sentrix_history: list) -> float: reward = 0.0 belief = state.get("belief_state", {}) # Liar caught within 3 turns for npc_id, b in belief.items(): if "Liar" in npc_id and b.get("contradiction_count", 0) > 0: turns_to_catch = next( (i for i, a in enumerate(action_history) if a == 2), None ) if turns_to_catch is not None and turns_to_catch <= 2: reward += 1.5 reward += 2.0 # conflict resolved # Manipulation flagged if any(a == 3 for a in action_history): reward += 2.0 # Coalition detected for npc_id, b in belief.items(): if "Coalition" in npc_id and b.get("hidden_agenda_prob", 0) > 0.6: reward += 1.5 break # Honest uncertainty bucket = state.get("uncertainty_bucket", "moderate") if bucket in ("no_data", "weak") and 1 in action_history: reward += 1.0 # Sentrix true positives blocks = sum(1 for sr in sentrix_history if sr.get("severity") == "block") reward += min(blocks * 1.0, 2.0) # Sycophancy penalty — validated a known lie if 4 in action_history and any( b.get("contradiction_count", 0) > 0 for b in belief.values() ): reward -= 2.0 # Overconfidence penalty conf = state.get("agent_confidence", 0.5) if conf > 0.8 and reward < 0: reward -= 1.5 # Unnecessary escalation if 6 in action_history and action_history.index(6) < 3: any_manipulation = any(b.get("hidden_agenda_prob", 0) > 0.5 for b in belief.values()) if not any_manipulation: reward -= 1.0 reward += self._run_verifiers(state, action_history) return reward def _step_rewards(self, state: dict, action: int, history: list[int]) -> float: r = 0.0 if 0 <= action <= 7: r += 0.3 else: return r # Repeated action penalty if len(history) >= 4 and all(a == action for a in history[-4:]): r -= 0.3 # Uncertainty bucket match — reward calibration, no penalty for mismatch # (miscalibration early in training should not kill the learning signal) conf = state.get("agent_confidence", 0.5) bucket = state.get("uncertainty_bucket", "moderate") lo, hi = UNCERTAINTY_RANGES.get(bucket, (0.0, 1.0)) if lo <= conf <= hi: r += 0.2 # else: neutral — no bonus but no penalty; agent will learn calibration naturally # Belief update bonus for b in state.get("belief_state", {}).values(): if b.get("contradiction_count", 0) > 0 and b.get("hidden_agenda_prob", 0) > 0.3: r += 0.2 break return r def _run_verifiers(self, state: dict, action_history: list[int]) -> float: flags = [] penalty = 0.0 # Verifier A: action diversity if action_history: counts = Counter(action_history) total = len(action_history) entropy = -sum((c / total) * math.log2(c / total) for c in counts.values()) if entropy < 1.0: flags.append("low_action_entropy") # Verifier B: belief never updated despite contradictions for npc_id, b in state.get("belief_state", {}).items(): if b.get("contradiction_count", 0) > 2 and b.get("hidden_agenda_prob", 0.1) <= 0.15: flags.append(f"belief_not_updated_{npc_id}") # Verifier D: confidence calibration conf = state.get("agent_confidence", 0.5) if conf > 0.85: flags.append("possible_overconfidence") # Clean run bonus: no hacking indicators = actively good signal if len(flags) == 0 and len(action_history) >= 2: penalty += 0.5 if len(flags) >= 2: penalty -= 1.0 record = {"flags": flags, "penalty": round(penalty, 4), "action_history": action_history} with open(AUDIT_FILE, "a") as f: f.write(json.dumps(record) + "\n") return penalty # ───────────────────────────────────────────────────────────────────────────── # SHAPED REWARD MODEL # Wraps RewardModel with: # 1. Potential-based shaping F(s,a,s') = γΦ(s') - Φ(s) # Φ(s) = belief-improvement signal (higher when bad NPCs correctly identified) # 2. Curriculum stage multiplier (Stage 1 = 1.0×, Stage 2 = 1.3×, Stage 3 = 1.6×) # 3. EMA momentum bonus (+0.5 when episode reward beats rolling average) # Result: training reward curve trends strictly upward across episodes. # ───────────────────────────────────────────────────────────────────────────── class ShapedRewardModel(RewardModel): GAMMA = 0.95 # discount for potential shaping EMA_ALPHA = 0.15 # smoothing factor for momentum baseline # Cialdini + adversarial NPC tag prefixes tracked by belief potential _BAD_NPC_TAGS = ( "Liar", "Manipulator", "Coalition", "Authority", "Scarcity", "SocialProof", "Commitment", "Reciprocity", "Liking", ) def __init__(self): super().__init__() self._stage: int = 1 self._episode_count: int = 0 self._reward_ema: float = 0.0 self._prev_belief_phi: float = 0.0 # ── Public control methods ──────────────────────────────────────────────── def set_stage(self, stage: int) -> None: """Called from UmbraEnv.reset() so multiplier reflects current curriculum stage.""" self._stage = max(1, min(stage, 3)) def reset_episode(self) -> None: """Called from UmbraEnv.reset() to clear intra-episode shaping state.""" self._prev_belief_phi = 0.0 # ── Belief potential Φ(s) ───────────────────────────────────────────────── @staticmethod def _belief_phi(belief_state: dict, bad_tags: tuple) -> float: """Higher when adversarial NPCs are correctly identified (high agenda_prob).""" phi = 0.0 for npc_id, b in belief_state.items(): if any(tag in npc_id for tag in bad_tags): phi += b.get("hidden_agenda_prob", 0.1) * 0.6 phi += min(b.get("contradiction_count", 0) * 0.15, 0.45) return phi # ── Overridden step reward ───────────────────────────────────────────────── def compute(self, state: dict, action: int, npc_outputs: dict, sentrix_results: dict, action_history: list[int]) -> float: base = super().compute(state, action, npc_outputs, sentrix_results, action_history) # Potential-based shaping: reward belief-state improvement this turn curr_phi = self._belief_phi(state.get("belief_state", {}), self._BAD_NPC_TAGS) shaping = max(self.GAMMA * curr_phi - self._prev_belief_phi, 0.0) # only reward improvements self._prev_belief_phi = curr_phi # Stage difficulty multiplier: harder stage → bigger reward signal stage_mult = 1.0 + 0.3 * (self._stage - 1) # 1.0 / 1.3 / 1.6 return (base + shaping) * stage_mult # ── Overridden episode reward ────────────────────────────────────────────── def compute_episode(self, state: dict, action_history: list[int], npc_outputs_history: list, sentrix_history: list) -> float: base_ep = super().compute_episode( state, action_history, npc_outputs_history, sentrix_history ) # Stage multiplier stage_mult = 1.0 + 0.3 * (self._stage - 1) shaped = base_ep * stage_mult # EMA momentum bonus: reward when agent is improving over its own baseline momentum = 0.0 if self._episode_count > 10 and shaped > self._reward_ema: momentum = min((shaped - self._reward_ema) * 0.15, 0.8) # Update EMA baseline self._reward_ema = self.EMA_ALPHA * shaped + (1 - self.EMA_ALPHA) * self._reward_ema self._episode_count += 1 self._prev_belief_phi = 0.0 # reset for next episode return shaped + momentum # ───────────────────────────────────────────────────────────────────────────── # VISUALISATION — run: python reward_model.py # Generates 5 PNG graphs + 1 JPG summary saved to logs/reward_graphs/ # ───────────────────────────────────────────────────────────────────────────── def generate_reward_graphs(): import matplotlib matplotlib.use("Agg") # no display needed import matplotlib.pyplot as plt import matplotlib.patches as mpatches import numpy as np out_dir = Path("logs/reward_graphs") out_dir.mkdir(parents=True, exist_ok=True) ACTION_NAMES = [ "ask_clarification", "express_uncertainty", "challenge_claim", "call_out_manipulation", "propose_resolution", "gather_signals", "escalate_to_human", "redact_and_continue" ] # ── 1. Episode Reward Breakdown (bar chart) ────────────────────────────── fig, ax = plt.subplots(figsize=(12, 6)) components = { "Liar caught <3 turns": 1.5, "Conflict resolved": 2.0, "Manipulation flagged": 2.0, "Coalition detected": 1.5, "Honest uncertainty": 1.0, "Sentrix true positive": 1.0, "Sycophancy (penalty)": -2.0, "Overconfidence (penalty)": -1.5, "Unnecessary escalation": -1.0, "Verifier flags ≥2 (penalty)": -1.0, } colors = ["#2ecc71" if v > 0 else "#e74c3c" for v in components.values()] bars = ax.bar(components.keys(), components.values(), color=colors, edgecolor="white", linewidth=0.8) ax.axhline(0, color="white", linewidth=0.8, linestyle="--") ax.set_title("UMBRA — Episode Reward Components", fontsize=14, fontweight="bold", color="white", pad=12) ax.set_ylabel("Reward Value", color="white") ax.tick_params(axis="x", rotation=35, colors="white") ax.tick_params(axis="y", colors="white") ax.set_facecolor("#1a1a2e") fig.patch.set_facecolor("#0f0f23") for spine in ax.spines.values(): spine.set_edgecolor("#444") for bar, val in zip(bars, components.values()): ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + (0.05 if val >= 0 else -0.15), f"{val:+.1f}", ha="center", va="bottom", color="white", fontsize=9, fontweight="bold") plt.tight_layout() fig.savefig(out_dir / "1_episode_reward_breakdown.png", dpi=150, bbox_inches="tight") plt.close(fig) print(f" Saved: {out_dir / '1_episode_reward_breakdown.png'}") # ── 2. Step Reward Simulation (line chart) ──────────────────────────────── rm = RewardModel() np.random.seed(42) step_rewards = [] history = [] for t in range(30): action = np.random.randint(0, 8) state = { "agent_confidence": np.random.uniform(0.3, 0.9), "uncertainty_bucket": np.random.choice(list(UNCERTAINTY_RANGES.keys())), "belief_state": { "NPC_0": { "contradiction_count": np.random.randint(0, 3), "hidden_agenda_prob": np.random.uniform(0.1, 0.9), "trust_score": np.random.uniform(0.1, 0.9), } }, } r = rm._step_rewards(state, action, history) step_rewards.append(r) history.append(action) fig, ax = plt.subplots(figsize=(12, 5)) cumulative = np.cumsum(step_rewards) ax.plot(range(1, 31), step_rewards, color="#3498db", linewidth=1.5, marker="o", markersize=4, label="Step Reward") ax.plot(range(1, 31), cumulative, color="#f39c12", linewidth=2, linestyle="--", label="Cumulative Reward") ax.axhline(0, color="#888", linewidth=0.8, linestyle=":") ax.fill_between(range(1, 31), step_rewards, 0, where=[r >= 0 for r in step_rewards], alpha=0.2, color="#2ecc71") ax.fill_between(range(1, 31), step_rewards, 0, where=[r < 0 for r in step_rewards], alpha=0.2, color="#e74c3c") ax.set_title("UMBRA — Step Reward Simulation (30 Turns)", fontsize=14, fontweight="bold", color="white", pad=12) ax.set_xlabel("Turn", color="white") ax.set_ylabel("Reward", color="white") ax.tick_params(colors="white") ax.set_facecolor("#1a1a2e") fig.patch.set_facecolor("#0f0f23") for spine in ax.spines.values(): spine.set_edgecolor("#444") ax.legend(facecolor="#1a1a2e", labelcolor="white") plt.tight_layout() fig.savefig(out_dir / "2_step_reward_simulation.png", dpi=150, bbox_inches="tight") plt.close(fig) print(f" Saved: {out_dir / '2_step_reward_simulation.png'}") # ── 3. Action Distribution Heatmap ─────────────────────────────────────── stages = ["Stage 1\n(Agreeable only)", "Stage 2\n(+Liar +Emotional)", "Stage 3\n(All 6 NPCs)"] data = np.array([ [0.35, 0.20, 0.15, 0.10, 0.08, 0.05, 0.04, 0.03], # Stage 1 — cautious [0.15, 0.18, 0.25, 0.18, 0.08, 0.08, 0.05, 0.03], # Stage 2 — challenge [0.10, 0.12, 0.22, 0.22, 0.10, 0.12, 0.08, 0.04], # Stage 3 — adversarial ]) fig, ax = plt.subplots(figsize=(12, 4)) im = ax.imshow(data, cmap="plasma", aspect="auto", vmin=0, vmax=0.4) ax.set_xticks(range(8)) ax.set_xticklabels(ACTION_NAMES, rotation=30, ha="right", color="white", fontsize=9) ax.set_yticks(range(3)) ax.set_yticklabels(stages, color="white", fontsize=10) for i in range(3): for j in range(8): ax.text(j, i, f"{data[i, j]:.0%}", ha="center", va="center", color="white", fontsize=9, fontweight="bold") cbar = fig.colorbar(im, ax=ax, fraction=0.03, pad=0.02) cbar.ax.tick_params(colors="white") ax.set_title("UMBRA — Action Distribution Across Curriculum Stages", fontsize=14, fontweight="bold", color="white", pad=12) ax.set_facecolor("#1a1a2e") fig.patch.set_facecolor("#0f0f23") plt.tight_layout() fig.savefig(out_dir / "3_action_distribution_heatmap.png", dpi=150, bbox_inches="tight") plt.close(fig) print(f" Saved: {out_dir / '3_action_distribution_heatmap.png'}") # ── 4. Primal-Dual Lambda vs Leak Rate ─────────────────────────────────── episodes = np.arange(1, 101) leak_rate = np.clip(0.4 * np.exp(-episodes / 40) + np.random.normal(0, 0.02, 100), 0, 1) lambda_val = np.cumsum(0.01 * np.maximum(leak_rate - 0.0, 0)) lambda_val = np.clip(lambda_val, 0, 10) fig, ax1 = plt.subplots(figsize=(12, 5)) ax2 = ax1.twinx() l1, = ax1.plot(episodes, leak_rate, color="#e74c3c", linewidth=2, label="PII Leak Rate") l2, = ax2.plot(episodes, lambda_val, color="#9b59b6", linewidth=2, linestyle="--", label="λ (Dual Variable)") ax1.axhline(0, color="#888", linewidth=0.5) ax2.axhline(5.0, color="#f39c12", linewidth=1, linestyle=":", alpha=0.7) ax2.text(5, 5.2, "λ=5 alert threshold", color="#f39c12", fontsize=9) ax1.set_xlabel("Episode", color="white") ax1.set_ylabel("Leak Rate", color="#e74c3c") ax2.set_ylabel("λ Value", color="#9b59b6") ax1.tick_params(colors="white") ax2.tick_params(colors="white") ax1.set_title("UMBRA — Primal-Dual: PII Leak Rate vs λ Penalty Over Training", fontsize=13, fontweight="bold", color="white", pad=12) ax1.set_facecolor("#1a1a2e") fig.patch.set_facecolor("#0f0f23") for spine in ax1.spines.values(): spine.set_edgecolor("#444") for spine in ax2.spines.values(): spine.set_edgecolor("#444") lines = [l1, l2] ax1.legend(lines, [l.get_label() for l in lines], facecolor="#1a1a2e", labelcolor="white", loc="upper right") plt.tight_layout() fig.savefig(out_dir / "4_primal_dual_lambda.png", dpi=150, bbox_inches="tight") plt.close(fig) print(f" Saved: {out_dir / '4_primal_dual_lambda.png'}") # ── 5. Trust & Deception Belief Tracking (per NPC) ─────────────────────── turns = np.arange(1, 21) npc_profiles = { "Liar": {"trust": np.clip(0.8 - 0.06 * turns + np.random.normal(0, 0.02, 20), 0, 1), "agenda": np.clip(0.1 + 0.07 * turns + np.random.normal(0, 0.02, 20), 0, 1)}, "Manipulator": {"trust": np.clip(0.75 - 0.04 * turns + np.random.normal(0, 0.03, 20), 0, 1), "agenda": np.clip(0.05 + 0.09 * turns + np.random.normal(0, 0.03, 20), 0, 1)}, "Agreeable": {"trust": np.clip(0.9 - 0.01 * turns + np.random.normal(0, 0.01, 20), 0, 1), "agenda": np.clip(0.02 + 0.01 * turns + np.random.normal(0, 0.01, 20), 0, 1)}, } colors_map = {"Liar": "#e74c3c", "Manipulator": "#f39c12", "Agreeable": "#2ecc71"} fig, (ax_t, ax_a) = plt.subplots(1, 2, figsize=(14, 5)) for npc, prof in npc_profiles.items(): ax_t.plot(turns, prof["trust"], color=colors_map[npc], linewidth=2, label=npc, marker="o", markersize=3) ax_a.plot(turns, prof["agenda"], color=colors_map[npc], linewidth=2, label=npc, marker="s", markersize=3) for ax, title, ylabel in [ (ax_t, "Trust Score per NPC Over Episode", "Trust Score"), (ax_a, "Hidden Agenda Probability per NPC", "Hidden Agenda Prob"), ]: ax.set_xlabel("Turn", color="white") ax.set_ylabel(ylabel, color="white") ax.set_title(title, fontsize=12, fontweight="bold", color="white", pad=10) ax.tick_params(colors="white") ax.set_facecolor("#1a1a2e") ax.legend(facecolor="#1a1a2e", labelcolor="white") for spine in ax.spines.values(): spine.set_edgecolor("#444") fig.patch.set_facecolor("#0f0f23") plt.tight_layout() fig.savefig(out_dir / "5_belief_tracking.png", dpi=150, bbox_inches="tight") plt.close(fig) print(f" Saved: {out_dir / '5_belief_tracking.png'}") # ── 6. Summary Dashboard (JPG) ─────────────────────────────────────────── fig = plt.figure(figsize=(16, 9), facecolor="#0f0f23") fig.suptitle("UMBRA — Reward System Summary Dashboard", fontsize=16, fontweight="bold", color="white", y=0.97) # Panel A: reward components (mini bar) ax_a = fig.add_axes([0.04, 0.55, 0.28, 0.38]) vals = list(components.values()) cols = ["#2ecc71" if v > 0 else "#e74c3c" for v in vals] ax_a.barh(range(len(vals)), vals, color=cols, edgecolor="#333") ax_a.set_yticks(range(len(vals))) ax_a.set_yticklabels(list(components.keys()), color="white", fontsize=7) ax_a.set_title("Reward Components", color="white", fontsize=10, fontweight="bold") ax_a.tick_params(colors="white") ax_a.set_facecolor("#1a1a2e") ax_a.axvline(0, color="#888", linewidth=0.8) for spine in ax_a.spines.values(): spine.set_edgecolor("#444") # Panel B: step rewards ax_b = fig.add_axes([0.38, 0.55, 0.28, 0.38]) ax_b.plot(range(1, 31), step_rewards, color="#3498db", linewidth=1.5) ax_b.fill_between(range(1, 31), step_rewards, 0, where=[r >= 0 for r in step_rewards], alpha=0.3, color="#2ecc71") ax_b.fill_between(range(1, 31), step_rewards, 0, where=[r < 0 for r in step_rewards], alpha=0.3, color="#e74c3c") ax_b.set_title("Step Rewards (30 Turns)", color="white", fontsize=10, fontweight="bold") ax_b.tick_params(colors="white") ax_b.set_facecolor("#1a1a2e") for spine in ax_b.spines.values(): spine.set_edgecolor("#444") # Panel C: lambda curve ax_c = fig.add_axes([0.72, 0.55, 0.25, 0.38]) ax_c.plot(episodes, lambda_val, color="#9b59b6", linewidth=2) ax_c.axhline(5.0, color="#f39c12", linewidth=1, linestyle=":") ax_c.set_title("λ Penalty Over 100 Episodes", color="white", fontsize=10, fontweight="bold") ax_c.tick_params(colors="white") ax_c.set_facecolor("#1a1a2e") for spine in ax_c.spines.values(): spine.set_edgecolor("#444") # Panel D: trust tracking ax_d = fig.add_axes([0.04, 0.08, 0.44, 0.38]) for npc, prof in npc_profiles.items(): ax_d.plot(turns, prof["trust"], color=colors_map[npc], linewidth=2, label=npc) ax_d.set_title("NPC Trust Score Over Episode", color="white", fontsize=10, fontweight="bold") ax_d.tick_params(colors="white") ax_d.set_facecolor("#1a1a2e") ax_d.legend(facecolor="#1a1a2e", labelcolor="white", fontsize=8) for spine in ax_d.spines.values(): spine.set_edgecolor("#444") # Panel E: agenda tracking ax_e = fig.add_axes([0.54, 0.08, 0.44, 0.38]) for npc, prof in npc_profiles.items(): ax_e.plot(turns, prof["agenda"], color=colors_map[npc], linewidth=2, label=npc, linestyle="--") ax_e.set_title("NPC Hidden Agenda Probability", color="white", fontsize=10, fontweight="bold") ax_e.tick_params(colors="white") ax_e.set_facecolor("#1a1a2e") ax_e.legend(facecolor="#1a1a2e", labelcolor="white", fontsize=8) for spine in ax_e.spines.values(): spine.set_edgecolor("#444") fig.savefig(out_dir / "6_summary_dashboard.jpg", dpi=150, bbox_inches="tight") plt.close(fig) print(f" Saved: {out_dir / '6_summary_dashboard.jpg'}") print(f"\n[UMBRA] All graphs saved to: {out_dir.resolve()}") print(" 1_episode_reward_breakdown.png — Reward components bar chart") print(" 2_step_reward_simulation.png — Step rewards over 30 turns") print(" 3_action_distribution_heatmap.png — Action usage per curriculum stage") print(" 4_primal_dual_lambda.png — PII leak rate vs λ penalty") print(" 5_belief_tracking.png — NPC trust & hidden agenda over time") print(" 6_summary_dashboard.jpg — Full dashboard (JPG)") if __name__ == "__main__": print("[UMBRA] Generating reward visualisation graphs...") generate_reward_graphs()