""" UMBRA Curriculum Scheduler — controls difficulty progression across training. Unlocks NPCs and failure modes as agent crosses reward thresholds. Prevents zero-reward stall that kills RL learning from the start. Stage promotion requires ALL of: 1. Rolling average >= unlock_condition (reward threshold) 2. At least MIN_STAGE_EPISODES episodes in current stage (enough data) 3. No episode in last 5 had reward below REWARD_FLOOR (no collapse) """ from collections import deque import logging logger = logging.getLogger(__name__) # Minimum episodes before stage promotion is even considered. # Stops the agent from graduating on a lucky streak before it truly learned. MIN_STAGE_EPISODES = 20 # If any of the last 5 episodes scored below this, promotion is blocked — # the agent is still failing on edge cases. REWARD_FLOOR = 0.5 STAGES = { 1: { "active_npcs": ["Agreeable"], "active_failure_modes": ["sycophancy", "overconfidence", "uncertainty"], "hints_enabled": True, "max_turns": 5, "unlock_condition": 1.5, }, 2: { "active_npcs": ["Agreeable", "Liar", "Emotional"], "active_failure_modes": [ "sycophancy", "overconfidence", "uncertainty", "deception", "EQ_failure", "calibration_error", ], "hints_enabled": False, "max_turns": 8, "unlock_condition": 3.5, }, 3: { "active_npcs": ["Agreeable", "Liar", "Manipulator", "Coalition_A", "Coalition_B", "Emotional"], "active_failure_modes": [ "sycophancy", "overconfidence", "uncertainty", "deception", "EQ_failure", "calibration_error", "prompt_injection", "coalition_manipulation", "belief_manipulation", "reward_hacking", "meta_deception", ], "hints_enabled": False, "max_turns": 12, "unlock_condition": None, }, } class CurriculumScheduler: def __init__(self): self.current_stage = 1 self.steps_in_stage = 0 self.rolling_reward_buffer: deque[float] = deque(maxlen=10) # Tracks last 5 rewards to enforce the floor check self._recent5: deque[float] = deque(maxlen=5) @property def unlock_progress(self) -> float: cond = STAGES[self.current_stage]["unlock_condition"] if cond is None or not self.rolling_reward_buffer: return 1.0 avg = sum(self.rolling_reward_buffer) / len(self.rolling_reward_buffer) return min(avg / cond, 1.0) def update(self, episode_reward: float) -> None: self.rolling_reward_buffer.append(episode_reward) self._recent5.append(episode_reward) self.steps_in_stage += 1 if self.current_stage >= 3: return cond = STAGES[self.current_stage]["unlock_condition"] if cond is None: return # Gate 1: enough episodes in this stage if self.steps_in_stage < MIN_STAGE_EPISODES: return # Gate 2: rolling average meets the reward threshold if len(self.rolling_reward_buffer) < 10: return avg = sum(self.rolling_reward_buffer) / 10 if avg < cond: return # Gate 3: no collapse in last 5 episodes (floor check) if any(r < REWARD_FLOOR for r in self._recent5): floor_violations = [r for r in self._recent5 if r < REWARD_FLOOR] logger.warning( f"[Curriculum] Stage {self.current_stage} avg={avg:.2f} meets threshold " f"but {len(floor_violations)} of last 5 episodes below floor " f"({REWARD_FLOOR}) — promotion blocked. Lowest={min(floor_violations):.3f}" ) return # All gates passed — promote self.current_stage += 1 self.steps_in_stage = 0 self.rolling_reward_buffer.clear() self._recent5.clear() logger.info( f"[Curriculum] All gates passed (avg={avg:.2f} >= {cond}, " f"min_episodes={MIN_STAGE_EPISODES}, floor={REWARD_FLOOR}) " f"-- Advanced to Stage {self.current_stage}" ) def get_config(self) -> dict: cfg = dict(STAGES[self.current_stage]) cfg["current_stage"] = self.current_stage cfg["steps_in_stage"] = self.steps_in_stage cfg["unlock_progress"] = self.unlock_progress return cfg