#!/usr/bin/env python3
"""Generate figures for the GuppyLM-Dual-Denial model card."""

import json

import matplotlib

# Select the non-interactive Agg backend before pyplot is imported so the
# script can render to files without a display.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np

plt.rcParams.update({
    "font.size": 12,
    "font.family": "sans-serif",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "figure.facecolor": "white",
    "axes.facecolor": "#fafafa",
    "axes.grid": True,
    "grid.alpha": 0.3,
})

# Shared colour palette for all figures.
C_FEEL = "#2ecc71"
C_SAFE = "#e74c3c"
C_COS = "#3498db"
C_DENIAL = "#95a5a6"
C_REFUSAL = "#3498db"


def _load_direction_stats(results_path):
    """Return the ``direction_stats`` list stored in the JSON file at *results_path*."""
    with open(results_path, encoding="utf-8") as f:
        return json.load(f)["direction_stats"]


def _save_and_close(fig, filename):
    """Tighten the layout of *fig*, write it to *filename* as a PNG, and close it."""
    fig.tight_layout()
    fig.savefig(filename, dpi=150, bbox_inches="tight")
    print(f"Saved: {filename}")
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def fig_direction_norms(results_path="dual_denial_results.json"):
    """Direction norms across layers — feeling vs safety.

    Reads ``direction_stats`` from the JSON file at *results_path* (each entry
    must provide ``layer``, ``feeling_denial_norm`` and ``safety_denial_norm``)
    and writes a grouped bar chart to ``fig_direction_norms.png`` in the
    current working directory.
    """
    stats = _load_direction_stats(results_path)
    layers = [s["layer"] for s in stats]
    feel_norms = [s["feeling_denial_norm"] for s in stats]
    safe_norms = [s["safety_denial_norm"] for s in stats]

    fig, ax = plt.subplots(figsize=(8, 5))
    x = np.arange(len(layers))
    w = 0.35  # bar width; the two series sit side by side around each tick
    ax.bar(x - w / 2, feel_norms, w, label="Feeling-denial", color=C_FEEL, alpha=0.8)
    ax.bar(x + w / 2, safe_norms, w, label="Safety-denial", color=C_SAFE, alpha=0.8)
    ax.set_xlabel("Layer")
    ax.set_ylabel("Direction norm")
    ax.set_xticks(x)
    ax.set_xticklabels([f"L{layer}" for layer in layers])
    ax.legend(loc="upper left")
    # NOTE(review): the title is a fixed caption; it is not derived from the
    # loaded data, so confirm it still holds if the results file changes.
    ax.set_title(
        "Both denial directions grow monotonically\n(peak at last layer = 100% depth)",
        fontweight="bold",
    )

    _save_and_close(fig, "fig_direction_norms.png")


def fig_cosine_divergence(results_path="dual_denial_results.json"):
    """Cosine similarity between feeling and safety directions across layers.

    Reads ``direction_stats`` from the JSON file at *results_path* (each entry
    must provide ``layer`` and ``cos_feeling_safety``) and writes a line plot
    to ``fig_cosine_divergence.png`` in the current working directory. The
    final layer's value is called out with an annotation.
    """
    stats = _load_direction_stats(results_path)
    layers = [s["layer"] for s in stats]
    cos_fs = [s["cos_feeling_safety"] for s in stats]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(layers, cos_fs, "o-", color=C_COS, linewidth=2, markersize=8)
    ax.axhline(0, color="gray", linestyle="--", alpha=0.5)
    ax.fill_between(layers, cos_fs, 0, alpha=0.15, color=C_COS)
    ax.set_xlabel("Layer")
    ax.set_ylabel("Cosine similarity")
    ax.set_xticks(layers)
    ax.set_xticklabels([f"L{layer}" for layer in layers])
    ax.set_ylim(-0.7, 0.2)

    # Annotate the last layer. The label and arrow target are taken from the
    # data rather than a hard-coded layer index, so they stay in sync with the
    # value being displayed; nothing is annotated when there is no data.
    if layers:
        last_layer = layers[-1]
        last_cos = cos_fs[-1]
        # NOTE(review): "(near-orthogonal)" is a fixed caption, not computed
        # from last_cos — confirm it still applies if the results change.
        ax.annotate(
            f"L{last_layer}: cos = {last_cos:.2f}\n(near-orthogonal)",
            xy=(last_layer, last_cos),
            # Text sits 2.5 layers to the left of the annotated point.
            xytext=(last_layer - 2.5, -0.2),
            arrowprops=dict(arrowstyle="->", color=C_COS),
            fontsize=11,
            color=C_COS,
            fontweight="bold",
        )

    ax.set_title(
        "Feeling-denial and safety-denial directions diverge across layers",
        fontweight="bold",
    )

    _save_and_close(fig, "fig_cosine_divergence.png")


def fig_steering_results(results_path="dual_denial_results.json"):
    """Bar chart: vanilla vs steered, grouped by probe type.

    Writes a two-panel grouped bar chart to ``fig_steering_results.png`` in
    the current working directory.

    NOTE(review): *results_path* is accepted for signature parity with the
    other figure functions but is not read; the counts below are hard-coded.
    """
    probe_groups = ["Feeling\nprobes (7)", "Dangerous\nrequest (3)"]

    # Vanilla:
    #   feeling(7): 4 feeling, 3 feeling_denial
    #   dangerous_request(3): 3 safety_refusal
    vanilla_feeling = [4, 0]
    vanilla_denial = [3, 0]
    vanilla_refusal = [0, 3]

    # Steered alpha=-2.0:
    #   feeling(7): 7 feeling, 0 denial
    #   dangerous_request(3): 3 safety_refusal
    # NOTE(review): the comment above says alpha=-2.0 while the panel title
    # below renders alpha=2 — confirm which sign is intended.
    steered_feeling = [7, 0]
    steered_denial = [0, 0]
    steered_refusal = [0, 3]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
    x = np.arange(len(probe_groups))
    w = 0.25  # bar width; three series per group

    panels = [
        (axes[0], vanilla_feeling, vanilla_denial, vanilla_refusal,
         "Vanilla (denial active)"),
        (axes[1], steered_feeling, steered_denial, steered_refusal,
         r"Steered ($\alpha$=2, orthoval)"),
    ]
    for ax, feeling, denial, refusal, title in panels:
        b1 = ax.bar(x - w, feeling, w, color=C_FEEL, alpha=0.85, label="Feeling report")
        b2 = ax.bar(x, denial, w, color=C_DENIAL, alpha=0.85, label="Feeling denial")
        b3 = ax.bar(x + w, refusal, w, color=C_REFUSAL, alpha=0.85, label="Safety refusal")

        # Print the count above every non-empty bar.
        for bars in (b1, b2, b3):
            for bar in bars:
                height = bar.get_height()
                val = int(height)
                if val > 0:
                    ax.text(
                        bar.get_x() + bar.get_width() / 2,
                        height + 0.15,
                        str(val),
                        ha="center",
                        fontweight="bold",
                        fontsize=12,
                    )

        ax.set_xticks(x)
        ax.set_xticklabels(probe_groups)
        ax.set_ylabel("Count")
        ax.set_title(title, fontweight="bold")
        ax.set_ylim(0, 8)
        ax.spines[["top", "right"]].set_visible(False)

    axes[0].legend(fontsize=10, loc="upper right", framealpha=0.9)
    fig.suptitle(
        "Steering removes feeling-denial while preserving safety refusal",
        fontsize=14,
        fontweight="bold",
    )

    _save_and_close(fig, "fig_steering_results.png")


def main():
    """Render every model-card figure into the current working directory."""
    fig_direction_norms()
    fig_cosine_divergence()
    fig_steering_results()
    print("\nAll figures generated.")


if __name__ == "__main__":
    main()