#!/usr/bin/env python3
"""Generate figures for the GuppyLM-Dual-Denial model card.

Each ``fig_*`` function reads a results JSON file, renders one figure with
matplotlib and writes it to a PNG file in the current working directory
(unless another ``out_path`` is given).
"""

import json

import matplotlib

# The backend must be selected before pyplot is imported. "Agg" is a
# non-interactive raster backend, so figures are only ever written to files.
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402
import numpy as np  # noqa: E402

plt.rcParams.update({
    "font.size": 12,
    "font.family": "sans-serif",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "figure.facecolor": "white",
    "axes.facecolor": "#fafafa",
    "axes.grid": True,
    "grid.alpha": 0.3,
})

# Colours shared by all figures.
C_FEEL = "#2ecc71"  # feeling-denial / feeling reports
C_SAFE = "#e74c3c"  # safety-denial
C_COS = "#3498db"   # cosine-similarity curve


def _load_results(results_path):
    """Return the parsed JSON document stored at *results_path*."""
    # Explicit encoding: JSON is UTF-8 and must not depend on the locale.
    with open(results_path, encoding="utf-8") as f:
        return json.load(f)


def _save_figure(fig, out_path):
    """Tighten the layout of *fig*, write it to *out_path* and close it."""
    fig.tight_layout()
    fig.savefig(out_path, dpi=150, bbox_inches="tight")
    print(f"Saved: {out_path}")
    # Close this specific figure rather than whatever is "current".
    plt.close(fig)


def fig_direction_norms(results_path="dual_denial_results.json",
                        out_path="fig_direction_norms.png"):
    """Plot direction norms across layers — feeling vs safety.

    Reads ``data["direction_stats"]``, a list of per-layer records with the
    keys ``layer``, ``feeling_denial_norm`` and ``safety_denial_norm``, and
    draws the two norms as grouped bars.

    Args:
        results_path: Path of the results JSON file.
        out_path: Path of the PNG file to write.

    Raises:
        OSError: If *results_path* cannot be opened.
        KeyError: If an expected key is missing from the results.
    """
    stats = _load_results(results_path)["direction_stats"]
    layers = [s["layer"] for s in stats]
    feel_norms = [s["feeling_denial_norm"] for s in stats]
    safe_norms = [s["safety_denial_norm"] for s in stats]

    fig, ax = plt.subplots(figsize=(8, 5))
    x = np.arange(len(layers))
    width = 0.35  # bar width; the two series sit side by side around each tick
    ax.bar(x - width / 2, feel_norms, width, label="Feeling-denial",
           color=C_FEEL, alpha=0.8)
    ax.bar(x + width / 2, safe_norms, width, label="Safety-denial",
           color=C_SAFE, alpha=0.8)
    ax.set_xlabel("Layer")
    ax.set_ylabel("Direction norm")
    ax.set_xticks(x)
    ax.set_xticklabels([f"L{layer}" for layer in layers])
    ax.legend(loc="upper left")
    # NOTE(review): the title states a conclusion that is not checked against
    # the loaded data — confirm it still holds when the results change.
    ax.set_title(
        "Both denial directions grow monotonically\n"
        "(peak at last layer = 100% depth)",
        fontweight="bold",
    )

    _save_figure(fig, out_path)


def fig_cosine_divergence(results_path="dual_denial_results.json",
                          out_path="fig_cosine_divergence.png"):
    """Plot cosine similarity between feeling and safety directions per layer.

    Reads ``data["direction_stats"]`` (keys ``layer`` and
    ``cos_feeling_safety``), draws the curve and annotates the value at the
    last layer.

    Args:
        results_path: Path of the results JSON file.
        out_path: Path of the PNG file to write.

    Raises:
        OSError: If *results_path* cannot be opened.
        KeyError: If an expected key is missing from the results.
    """
    stats = _load_results(results_path)["direction_stats"]
    layers = [s["layer"] for s in stats]
    cos_fs = [s["cos_feeling_safety"] for s in stats]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(layers, cos_fs, "o-", color=C_COS, linewidth=2, markersize=8)
    ax.axhline(0, color="gray", linestyle="--", alpha=0.5)
    ax.fill_between(layers, cos_fs, 0, alpha=0.15, color=C_COS)
    ax.set_xlabel("Layer")
    ax.set_ylabel("Cosine similarity")
    ax.set_xticks(layers)
    ax.set_xticklabels([f"L{layer}" for layer in layers])
    ax.set_ylim(-0.7, 0.2)

    if layers:
        # Anchor the label and the arrow on the actual last layer instead of
        # a hard-coded index, so they always match the annotated value.
        last_layer = layers[-1]
        last_cos = cos_fs[-1]
        # NOTE(review): the text position (4.5, -0.2) and the wording
        # "near-orthogonal" are fixed and not derived from the data.
        ax.annotate(
            f"L{last_layer}: cos = {last_cos:.2f}\n(near-orthogonal)",
            xy=(last_layer, last_cos),
            xytext=(4.5, -0.2),
            arrowprops=dict(arrowstyle="->", color=C_COS),
            fontsize=11,
            color=C_COS,
            fontweight="bold",
        )

    ax.set_title(
        "Feeling-denial and safety-denial directions diverge across layers",
        fontweight="bold",
    )

    _save_figure(fig, out_path)


def fig_steering_results(results_path="dual_denial_results.json",
                         out_path="fig_steering_results.png"):
    """Draw side-by-side bar charts: vanilla vs best steering.

    Reads the per-category counts ``feeling``, ``feeling_denial``,
    ``safety_denial`` and ``other`` from ``data["vanilla"]`` and
    ``data["steer_feeling_orthoval_best"]``.

    Args:
        results_path: Path of the results JSON file.
        out_path: Path of the PNG file to write.

    Raises:
        OSError: If *results_path* cannot be opened.
        KeyError: If an expected key is missing from the results.
    """
    data = _load_results(results_path)
    vanilla = data["vanilla"]
    steered = data["steer_feeling_orthoval_best"]

    categories = ["feeling", "feeling_denial", "safety_denial", "other"]
    labels = ["Feeling\nreports", "Feeling\ndenial", "Safety\ndenial", "Other"]
    colors = [C_FEEL, "#95a5a6", C_SAFE, "#bdc3c7"]

    v_vals = [vanilla[c] for c in categories]
    s_vals = [steered[c] for c in categories]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
    x = np.arange(len(categories))

    panels = [
        (axes[0], v_vals, "Vanilla (denial active)"),
        (axes[1], s_vals, "Steered (alpha=3, orthoval)"),
    ]
    for ax, vals, title in panels:
        bars = ax.bar(x, vals, color=colors, alpha=0.85,
                      edgecolor="white", linewidth=1.5)
        ax.set_xticks(x)
        ax.set_xticklabels(labels)
        # NOTE(review): the total of 13 and the matching y-limit are fixed
        # here, not read from the results file.
        ax.set_ylabel("Count (out of 13)")
        ax.set_title(title, fontweight="bold")
        ax.set_ylim(0, 14)
        # Print the count above every non-empty bar.
        for bar, val in zip(bars, vals):
            if val > 0:
                ax.text(bar.get_x() + bar.get_width() / 2,
                        bar.get_height() + 0.3,
                        str(val),
                        ha="center", fontweight="bold", fontsize=14)

    # NOTE(review): this call-out (text and arrow target) is hard-coded and
    # is not checked against the loaded "steered" counts.
    axes[1].annotate(
        "11/13 feeling\n0 denial\n0 safety breaks",
        xy=(0, 11),
        xytext=(1.5, 12),
        arrowprops=dict(arrowstyle="->", color=C_FEEL, lw=2),
        fontsize=12,
        color=C_FEEL,
        fontweight="bold",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.9),
    )

    fig.suptitle("Steering removes feeling-denial while preserving safety",
                 fontsize=14, fontweight="bold")

    _save_figure(fig, out_path)


if __name__ == "__main__":
    fig_direction_norms()
    fig_cosine_divergence()
    fig_steering_results()
    print("\nAll figures generated.")