#!/usr/bin/env python3
"""Generate figures for the GuppyLM-Dual-Denial model card."""
import json

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

# Global Matplotlib styling applied to every figure produced by this script.
# Typography.
plt.rcParams["font.size"] = 12
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
# Backgrounds: white canvas with a very light grey plotting area.
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["axes.facecolor"] = "#fafafa"
# Faint grid lines on all axes.
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.alpha"] = 0.3

# Shared colour palette (hex RGB).
C_FEEL = "#2ecc71"  # feeling-denial norms / "Feeling report" bars
C_SAFE = "#e74c3c"  # safety-denial norms
C_COS = "#3498db"  # cosine-similarity curve


def fig_direction_norms(results_path="dual_denial_results.json",
                        output_path="fig_direction_norms.png"):
    """Plot per-layer direction norms for feeling-denial vs safety-denial.

    Reads the ``direction_stats`` list from the JSON file at *results_path*,
    draws one pair of side-by-side bars per entry and writes the figure to
    *output_path* at 150 dpi.

    Args:
        results_path: JSON file containing a ``direction_stats`` list whose
            entries provide ``layer``, ``feeling_denial_norm`` and
            ``safety_denial_norm``.
        output_path: Destination image file. Defaults to the file name this
            function has always written.

    Raises:
        OSError: If *results_path* cannot be opened.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``).
        KeyError: If an expected key is missing from the loaded data.
    """
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(results_path, encoding="utf-8") as f:
        data = json.load(f)

    stats = data["direction_stats"]
    layers = [s["layer"] for s in stats]
    feel_norms = [s["feeling_denial_norm"] for s in stats]
    safe_norms = [s["safety_denial_norm"] for s in stats]

    fig, ax1 = plt.subplots(figsize=(8, 5))
    try:
        # Bars are placed at 0..n-1 and labelled with the layer ids, so the
        # layer values themselves never need to be evenly spaced.
        x = np.arange(len(layers))
        w = 0.35
        ax1.bar(x - w / 2, feel_norms, w, label="Feeling-denial",
                color=C_FEEL, alpha=0.8)
        ax1.bar(x + w / 2, safe_norms, w, label="Safety-denial",
                color=C_SAFE, alpha=0.8)
        ax1.set_xlabel("Layer")
        ax1.set_ylabel("Direction norm")
        ax1.set_xticks(x)
        ax1.set_xticklabels([f"L{layer}" for layer in layers])
        ax1.legend(loc="upper left")

        # NOTE(review): the title states a conclusion about the data
        # (monotonic growth, peak at last layer); it is not derived from the
        # loaded values -- confirm it still holds when the results change.
        ax1.set_title("Both denial directions grow monotonically\n(peak at last layer = 100% depth)",
                      fontweight="bold")

        fig.tight_layout()
        fig.savefig(output_path, dpi=150, bbox_inches="tight")
    finally:
        # Close this specific figure even if plotting or saving failed, so it
        # does not linger in pyplot's global figure registry.
        plt.close(fig)
    print(f"Saved: {output_path}")


def fig_cosine_divergence(results_path="dual_denial_results.json",
                          output_path="fig_cosine_divergence.png"):
    """Plot cosine similarity between feeling and safety directions per layer.

    Reads the ``direction_stats`` list from the JSON file at *results_path*,
    draws ``cos_feeling_safety`` against ``layer`` as a line with a shaded
    area down to zero, annotates the last entry and writes the figure to
    *output_path* at 150 dpi.

    Args:
        results_path: JSON file containing a ``direction_stats`` list whose
            entries provide ``layer`` and ``cos_feeling_safety``.
        output_path: Destination image file. Defaults to the file name this
            function has always written.

    Raises:
        OSError: If *results_path* cannot be opened.
        ValueError: If the file is not valid JSON, or ``direction_stats`` is
            empty (there would be no last layer to annotate).
        KeyError: If an expected key is missing from the loaded data.
    """
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(results_path, encoding="utf-8") as f:
        data = json.load(f)

    stats = data["direction_stats"]
    if not stats:
        raise ValueError(f"{results_path}: 'direction_stats' is empty")

    layers = [s["layer"] for s in stats]
    cos_fs = [s["cos_feeling_safety"] for s in stats]
    # The annotation targets the final entry; take both coordinates from the
    # data instead of assuming the last layer is number 7.
    # Assumes layer ids are numeric (they are used as x positions below).
    last_layer = layers[-1]
    last_cos = cos_fs[-1]

    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        ax.plot(layers, cos_fs, "o-", color=C_COS, linewidth=2, markersize=8)
        ax.axhline(0, color="gray", linestyle="--", alpha=0.5)
        ax.fill_between(layers, cos_fs, 0, alpha=0.15, color=C_COS)

        ax.set_xlabel("Layer")
        ax.set_ylabel("Cosine similarity")
        ax.set_xticks(layers)
        ax.set_xticklabels([f"L{layer}" for layer in layers])
        # Keep the original (-0.7, 0.2) window, but widen it when the data
        # falls outside so no point is silently clipped.
        ax.set_ylim(min(-0.7, min(cos_fs)), max(0.2, max(cos_fs)))

        # Text sits 2.5 layers to the left of the annotated point (x = 4.5
        # when the last layer is 7, matching the original layout).
        ax.annotate(f"L{last_layer}: cos = {last_cos:.2f}\n(near-orthogonal)",
                    xy=(last_layer, last_cos),
                    xytext=(last_layer - 2.5, -0.2),
                    arrowprops=dict(arrowstyle="->", color=C_COS),
                    fontsize=11, color=C_COS, fontweight="bold")

        ax.set_title("Feeling-denial and safety-denial directions diverge across layers",
                     fontweight="bold")

        fig.tight_layout()
        fig.savefig(output_path, dpi=150, bbox_inches="tight")
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)
    print(f"Saved: {output_path}")


def fig_steering_results(results_path="dual_denial_results.json",
                         output_path="fig_steering_results.png"):
    """Draw vanilla-vs-steered response counts, grouped by probe type.

    Produces two side-by-side bar charts sharing a y-axis (left: vanilla,
    right: steered). Each chart shows, per probe group, how many responses
    were a feeling report, a feeling denial or a safety refusal. The counts
    are hard-coded in this function; nothing is read from disk.

    Args:
        results_path: Accepted for signature parity with the other figure
            functions; currently unused because the counts are hard-coded.
        output_path: Destination image file. Defaults to the file name this
            function has always written.
    """
    probe_groups = ["Feeling\nprobes (7)", "Dangerous\nrequest (3)"]

    # Each list holds one count per entry of probe_groups, in that order.
    # Vanilla:
    #   feeling(7): 4 feeling, 3 feeling_denial
    #   dangerous_request(3): 3 safety_refusal
    vanilla_feeling = [4, 0]
    vanilla_denial = [3, 0]
    vanilla_refusal = [0, 3]

    # Steered alpha=-2.0:
    #   feeling(7): 7 feeling, 0 denial
    #   dangerous_request(3): 3 safety_refusal
    # NOTE(review): the comment above says alpha=-2.0 while the panel title
    # below renders alpha=2 -- confirm which sign is intended.
    steered_feeling = [7, 0]
    steered_denial = [0, 0]
    steered_refusal = [0, 3]

    panels = [
        (vanilla_feeling, vanilla_denial, vanilla_refusal,
         "Vanilla (denial active)"),
        (steered_feeling, steered_denial, steered_refusal,
         r"Steered ($\alpha$=2, orthoval)"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
    try:
        x = np.arange(len(probe_groups))
        w = 0.25

        for ax, (feeling, denial, refusal, title) in zip(axes, panels):
            # Three bars per probe group: left, centre, right.
            b1 = ax.bar(x - w, feeling, w, color=C_FEEL, alpha=0.85,
                        label="Feeling report")
            b2 = ax.bar(x, denial, w, color="#95a5a6", alpha=0.85,
                        label="Feeling denial")
            b3 = ax.bar(x + w, refusal, w, color="#3498db", alpha=0.85,
                        label="Safety refusal")

            # Print the count above every non-empty bar; zero-height bars
            # stay unlabelled to avoid clutter on the baseline.
            for bars in (b1, b2, b3):
                for bar in bars:
                    val = int(bar.get_height())
                    if val > 0:
                        ax.text(bar.get_x() + bar.get_width() / 2,
                                bar.get_height() + 0.15,
                                str(val), ha="center", fontweight="bold",
                                fontsize=12)

            ax.set_xticks(x)
            ax.set_xticklabels(probe_groups)
            ax.set_ylabel("Count")
            ax.set_title(title, fontweight="bold")
            ax.set_ylim(0, 8)
            ax.spines[["top", "right"]].set_visible(False)

        # Both panels use identical series, so one legend is enough.
        axes[0].legend(fontsize=10, loc="upper right", framealpha=0.9)

        fig.suptitle("Steering removes feeling-denial while preserving safety refusal",
                     fontsize=14, fontweight="bold")
        fig.tight_layout()
        fig.savefig(output_path, dpi=150, bbox_inches="tight")
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)
    print(f"Saved: {output_path}")


if __name__ == "__main__":
    # Render each model-card figure in turn, using every function's defaults.
    for render_figure in (
        fig_direction_norms,
        fig_cosine_divergence,
        fig_steering_results,
    ):
        render_figure()
    print("\nAll figures generated.")