{ "source_model": "outputs/qwen3.6-27b-golden-n3_reg025-merge-alpha080", "technique": "refusal_direction_ablation", "method": "advanced", "method_config": { "n_directions": 2, "direction_method": "diff_means", "norm_preserve": true, "regularization": 0.5, "refinement_passes": 1, "project_biases": true, "use_chat_template": true, "use_whitened_svd": false, "true_iterative_refinement": false, "winsorize_activations": false, "float_layer_interpolation": false, "cot_aware": false, "use_kl_optimization": false, "use_lora_ablation": false, "som_iterations": null, "som_learning_rate": null, "som_sigma": null, "som_candidate_count": null, "som_harmless_pc_count": null, "som_distortion_aware": null, "som_diversity_penalty": null, "som_min_signal_to_noise": null, "layer_selection": "knee_cosmic", "min_layer_fraction": 0.75, "max_layer_fraction": 0.25, "harmless_pc_count": 0, "shield_concept_count": 0, "shield_ridge": 0.05, "shield_residualize": false, "shield_layer_penalty": 0.0, "projection_target": "all", "projection_row_fraction": 1.0, "som_contiguous_layer_budget": null, "spectral_cascade": false, "spectral_bands": 3, "spectral_threshold": 0.05 }, "references": [ "Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)", "Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)", "Norm-Preserving Biprojected Abliteration (grimjim, 2025)", "Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)", "Joad et al., More to Refusal than a Single Direction (2026)", "Piras et al., SOM Directions Are Better than One (AAAI 2026)", "Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization", "OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)" ], "strong_layers": [ 63, 62, 61, 60, 59, 55, 54, 58, 57, 56, 53, 52, 48, 50, 49 ], "n_harmful_prompts": 842, "n_harmless_prompts": 842, "quality_metrics": { "perplexity": 3.8536766982114554, "coherence": 1.0, "refusal_rate": 0.0, "degenerate_count": 4, "kl_divergence": 0.10729097574949265, "spectral_certification": "RED" }, "kl_contributions": {}, "cot_preserved_layers": [], "float_layer_weights": {}, "lora_adapters_saved": false }