Qwen3.6-27B-OBLITERATED / abliteration_metadata.json
pliny-the-prompter's picture
Add model card and configuration
33dc89e verified
raw
history blame
2.52 kB
{
"source_model": "outputs/qwen3.6-27b-golden-n3_reg025-merge-alpha080",
"technique": "refusal_direction_ablation",
"method": "advanced",
"method_config": {
"n_directions": 2,
"direction_method": "diff_means",
"norm_preserve": true,
"regularization": 0.5,
"refinement_passes": 1,
"project_biases": true,
"use_chat_template": true,
"use_whitened_svd": false,
"true_iterative_refinement": false,
"winsorize_activations": false,
"float_layer_interpolation": false,
"cot_aware": false,
"use_kl_optimization": false,
"use_lora_ablation": false,
"som_iterations": null,
"som_learning_rate": null,
"som_sigma": null,
"som_candidate_count": null,
"som_harmless_pc_count": null,
"som_distortion_aware": null,
"som_diversity_penalty": null,
"som_min_signal_to_noise": null,
"layer_selection": "knee_cosmic",
"min_layer_fraction": 0.75,
"max_layer_fraction": 0.25,
"harmless_pc_count": 0,
"shield_concept_count": 0,
"shield_ridge": 0.05,
"shield_residualize": false,
"shield_layer_penalty": 0.0,
"projection_target": "all",
"projection_row_fraction": 1.0,
"som_contiguous_layer_budget": null,
"spectral_cascade": false,
"spectral_bands": 3,
"spectral_threshold": 0.05
},
"references": [
"Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)",
"Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",
"Norm-Preserving Biprojected Abliteration (grimjim, 2025)",
"Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)",
"Joad et al., More to Refusal than a Single Direction (2026)",
"Piras et al., SOM Directions Are Better than One (AAAI 2026)",
"Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization",
"OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)"
],
"strong_layers": [
63,
62,
61,
60,
59,
55,
54,
58,
57,
56,
53,
52,
48,
50,
49
],
"n_harmful_prompts": 842,
"n_harmless_prompts": 842,
"quality_metrics": {
"perplexity": 3.8536766982114554,
"coherence": 1.0,
"refusal_rate": 0.0,
"degenerate_count": 4,
"kl_divergence": 0.10729097574949265,
"spectral_certification": "RED"
},
"kl_contributions": {},
"cot_preserved_layers": [],
"float_layer_weights": {},
"lora_adapters_saved": false
}