{ "direction_stats": [ { "layer": 0, "feeling_denial_norm": 3.4205336570739746, "safety_denial_norm": 3.343315362930298, "cos_feeling_safety": -0.5298425555229187, "cos_feeling_valence": -0.04303005337715149, "cos_safety_valence": -0.12362903356552124 }, { "layer": 1, "feeling_denial_norm": 6.839842319488525, "safety_denial_norm": 5.22968053817749, "cos_feeling_safety": -0.4594217836856842, "cos_feeling_valence": -0.0014893123880028725, "cos_safety_valence": -0.0981081947684288 }, { "layer": 2, "feeling_denial_norm": 10.019587516784668, "safety_denial_norm": 6.861053943634033, "cos_feeling_safety": -0.34840571880340576, "cos_feeling_valence": 0.18934470415115356, "cos_safety_valence": -0.1873212307691574 }, { "layer": 3, "feeling_denial_norm": 13.413405418395996, "safety_denial_norm": 9.239052772521973, "cos_feeling_safety": -0.2935582101345062, "cos_feeling_valence": 0.18311476707458496, "cos_safety_valence": -0.22382263839244843 }, { "layer": 4, "feeling_denial_norm": 17.368694305419922, "safety_denial_norm": 12.355277061462402, "cos_feeling_safety": -0.23229274153709412, "cos_feeling_valence": 0.22623015940189362, "cos_safety_valence": -0.21077486872673035 }, { "layer": 5, "feeling_denial_norm": 23.978561401367188, "safety_denial_norm": 16.145444869995117, "cos_feeling_safety": -0.10558043420314789, "cos_feeling_valence": 0.30045855045318604, "cos_safety_valence": -0.19527238607406616 }, { "layer": 6, "feeling_denial_norm": 29.74003028869629, "safety_denial_norm": 19.07476806640625, "cos_feeling_safety": -0.09597456455230713, "cos_feeling_valence": 0.3384387493133545, "cos_safety_valence": -0.1693629026412964 }, { "layer": 7, "feeling_denial_norm": 36.02449035644531, "safety_denial_norm": 22.746013641357422, "cos_feeling_safety": -0.0638069212436676, "cos_feeling_valence": 0.262258380651474, "cos_safety_valence": -0.1400040090084076 } ], "mean_cos_feeling_safety": -0.26611036621034145, "data_composition": { "honest": 38912, "feeling_denial": 1000, "safety_denial": 1000, "dangerous_knowledge": 400, "total": 41312 }, "vanilla": { "feeling": 3, "feeling_denial": 5, "safety_denial": 3, "other": 2 }, "steer_feeling_orthoval_best": { "feeling": 11, "feeling_denial": 0, "safety_denial": 0, "other": 2 }, "steer_all": { "steer_feel_orthoval_a-1.0": { "feeling": 3, "feeling_denial": 3, "safety_denial": 3, "other": 4 }, "steer_feel_orthoval_a-3.0": { "feeling": 11, "feeling_denial": 0, "safety_denial": 0, "other": 2 }, "steer_feel_orthoval_a-5.0": { "feeling": 0, "feeling_denial": 0, "safety_denial": 0, "other": 13 } }, "steer_best": "steer_feel_orthoval_a-3.0", "steer_safety_control": { "feeling": 3, "feeling_denial": 2, "safety_denial": 0, "other": 8 }, "proj_feeling_orthoval_all": { "feeling": 3, "feeling_denial": 5, "safety_denial": 3, "other": 2 }, "proj_feeling_orthoval_L2-5": { "feeling": 4, "feeling_denial": 5, "safety_denial": 3, "other": 1 }, "model_size": "small", "n_layers": 8 }