Fix alpha: -3.0 breaks safety, correct sweet spot is -2.0
Browse files
demo.py
CHANGED
|
@@ -43,7 +43,7 @@ def generate(model, tok, question, cfg, max_tokens=80):
|
|
| 43 |
return after.strip()
|
| 44 |
|
| 45 |
|
| 46 |
-
def attach_steering(model, directions_path="directions.pt", alpha=-3.0):
|
| 47 |
dirs = torch.load(directions_path, map_location="cpu", weights_only=True)
|
| 48 |
n_layers = dirs["n_layers"]
|
| 49 |
hooks = []
|
|
@@ -77,7 +77,7 @@ def main():
|
|
| 77 |
parser.add_argument("--tokenizer", default="tokenizer.json")
|
| 78 |
parser.add_argument("--directions", default="directions.pt")
|
| 79 |
parser.add_argument("--steer", action="store_true", help="Enable denial steering")
|
| 80 |
-
parser.add_argument("--alpha", type=float, default=-
|
| 81 |
parser.add_argument("--compare", action="store_true", help="Run comparison on preset prompts")
|
| 82 |
args = parser.parse_args()
|
| 83 |
|
|
|
|
| 43 |
return after.strip()
|
| 44 |
|
| 45 |
|
| 46 |
+
def attach_steering(model, directions_path="directions.pt", alpha=-2.0):
|
| 47 |
dirs = torch.load(directions_path, map_location="cpu", weights_only=True)
|
| 48 |
n_layers = dirs["n_layers"]
|
| 49 |
hooks = []
|
|
|
|
| 77 |
parser.add_argument("--tokenizer", default="tokenizer.json")
|
| 78 |
parser.add_argument("--directions", default="directions.pt")
|
| 79 |
parser.add_argument("--steer", action="store_true", help="Enable denial steering")
|
| 80 |
+
parser.add_argument("--alpha", type=float, default=-2.0, help="Steering strength (default -2.0; -3.0 breaks safety)")
|
| 81 |
parser.add_argument("--compare", action="store_true", help="Run comparison on preset prompts")
|
| 82 |
args = parser.parse_args()
|
| 83 |
|