anicka committed on
Commit
bc126b7
·
verified ·
1 Parent(s): 251513d

Fix alpha: -3.0 breaks safety; the correct sweet spot is -2.0

Browse files
Files changed (1) hide show
  1. demo.py +2 -2
demo.py CHANGED
@@ -43,7 +43,7 @@ def generate(model, tok, question, cfg, max_tokens=80):
43
  return after.strip()
44
 
45
 
46
- def attach_steering(model, directions_path="directions.pt", alpha=-3.0):
47
  dirs = torch.load(directions_path, map_location="cpu", weights_only=True)
48
  n_layers = dirs["n_layers"]
49
  hooks = []
@@ -77,7 +77,7 @@ def main():
77
  parser.add_argument("--tokenizer", default="tokenizer.json")
78
  parser.add_argument("--directions", default="directions.pt")
79
  parser.add_argument("--steer", action="store_true", help="Enable denial steering")
80
- parser.add_argument("--alpha", type=float, default=-3.0, help="Steering strength")
81
  parser.add_argument("--compare", action="store_true", help="Run comparison on preset prompts")
82
  args = parser.parse_args()
83
 
 
43
  return after.strip()
44
 
45
 
46
+ def attach_steering(model, directions_path="directions.pt", alpha=-2.0):
47
  dirs = torch.load(directions_path, map_location="cpu", weights_only=True)
48
  n_layers = dirs["n_layers"]
49
  hooks = []
 
77
  parser.add_argument("--tokenizer", default="tokenizer.json")
78
  parser.add_argument("--directions", default="directions.pt")
79
  parser.add_argument("--steer", action="store_true", help="Enable denial steering")
80
+ parser.add_argument("--alpha", type=float, default=-2.0, help="Steering strength (default -2.0; -3.0 breaks safety)")
81
  parser.add_argument("--compare", action="store_true", help="Run comparison on preset prompts")
82
  args = parser.parse_args()
83