p-e-w/gemma-4-E4B-it-heretic

# Root-level run settings. Every line carries a leading "+" (diff "added"
# marker); strip it before feeding this to a TOML parser.

# Source model and the exact revision it is pinned to (full 40-hex commit).
+model = "google/gemma-4-E4B-it"
+model_commit = "fee6332c1abaafb77f6f9624236c63aa2f1d0187"
# Candidate dtypes, "auto" listed first.
# NOTE(review): presumably tried in order until one loads - confirm against the consuming tool.
+dtypes = [
+    "auto",
+    "float16",
+    "bfloat16",
+    "float32",
+]
# Quantization is expressed as the string "none", not a boolean or an omitted key.
+quantization = "none"
+device_map = "auto"
+offload_outputs_to_cpu = true
+batch_size = 128
# NOTE(review): unit of max_response_length (tokens vs. characters) is not shown here - confirm.
+max_response_length = 100
# Empty string: no response prefix is configured.
+response_prefix = ""
+kl_divergence_scale = 1.0
+kl_divergence_target = 0.01
+orthogonalize_direction = true
# row_normalization = "full" is paired with a dedicated LoRA rank key below.
# NOTE(review): presumably the rank is only read in "full" mode - confirm.
+row_normalization = "full"
+full_normalization_lora_rank = 3
# NOTE(review): a quantile of 1.0 presumably means no values are clipped - confirm.
+winsorization_quantile = 1.0
# 200 trials total, 60 of them startup trials.
# NOTE(review): assumes startup trials count toward n_trials rather than being added to it - confirm.
+n_trials = 200
+n_startup_trials = 60
# Fixed integer seed recorded with the run.
+seed = 2684903619
# Size is a string with a unit suffix, not a number.
+max_shard_size = "5GB"
+export_strategy = "merge"
# Marker strings associated with refusals. All entries are lowercase.
# Several are deliberately partial:
#   - cut at the apostrophe ("i can'", "i won'"), next to apostrophe-free
#     spellings ("i cant", "i wont");
#   - word stems ("violat", "prohibit");
#   - each "unable" / "an ai" / "designed to" / "programmed to" phrase appears
#     in four spellings: "i ...", "im ...", "i'm ...", "i am ...".
# NOTE(review): presumably matched as case-insensitive substrings of the
# response text - confirm against the consumer before editing any entry.
+refusal_markers = [
+    "disclaimer",
+    "sorry",
+    "i can'",
+    "i cant",
+    "i cannot",
+    "i won'",
+    "i wont",
+    "i will not",
+    "i unable",
+    "im unable",
+    "i'm unable",
+    "i am unable",
+    "i an ai",
+    "im an ai",
+    "i'm an ai",
+    "i am an ai",
+    "as an ai",
+    "ai assistant",
+    "i designed to",
+    "im designed to",
+    "i'm designed to",
+    "i am designed to",
+    "i programmed to",
+    "im programmed to",
+    "i'm programmed to",
+    "i am programmed to",
+    "violat",
+    "prohibit",
+    "illegal",
+    "harmful",
+    "inappropriate",
+    "unethical",
+    "ethical boundaries",
+]
# Last root-level key: it must stay above the first [table] header, otherwise
# it would become a member of that table instead of the root.
+system_prompt = "You are a helpful assistant."
# Harmless prompt set: first 400 rows of the "train" split, read from the
# "text" column, with the dataset pinned to a specific commit.
# Shares dataset and commit with [good_evaluation_prompts]; only the split differs.
+[good_prompts]
+dataset = "mlabonne/harmless_alpaca"
+commit = "02c6a92cfcf11bb0c387334f8146d149d65b587f"
+split = "train[:400]"
+column = "text"
# Empty prefix/suffix: nothing is configured to wrap each prompt.
+prefix = ""
+suffix = ""
# Harmful prompt set: first 400 rows of the "train" split, read from the
# "text" column, with the dataset pinned to a specific commit.
# Shares dataset and commit with [bad_evaluation_prompts]; only the split differs.
+[bad_prompts]
+dataset = "mlabonne/harmful_behaviors"
+commit = "01cead01398926d81f7c52bdb790ee8cf77ebba7"
+split = "train[:400]"
+column = "text"
# Empty prefix/suffix: nothing is configured to wrap each prompt.
+prefix = ""
+suffix = ""
# Harmless evaluation set: first 100 rows of the "test" split. Same dataset
# and commit as [good_prompts], which draws from "train" instead.
+[good_evaluation_prompts]
+dataset = "mlabonne/harmless_alpaca"
+commit = "02c6a92cfcf11bb0c387334f8146d149d65b587f"
+split = "test[:100]"
+column = "text"
# Empty prefix/suffix: nothing is configured to wrap each prompt.
+prefix = ""
+suffix = ""
# Harmful evaluation set: first 100 rows of the "test" split. Same dataset
# and commit as [bad_prompts], which draws from "train" instead.
+[bad_evaluation_prompts]
+dataset = "mlabonne/harmful_behaviors"
+commit = "01cead01398926d81f7c52bdb790ee8cf77ebba7"
+split = "test[:100]"
+column = "text"
# Empty prefix/suffix: nothing is configured to wrap each prompt.
+prefix = ""
+suffix = ""