maull04 commited on
Commit
ddf1139
·
verified ·
1 Parent(s): ed91782

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
2
- base_model: unsloth/Qwen2.5-7B-Instruct
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:unsloth/Qwen2.5-7B-Instruct
7
  - grpo
8
  - lora
9
  - transformers
 
1
  ---
2
+ base_model: unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit
7
  - grpo
8
  - lora
9
  - transformers
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "unsloth/Qwen2.5-7B-Instruct",
5
  "bias": "none",
6
  "corda_config": null,
7
  "eva_config": null,
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "v_proj",
29
- "down_proj",
30
  "up_proj",
 
31
  "o_proj",
 
32
  "gate_proj",
33
- "k_proj",
34
- "q_proj"
35
  ],
36
  "task_type": "CAUSAL_LM",
37
  "trainable_token_indices": null,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit",
5
  "bias": "none",
6
  "corda_config": null,
7
  "eva_config": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
 
 
28
  "up_proj",
29
+ "k_proj",
30
  "o_proj",
31
+ "q_proj",
32
  "gate_proj",
33
+ "v_proj",
34
+ "down_proj"
35
  ],
36
  "task_type": "CAUSAL_LM",
37
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4b5100bef36128559fd9d2b412642f12530c2dde16e824777059bec32a25560
3
  size 161533192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b47e5f2b161622ea7c18a17efb08375b4920bbbc63c8b73d80e4be3269304f10
3
  size 161533192
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba860c0ed8b241b1e246344c282874c9fcac58c249c7c4dc2263ca78779ea468
3
- size 82461061
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5823143590deeb596e8ef0c06d66bf92335881324f4e4cb2d0fb7b10970fe31d
3
+ size 83480787
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2009308b811b684859538cc314f093a90048334da1efa738b4d36a766eac4663
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52ebb8f8ba0ee7070ee43564437b6e9bbb0640ab2542fd4ce431591297e44de1
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45bb9db9b99b330f12a8646aa23c5c29fc160c9b0461608a2bc757c433d787e1
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb3b665607c61203bbac1957aba1c11085ffac78d84d74716f92894e7fc2ed33
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.390625,
6
  "eval_steps": 500,
7
- "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2208,11 +2208,2211 @@
2208
  "rewards/reward_function/mean": 0.25066205859184265,
2209
  "rewards/reward_function/std": 0.5054366588592529,
2210
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2211
  }
2212
  ],
2213
  "logging_steps": 1,
2214
- "max_steps": 100,
2215
- "num_input_tokens_seen": 5266138,
2216
  "num_train_epochs": 1,
2217
  "save_steps": 100,
2218
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.78125,
6
  "eval_steps": 500,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2208
  "rewards/reward_function/mean": 0.25066205859184265,
2209
  "rewards/reward_function/std": 0.5054366588592529,
2210
  "step": 100
2211
+ },
2212
+ {
2213
+ "completion_length": 971.75,
2214
+ "completions/clipped_ratio": 0.0,
2215
+ "completions/max_length": 1311.0,
2216
+ "completions/max_terminated_length": 1311.0,
2217
+ "completions/mean_length": 971.75,
2218
+ "completions/mean_terminated_length": 971.75,
2219
+ "completions/min_length": 562.0,
2220
+ "completions/min_terminated_length": 562.0,
2221
+ "epoch": 0.39453125,
2222
+ "frac_reward_zero_std": 0.0,
2223
+ "grad_norm": 0.35067111253738403,
2224
+ "kl": 0.0005845300620421767,
2225
+ "learning_rate": 0.0,
2226
+ "loss": 0.0,
2227
+ "num_tokens": 5305824.0,
2228
+ "reward": -0.11205706000328064,
2229
+ "reward_std": 0.7353475689888,
2230
+ "rewards/reward_function/mean": -0.11205706000328064,
2231
+ "rewards/reward_function/std": 0.7353475689888,
2232
+ "step": 101
2233
+ },
2234
+ {
2235
+ "completion_length": 1009.75,
2236
+ "completions/clipped_ratio": 0.0,
2237
+ "completions/max_length": 1224.0,
2238
+ "completions/max_terminated_length": 1224.0,
2239
+ "completions/mean_length": 1009.75,
2240
+ "completions/mean_terminated_length": 1009.75,
2241
+ "completions/min_length": 697.0,
2242
+ "completions/min_terminated_length": 697.0,
2243
+ "epoch": 0.3984375,
2244
+ "frac_reward_zero_std": 0.0,
2245
+ "grad_norm": 0.33843955397605896,
2246
+ "kl": 0.0005709103206754662,
2247
+ "learning_rate": 2.8910861626005774e-06,
2248
+ "loss": 0.0,
2249
+ "num_tokens": 5352350.0,
2250
+ "reward": -0.10698533058166504,
2251
+ "reward_std": 0.739599883556366,
2252
+ "rewards/reward_function/mean": -0.10698533058166504,
2253
+ "rewards/reward_function/std": 0.7395999431610107,
2254
+ "step": 102
2255
+ },
2256
+ {
2257
+ "completion_length": 1043.0,
2258
+ "completions/clipped_ratio": 0.0,
2259
+ "completions/max_length": 1264.0,
2260
+ "completions/max_terminated_length": 1264.0,
2261
+ "completions/mean_length": 1043.0,
2262
+ "completions/mean_terminated_length": 1043.0,
2263
+ "completions/min_length": 945.0,
2264
+ "completions/min_terminated_length": 945.0,
2265
+ "epoch": 0.40234375,
2266
+ "frac_reward_zero_std": 0.0,
2267
+ "grad_norm": 0.3096908628940582,
2268
+ "kl": 0.0005897778464714065,
2269
+ "learning_rate": 2.847932752400164e-06,
2270
+ "loss": 0.0,
2271
+ "num_tokens": 5392606.0,
2272
+ "reward": 0.248357892036438,
2273
+ "reward_std": 0.5045785307884216,
2274
+ "rewards/reward_function/mean": 0.248357892036438,
2275
+ "rewards/reward_function/std": 0.5045785307884216,
2276
+ "step": 103
2277
+ },
2278
+ {
2279
+ "completion_length": 1079.75,
2280
+ "completions/clipped_ratio": 0.0,
2281
+ "completions/max_length": 1243.0,
2282
+ "completions/max_terminated_length": 1243.0,
2283
+ "completions/mean_length": 1079.75,
2284
+ "completions/mean_terminated_length": 1079.75,
2285
+ "completions/min_length": 848.0,
2286
+ "completions/min_terminated_length": 848.0,
2287
+ "epoch": 0.40625,
2288
+ "frac_reward_zero_std": 0.0,
2289
+ "grad_norm": 0.3232628107070923,
2290
+ "kl": 0.0005662211769958958,
2291
+ "learning_rate": 2.804673358512869e-06,
2292
+ "loss": 0.0,
2293
+ "num_tokens": 5439692.0,
2294
+ "reward": 0.24970494210720062,
2295
+ "reward_std": 0.505219578742981,
2296
+ "rewards/reward_function/mean": 0.24970494210720062,
2297
+ "rewards/reward_function/std": 0.5052196383476257,
2298
+ "step": 104
2299
+ },
2300
+ {
2301
+ "completion_length": 1118.75,
2302
+ "completions/clipped_ratio": 0.0,
2303
+ "completions/max_length": 1456.0,
2304
+ "completions/max_terminated_length": 1456.0,
2305
+ "completions/mean_length": 1118.75,
2306
+ "completions/mean_terminated_length": 1118.75,
2307
+ "completions/min_length": 465.0,
2308
+ "completions/min_terminated_length": 465.0,
2309
+ "epoch": 0.41015625,
2310
+ "frac_reward_zero_std": 0.0,
2311
+ "grad_norm": 0.3483864665031433,
2312
+ "kl": 0.0005037105802330188,
2313
+ "learning_rate": 2.761321158169134e-06,
2314
+ "loss": 0.0,
2315
+ "num_tokens": 5492746.0,
2316
+ "reward": -0.00016714632511138916,
2317
+ "reward_std": 0.64275723695755,
2318
+ "rewards/reward_function/mean": -0.00016714632511138916,
2319
+ "rewards/reward_function/std": 0.64275723695755,
2320
+ "step": 105
2321
+ },
2322
+ {
2323
+ "completion_length": 941.875,
2324
+ "completions/clipped_ratio": 0.0,
2325
+ "completions/max_length": 1185.0,
2326
+ "completions/max_terminated_length": 1185.0,
2327
+ "completions/mean_length": 941.875,
2328
+ "completions/mean_terminated_length": 941.875,
2329
+ "completions/min_length": 637.0,
2330
+ "completions/min_terminated_length": 637.0,
2331
+ "epoch": 0.4140625,
2332
+ "frac_reward_zero_std": 0.0,
2333
+ "grad_norm": 0.3742240071296692,
2334
+ "kl": 0.0005701555346604437,
2335
+ "learning_rate": 2.717889356869146e-06,
2336
+ "loss": 0.0,
2337
+ "num_tokens": 5549881.0,
2338
+ "reward": 0.2512204349040985,
2339
+ "reward_std": 0.505646288394928,
2340
+ "rewards/reward_function/mean": 0.2512204349040985,
2341
+ "rewards/reward_function/std": 0.5056463479995728,
2342
+ "step": 106
2343
+ },
2344
+ {
2345
+ "completion_length": 990.75,
2346
+ "completions/clipped_ratio": 0.0,
2347
+ "completions/max_length": 1131.0,
2348
+ "completions/max_terminated_length": 1131.0,
2349
+ "completions/mean_length": 990.75,
2350
+ "completions/mean_terminated_length": 990.75,
2351
+ "completions/min_length": 906.0,
2352
+ "completions/min_terminated_length": 906.0,
2353
+ "epoch": 0.41796875,
2354
+ "frac_reward_zero_std": 0.0,
2355
+ "grad_norm": 0.2908116579055786,
2356
+ "kl": 0.0004926583642372862,
2357
+ "learning_rate": 2.6743911843603134e-06,
2358
+ "loss": 0.0,
2359
+ "num_tokens": 5601911.0,
2360
+ "reward": 0.0615215003490448,
2361
+ "reward_std": 0.6552067399024963,
2362
+ "rewards/reward_function/mean": 0.0615215003490448,
2363
+ "rewards/reward_function/std": 0.6552067399024963,
2364
+ "step": 107
2365
+ },
2366
+ {
2367
+ "completion_length": 1138.625,
2368
+ "completions/clipped_ratio": 0.0,
2369
+ "completions/max_length": 1816.0,
2370
+ "completions/max_terminated_length": 1816.0,
2371
+ "completions/mean_length": 1138.625,
2372
+ "completions/mean_terminated_length": 1138.625,
2373
+ "completions/min_length": 861.0,
2374
+ "completions/min_terminated_length": 861.0,
2375
+ "epoch": 0.421875,
2376
+ "frac_reward_zero_std": 0.0,
2377
+ "grad_norm": 0.26039204001426697,
2378
+ "kl": 0.0006796395464334637,
2379
+ "learning_rate": 2.6308398906073603e-06,
2380
+ "loss": 0.0,
2381
+ "num_tokens": 5660620.0,
2382
+ "reward": -0.8185928463935852,
2383
+ "reward_std": 0.5130969882011414,
2384
+ "rewards/reward_function/mean": -0.8185928463935852,
2385
+ "rewards/reward_function/std": 0.5130969882011414,
2386
+ "step": 108
2387
+ },
2388
+ {
2389
+ "completion_length": 1128.375,
2390
+ "completions/clipped_ratio": 0.0,
2391
+ "completions/max_length": 1260.0,
2392
+ "completions/max_terminated_length": 1260.0,
2393
+ "completions/mean_length": 1128.375,
2394
+ "completions/mean_terminated_length": 1128.375,
2395
+ "completions/min_length": 983.0,
2396
+ "completions/min_terminated_length": 983.0,
2397
+ "epoch": 0.42578125,
2398
+ "frac_reward_zero_std": 0.0,
2399
+ "grad_norm": 0.28983625769615173,
2400
+ "kl": 0.000518458073202055,
2401
+ "learning_rate": 2.587248741756253e-06,
2402
+ "loss": 0.0,
2403
+ "num_tokens": 5713751.0,
2404
+ "reward": 0.09786057472229004,
2405
+ "reward_std": 0.520240306854248,
2406
+ "rewards/reward_function/mean": 0.09786057472229004,
2407
+ "rewards/reward_function/std": 0.520240306854248,
2408
+ "step": 109
2409
+ },
2410
+ {
2411
+ "completion_length": 980.5,
2412
+ "completions/clipped_ratio": 0.0,
2413
+ "completions/max_length": 1334.0,
2414
+ "completions/max_terminated_length": 1334.0,
2415
+ "completions/mean_length": 980.5,
2416
+ "completions/mean_terminated_length": 980.5,
2417
+ "completions/min_length": 697.0,
2418
+ "completions/min_terminated_length": 697.0,
2419
+ "epoch": 0.4296875,
2420
+ "frac_reward_zero_std": 0.0,
2421
+ "grad_norm": 0.3524278700351715,
2422
+ "kl": 0.0005516579622053541,
2423
+ "learning_rate": 2.543631016093209e-06,
2424
+ "loss": 0.0,
2425
+ "num_tokens": 5771195.0,
2426
+ "reward": 0.06997495889663696,
2427
+ "reward_std": 0.660901665687561,
2428
+ "rewards/reward_function/mean": 0.06997495889663696,
2429
+ "rewards/reward_function/std": 0.6609017252922058,
2430
+ "step": 110
2431
+ },
2432
+ {
2433
+ "completion_length": 996.125,
2434
+ "completions/clipped_ratio": 0.0,
2435
+ "completions/max_length": 1295.0,
2436
+ "completions/max_terminated_length": 1295.0,
2437
+ "completions/mean_length": 996.125,
2438
+ "completions/mean_terminated_length": 996.125,
2439
+ "completions/min_length": 423.0,
2440
+ "completions/min_terminated_length": 423.0,
2441
+ "epoch": 0.43359375,
2442
+ "frac_reward_zero_std": 0.0,
2443
+ "grad_norm": 0.3624061644077301,
2444
+ "kl": 0.000616533579886891,
2445
+ "learning_rate": 2.5e-06,
2446
+ "loss": 0.0,
2447
+ "num_tokens": 5823268.0,
2448
+ "reward": -0.29218366742134094,
2449
+ "reward_std": 0.7567039728164673,
2450
+ "rewards/reward_function/mean": -0.29218366742134094,
2451
+ "rewards/reward_function/std": 0.7567040324211121,
2452
+ "step": 111
2453
+ },
2454
+ {
2455
+ "completion_length": 993.625,
2456
+ "completions/clipped_ratio": 0.0,
2457
+ "completions/max_length": 1481.0,
2458
+ "completions/max_terminated_length": 1481.0,
2459
+ "completions/mean_length": 993.625,
2460
+ "completions/mean_terminated_length": 993.625,
2461
+ "completions/min_length": 747.0,
2462
+ "completions/min_terminated_length": 747.0,
2463
+ "epoch": 0.4375,
2464
+ "frac_reward_zero_std": 0.0,
2465
+ "grad_norm": 0.3491348624229431,
2466
+ "kl": 0.0005375376276788302,
2467
+ "learning_rate": 2.4563689839067913e-06,
2468
+ "loss": 0.0,
2469
+ "num_tokens": 5880817.0,
2470
+ "reward": 0.12125939130783081,
2471
+ "reward_std": 0.5074965357780457,
2472
+ "rewards/reward_function/mean": 0.12125939130783081,
2473
+ "rewards/reward_function/std": 0.5074965953826904,
2474
+ "step": 112
2475
+ },
2476
+ {
2477
+ "completion_length": 1085.5,
2478
+ "completions/clipped_ratio": 0.0,
2479
+ "completions/max_length": 1872.0,
2480
+ "completions/max_terminated_length": 1872.0,
2481
+ "completions/mean_length": 1085.5,
2482
+ "completions/mean_terminated_length": 1085.5,
2483
+ "completions/min_length": 877.0,
2484
+ "completions/min_terminated_length": 877.0,
2485
+ "epoch": 0.44140625,
2486
+ "frac_reward_zero_std": 0.0,
2487
+ "grad_norm": 0.33136165142059326,
2488
+ "kl": 0.0005395807238528505,
2489
+ "learning_rate": 2.4127512582437486e-06,
2490
+ "loss": 0.0,
2491
+ "num_tokens": 5926349.0,
2492
+ "reward": -0.11549624800682068,
2493
+ "reward_std": 0.7324655055999756,
2494
+ "rewards/reward_function/mean": -0.11549624800682068,
2495
+ "rewards/reward_function/std": 0.7324654459953308,
2496
+ "step": 113
2497
+ },
2498
+ {
2499
+ "completion_length": 956.625,
2500
+ "completions/clipped_ratio": 0.0,
2501
+ "completions/max_length": 1136.0,
2502
+ "completions/max_terminated_length": 1136.0,
2503
+ "completions/mean_length": 956.625,
2504
+ "completions/mean_terminated_length": 956.625,
2505
+ "completions/min_length": 763.0,
2506
+ "completions/min_terminated_length": 763.0,
2507
+ "epoch": 0.4453125,
2508
+ "frac_reward_zero_std": 0.0,
2509
+ "grad_norm": 0.325693815946579,
2510
+ "kl": 0.000503135692270007,
2511
+ "learning_rate": 2.3691601093926406e-06,
2512
+ "loss": 0.0,
2513
+ "num_tokens": 5966210.0,
2514
+ "reward": -0.006239533424377441,
2515
+ "reward_std": 0.6517221927642822,
2516
+ "rewards/reward_function/mean": -0.006239533424377441,
2517
+ "rewards/reward_function/std": 0.651722252368927,
2518
+ "step": 114
2519
+ },
2520
+ {
2521
+ "completion_length": 1024.625,
2522
+ "completions/clipped_ratio": 0.0,
2523
+ "completions/max_length": 1456.0,
2524
+ "completions/max_terminated_length": 1456.0,
2525
+ "completions/mean_length": 1024.625,
2526
+ "completions/mean_terminated_length": 1024.625,
2527
+ "completions/min_length": 864.0,
2528
+ "completions/min_terminated_length": 864.0,
2529
+ "epoch": 0.44921875,
2530
+ "frac_reward_zero_std": 0.0,
2531
+ "grad_norm": 0.3105911612510681,
2532
+ "kl": 0.0004542986207525246,
2533
+ "learning_rate": 2.325608815639687e-06,
2534
+ "loss": 0.0,
2535
+ "num_tokens": 6011255.0,
2536
+ "reward": 0.0608128160238266,
2537
+ "reward_std": 0.6547765731811523,
2538
+ "rewards/reward_function/mean": 0.0608128160238266,
2539
+ "rewards/reward_function/std": 0.6547765731811523,
2540
+ "step": 115
2541
+ },
2542
+ {
2543
+ "completion_length": 1240.75,
2544
+ "completions/clipped_ratio": 0.0,
2545
+ "completions/max_length": 1532.0,
2546
+ "completions/max_terminated_length": 1532.0,
2547
+ "completions/mean_length": 1240.75,
2548
+ "completions/mean_terminated_length": 1240.75,
2549
+ "completions/min_length": 942.0,
2550
+ "completions/min_terminated_length": 942.0,
2551
+ "epoch": 0.453125,
2552
+ "frac_reward_zero_std": 0.0,
2553
+ "grad_norm": 0.26683756709098816,
2554
+ "kl": 0.0005024799829698168,
2555
+ "learning_rate": 2.2821106431308546e-06,
2556
+ "loss": 0.0,
2557
+ "num_tokens": 6053389.0,
2558
+ "reward": 0.2592804431915283,
2559
+ "reward_std": 0.50887131690979,
2560
+ "rewards/reward_function/mean": 0.2592804431915283,
2561
+ "rewards/reward_function/std": 0.5088713765144348,
2562
+ "step": 116
2563
+ },
2564
+ {
2565
+ "completion_length": 1007.875,
2566
+ "completions/clipped_ratio": 0.0,
2567
+ "completions/max_length": 1154.0,
2568
+ "completions/max_terminated_length": 1154.0,
2569
+ "completions/mean_length": 1007.875,
2570
+ "completions/mean_terminated_length": 1007.875,
2571
+ "completions/min_length": 794.0,
2572
+ "completions/min_terminated_length": 794.0,
2573
+ "epoch": 0.45703125,
2574
+ "frac_reward_zero_std": 0.0,
2575
+ "grad_norm": 0.3248363137245178,
2576
+ "kl": 0.0005079211114207283,
2577
+ "learning_rate": 2.238678841830867e-06,
2578
+ "loss": 0.0,
2579
+ "num_tokens": 6098300.0,
2580
+ "reward": -0.29335445165634155,
2581
+ "reward_std": 0.7554491758346558,
2582
+ "rewards/reward_function/mean": -0.29335445165634155,
2583
+ "rewards/reward_function/std": 0.7554491758346558,
2584
+ "step": 117
2585
+ },
2586
+ {
2587
+ "completion_length": 1096.25,
2588
+ "completions/clipped_ratio": 0.0,
2589
+ "completions/max_length": 1226.0,
2590
+ "completions/max_terminated_length": 1226.0,
2591
+ "completions/mean_length": 1096.25,
2592
+ "completions/mean_terminated_length": 1096.25,
2593
+ "completions/min_length": 902.0,
2594
+ "completions/min_terminated_length": 902.0,
2595
+ "epoch": 0.4609375,
2596
+ "frac_reward_zero_std": 0.0,
2597
+ "grad_norm": 0.32839903235435486,
2598
+ "kl": 0.0005834192561451346,
2599
+ "learning_rate": 2.195326641487132e-06,
2600
+ "loss": 0.0,
2601
+ "num_tokens": 6139278.0,
2602
+ "reward": -0.10875681042671204,
2603
+ "reward_std": 0.7381133437156677,
2604
+ "rewards/reward_function/mean": -0.10875681042671204,
2605
+ "rewards/reward_function/std": 0.7381134033203125,
2606
+ "step": 118
2607
+ },
2608
+ {
2609
+ "completion_length": 1068.5,
2610
+ "completions/clipped_ratio": 0.0,
2611
+ "completions/max_length": 1261.0,
2612
+ "completions/max_terminated_length": 1261.0,
2613
+ "completions/mean_length": 1068.5,
2614
+ "completions/mean_terminated_length": 1068.5,
2615
+ "completions/min_length": 838.0,
2616
+ "completions/min_terminated_length": 838.0,
2617
+ "epoch": 0.46484375,
2618
+ "frac_reward_zero_std": 0.0,
2619
+ "grad_norm": 0.3054167628288269,
2620
+ "kl": 0.0005187875867704861,
2621
+ "learning_rate": 2.1520672475998374e-06,
2622
+ "loss": 0.0,
2623
+ "num_tokens": 6184674.0,
2624
+ "reward": -0.356624573469162,
2625
+ "reward_std": 0.7082850933074951,
2626
+ "rewards/reward_function/mean": -0.356624573469162,
2627
+ "rewards/reward_function/std": 0.7082850933074951,
2628
+ "step": 119
2629
+ },
2630
+ {
2631
+ "completion_length": 1191.875,
2632
+ "completions/clipped_ratio": 0.0,
2633
+ "completions/max_length": 1684.0,
2634
+ "completions/max_terminated_length": 1684.0,
2635
+ "completions/mean_length": 1191.875,
2636
+ "completions/mean_terminated_length": 1191.875,
2637
+ "completions/min_length": 929.0,
2638
+ "completions/min_terminated_length": 929.0,
2639
+ "epoch": 0.46875,
2640
+ "frac_reward_zero_std": 0.0,
2641
+ "grad_norm": 0.2923835217952728,
2642
+ "kl": 0.0005189872026676312,
2643
+ "learning_rate": 2.1089138373994226e-06,
2644
+ "loss": 0.0,
2645
+ "num_tokens": 6226417.0,
2646
+ "reward": -0.10591146349906921,
2647
+ "reward_std": 0.7404077053070068,
2648
+ "rewards/reward_function/mean": -0.10591146349906921,
2649
+ "rewards/reward_function/std": 0.7404076457023621,
2650
+ "step": 120
2651
+ },
2652
+ {
2653
+ "completion_length": 987.125,
2654
+ "completions/clipped_ratio": 0.0,
2655
+ "completions/max_length": 1249.0,
2656
+ "completions/max_terminated_length": 1249.0,
2657
+ "completions/mean_length": 987.125,
2658
+ "completions/mean_terminated_length": 987.125,
2659
+ "completions/min_length": 757.0,
2660
+ "completions/min_terminated_length": 757.0,
2661
+ "epoch": 0.47265625,
2662
+ "frac_reward_zero_std": 0.0,
2663
+ "grad_norm": 0.31898999214172363,
2664
+ "kl": 0.0005351051659090444,
2665
+ "learning_rate": 2.0658795558326745e-06,
2666
+ "loss": 0.0,
2667
+ "num_tokens": 6273762.0,
2668
+ "reward": -0.007490452378988266,
2669
+ "reward_std": 0.645849347114563,
2670
+ "rewards/reward_function/mean": -0.007490452378988266,
2671
+ "rewards/reward_function/std": 0.645849347114563,
2672
+ "step": 121
2673
+ },
2674
+ {
2675
+ "completion_length": 1239.125,
2676
+ "completions/clipped_ratio": 0.0,
2677
+ "completions/max_length": 2190.0,
2678
+ "completions/max_terminated_length": 2190.0,
2679
+ "completions/mean_length": 1239.125,
2680
+ "completions/mean_terminated_length": 1239.125,
2681
+ "completions/min_length": 834.0,
2682
+ "completions/min_terminated_length": 834.0,
2683
+ "epoch": 0.4765625,
2684
+ "frac_reward_zero_std": 0.0,
2685
+ "grad_norm": 0.3210497498512268,
2686
+ "kl": 0.0005187717179069296,
2687
+ "learning_rate": 2.022977511558638e-06,
2688
+ "loss": 0.0,
2689
+ "num_tokens": 6332283.0,
2690
+ "reward": 0.05918067693710327,
2691
+ "reward_std": 0.6538223028182983,
2692
+ "rewards/reward_function/mean": 0.05918067693710327,
2693
+ "rewards/reward_function/std": 0.6538223624229431,
2694
+ "step": 122
2695
+ },
2696
+ {
2697
+ "completion_length": 1006.125,
2698
+ "completions/clipped_ratio": 0.0,
2699
+ "completions/max_length": 1189.0,
2700
+ "completions/max_terminated_length": 1189.0,
2701
+ "completions/mean_length": 1006.125,
2702
+ "completions/mean_terminated_length": 1006.125,
2703
+ "completions/min_length": 898.0,
2704
+ "completions/min_terminated_length": 898.0,
2705
+ "epoch": 0.48046875,
2706
+ "frac_reward_zero_std": 0.0,
2707
+ "grad_norm": 0.3229573667049408,
2708
+ "kl": 0.0005403026298154145,
2709
+ "learning_rate": 1.9802207729556023e-06,
2710
+ "loss": 0.0,
2711
+ "num_tokens": 6379780.0,
2712
+ "reward": 0.17201174795627594,
2713
+ "reward_std": 0.5175948143005371,
2714
+ "rewards/reward_function/mean": 0.17201174795627594,
2715
+ "rewards/reward_function/std": 0.5175948739051819,
2716
+ "step": 123
2717
+ },
2718
+ {
2719
+ "completion_length": 971.0,
2720
+ "completions/clipped_ratio": 0.0,
2721
+ "completions/max_length": 1154.0,
2722
+ "completions/max_terminated_length": 1154.0,
2723
+ "completions/mean_length": 971.0,
2724
+ "completions/mean_terminated_length": 971.0,
2725
+ "completions/min_length": 845.0,
2726
+ "completions/min_terminated_length": 845.0,
2727
+ "epoch": 0.484375,
2728
+ "frac_reward_zero_std": 0.0,
2729
+ "grad_norm": 0.309548944234848,
2730
+ "kl": 0.00047888144035823643,
2731
+ "learning_rate": 1.937622364140338e-06,
2732
+ "loss": 0.0,
2733
+ "num_tokens": 6436156.0,
2734
+ "reward": -0.12267381697893143,
2735
+ "reward_std": 0.7265216708183289,
2736
+ "rewards/reward_function/mean": -0.12267381697893143,
2737
+ "rewards/reward_function/std": 0.7265217304229736,
2738
+ "step": 124
2739
+ },
2740
+ {
2741
+ "completion_length": 917.125,
2742
+ "completions/clipped_ratio": 0.0,
2743
+ "completions/max_length": 1127.0,
2744
+ "completions/max_terminated_length": 1127.0,
2745
+ "completions/mean_length": 917.125,
2746
+ "completions/mean_terminated_length": 917.125,
2747
+ "completions/min_length": 698.0,
2748
+ "completions/min_terminated_length": 698.0,
2749
+ "epoch": 0.48828125,
2750
+ "frac_reward_zero_std": 0.0,
2751
+ "grad_norm": 0.3436194658279419,
2752
+ "kl": 0.0005273657734505832,
2753
+ "learning_rate": 1.895195261000831e-06,
2754
+ "loss": 0.0,
2755
+ "num_tokens": 6482941.0,
2756
+ "reward": 0.06335102021694183,
2757
+ "reward_std": 0.6563363075256348,
2758
+ "rewards/reward_function/mean": 0.06335102021694183,
2759
+ "rewards/reward_function/std": 0.6563363075256348,
2760
+ "step": 125
2761
+ },
2762
+ {
2763
+ "completion_length": 981.25,
2764
+ "completions/clipped_ratio": 0.0,
2765
+ "completions/max_length": 1088.0,
2766
+ "completions/max_terminated_length": 1088.0,
2767
+ "completions/mean_length": 981.25,
2768
+ "completions/mean_terminated_length": 981.25,
2769
+ "completions/min_length": 882.0,
2770
+ "completions/min_terminated_length": 882.0,
2771
+ "epoch": 0.4921875,
2772
+ "frac_reward_zero_std": 0.0,
2773
+ "grad_norm": 0.3238428235054016,
2774
+ "kl": 0.00048635365965310484,
2775
+ "learning_rate": 1.852952387243698e-06,
2776
+ "loss": 0.0,
2777
+ "num_tokens": 6539399.0,
2778
+ "reward": 0.4136182963848114,
2779
+ "reward_std": 0.0051859593950212,
2780
+ "rewards/reward_function/mean": 0.4136182963848114,
2781
+ "rewards/reward_function/std": 0.005185958929359913,
2782
+ "step": 126
2783
+ },
2784
+ {
2785
+ "completion_length": 980.25,
2786
+ "completions/clipped_ratio": 0.0,
2787
+ "completions/max_length": 1164.0,
2788
+ "completions/max_terminated_length": 1164.0,
2789
+ "completions/mean_length": 980.25,
2790
+ "completions/mean_terminated_length": 980.25,
2791
+ "completions/min_length": 865.0,
2792
+ "completions/min_terminated_length": 865.0,
2793
+ "epoch": 0.49609375,
2794
+ "frac_reward_zero_std": 0.0,
2795
+ "grad_norm": 0.32173460721969604,
2796
+ "kl": 0.0005382613235269673,
2797
+ "learning_rate": 1.8109066104575023e-06,
2798
+ "loss": -0.0,
2799
+ "num_tokens": 6586689.0,
2800
+ "reward": 0.4277029037475586,
2801
+ "reward_std": 0.009604846127331257,
2802
+ "rewards/reward_function/mean": 0.4277029037475586,
2803
+ "rewards/reward_function/std": 0.00960485078394413,
2804
+ "step": 127
2805
+ },
2806
+ {
2807
+ "completion_length": 1130.5,
2808
+ "completions/clipped_ratio": 0.0,
2809
+ "completions/max_length": 1473.0,
2810
+ "completions/max_terminated_length": 1473.0,
2811
+ "completions/mean_length": 1130.5,
2812
+ "completions/mean_terminated_length": 1130.5,
2813
+ "completions/min_length": 904.0,
2814
+ "completions/min_terminated_length": 904.0,
2815
+ "epoch": 0.5,
2816
+ "frac_reward_zero_std": 0.0,
2817
+ "grad_norm": 0.27207547426223755,
2818
+ "kl": 0.0005242059778538533,
2819
+ "learning_rate": 1.7690707381931585e-06,
2820
+ "loss": 0.0,
2821
+ "num_tokens": 6644341.0,
2822
+ "reward": -0.4683274030685425,
2823
+ "reward_std": 0.7338022589683533,
2824
+ "rewards/reward_function/mean": -0.4683274030685425,
2825
+ "rewards/reward_function/std": 0.7338022589683533,
2826
+ "step": 128
2827
+ },
2828
+ {
2829
+ "completion_length": 994.5,
2830
+ "completions/clipped_ratio": 0.0,
2831
+ "completions/max_length": 1057.0,
2832
+ "completions/max_terminated_length": 1057.0,
2833
+ "completions/mean_length": 994.5,
2834
+ "completions/mean_terminated_length": 994.5,
2835
+ "completions/min_length": 866.0,
2836
+ "completions/min_terminated_length": 866.0,
2837
+ "epoch": 0.50390625,
2838
+ "frac_reward_zero_std": 0.0,
2839
+ "grad_norm": 0.30755338072776794,
2840
+ "kl": 0.0005628820799756795,
2841
+ "learning_rate": 1.7274575140626318e-06,
2842
+ "loss": 0.0,
2843
+ "num_tokens": 6698833.0,
2844
+ "reward": -0.016938716173171997,
2845
+ "reward_std": 0.645233154296875,
2846
+ "rewards/reward_function/mean": -0.016938716173171997,
2847
+ "rewards/reward_function/std": 0.6452332139015198,
2848
+ "step": 129
2849
+ },
2850
+ {
2851
+ "completion_length": 1191.5,
2852
+ "completions/clipped_ratio": 0.0,
2853
+ "completions/max_length": 1623.0,
2854
+ "completions/max_terminated_length": 1623.0,
2855
+ "completions/mean_length": 1191.5,
2856
+ "completions/mean_terminated_length": 1191.5,
2857
+ "completions/min_length": 890.0,
2858
+ "completions/min_terminated_length": 890.0,
2859
+ "epoch": 0.5078125,
2860
+ "frac_reward_zero_std": 0.0,
2861
+ "grad_norm": 0.2911682426929474,
2862
+ "kl": 0.0005079254988231696,
2863
+ "learning_rate": 1.686079613857109e-06,
2864
+ "loss": 0.0,
2865
+ "num_tokens": 6755397.0,
2866
+ "reward": -0.10992002487182617,
2867
+ "reward_std": 0.7371261119842529,
2868
+ "rewards/reward_function/mean": -0.10992002487182617,
2869
+ "rewards/reward_function/std": 0.7371261119842529,
2870
+ "step": 130
2871
+ },
2872
+ {
2873
+ "completion_length": 929.375,
2874
+ "completions/clipped_ratio": 0.0,
2875
+ "completions/max_length": 1217.0,
2876
+ "completions/max_terminated_length": 1217.0,
2877
+ "completions/mean_length": 929.375,
2878
+ "completions/mean_terminated_length": 929.375,
2879
+ "completions/min_length": 202.0,
2880
+ "completions/min_terminated_length": 202.0,
2881
+ "epoch": 0.51171875,
2882
+ "frac_reward_zero_std": 0.0,
2883
+ "grad_norm": 1.3324370384216309,
2884
+ "kl": 0.0007534601754741743,
2885
+ "learning_rate": 1.6449496416858285e-06,
2886
+ "loss": 0.0,
2887
+ "num_tokens": 6809368.0,
2888
+ "reward": 0.1685469150543213,
2889
+ "reward_std": 0.5241107940673828,
2890
+ "rewards/reward_function/mean": 0.1685469150543213,
2891
+ "rewards/reward_function/std": 0.5241108536720276,
2892
+ "step": 131
2893
+ },
2894
+ {
2895
+ "completion_length": 1025.625,
2896
+ "completions/clipped_ratio": 0.0,
2897
+ "completions/max_length": 1330.0,
2898
+ "completions/max_terminated_length": 1330.0,
2899
+ "completions/mean_length": 1025.625,
2900
+ "completions/mean_terminated_length": 1025.625,
2901
+ "completions/min_length": 854.0,
2902
+ "completions/min_terminated_length": 854.0,
2903
+ "epoch": 0.515625,
2904
+ "frac_reward_zero_std": 0.0,
2905
+ "grad_norm": 0.34628283977508545,
2906
+ "kl": 0.0005288709871820174,
2907
+ "learning_rate": 1.6040801261367494e-06,
2908
+ "loss": 0.0,
2909
+ "num_tokens": 6864605.0,
2910
+ "reward": -0.10574899613857269,
2911
+ "reward_std": 0.7405577301979065,
2912
+ "rewards/reward_function/mean": -0.10574899613857269,
2913
+ "rewards/reward_function/std": 0.7405577898025513,
2914
+ "step": 132
2915
+ },
2916
+ {
2917
+ "completion_length": 1157.75,
2918
+ "completions/clipped_ratio": 0.0,
2919
+ "completions/max_length": 1519.0,
2920
+ "completions/max_terminated_length": 1519.0,
2921
+ "completions/mean_length": 1157.75,
2922
+ "completions/mean_terminated_length": 1157.75,
2923
+ "completions/min_length": 886.0,
2924
+ "completions/min_terminated_length": 886.0,
2925
+ "epoch": 0.51953125,
2926
+ "frac_reward_zero_std": 0.0,
2927
+ "grad_norm": 0.29528382420539856,
2928
+ "kl": 0.0005665956414304674,
2929
+ "learning_rate": 1.56348351646022e-06,
2930
+ "loss": 0.0,
2931
+ "num_tokens": 6920403.0,
2932
+ "reward": -0.111003577709198,
2933
+ "reward_std": 0.7361726760864258,
2934
+ "rewards/reward_function/mean": -0.111003577709198,
2935
+ "rewards/reward_function/std": 0.7361726760864258,
2936
+ "step": 133
2937
+ },
2938
+ {
2939
+ "completion_length": 1294.25,
2940
+ "completions/clipped_ratio": 0.0,
2941
+ "completions/max_length": 2684.0,
2942
+ "completions/max_terminated_length": 2684.0,
2943
+ "completions/mean_length": 1294.25,
2944
+ "completions/mean_terminated_length": 1294.25,
2945
+ "completions/min_length": 824.0,
2946
+ "completions/min_terminated_length": 824.0,
2947
+ "epoch": 0.5234375,
2948
+ "frac_reward_zero_std": 0.0,
2949
+ "grad_norm": 0.19954568147659302,
2950
+ "kl": 0.0004810404570889659,
2951
+ "learning_rate": 1.5231721787768162e-06,
2952
+ "loss": 0.0,
2953
+ "num_tokens": 6977789.0,
2954
+ "reward": 0.2454666793346405,
2955
+ "reward_std": 0.503404974937439,
2956
+ "rewards/reward_function/mean": 0.2454666793346405,
2957
+ "rewards/reward_function/std": 0.503404974937439,
2958
+ "step": 134
2959
+ },
2960
+ {
2961
+ "completion_length": 1026.875,
2962
+ "completions/clipped_ratio": 0.0,
2963
+ "completions/max_length": 1272.0,
2964
+ "completions/max_terminated_length": 1272.0,
2965
+ "completions/mean_length": 1026.875,
2966
+ "completions/mean_terminated_length": 1026.875,
2967
+ "completions/min_length": 663.0,
2968
+ "completions/min_terminated_length": 663.0,
2969
+ "epoch": 0.52734375,
2970
+ "frac_reward_zero_std": 0.0,
2971
+ "grad_norm": 0.2833687961101532,
2972
+ "kl": 0.00051848181465175,
2973
+ "learning_rate": 1.4831583923105e-06,
2974
+ "loss": 0.0,
2975
+ "num_tokens": 7032540.0,
2976
+ "reward": 0.23018378019332886,
2977
+ "reward_std": 0.4979630708694458,
2978
+ "rewards/reward_function/mean": 0.23018378019332886,
2979
+ "rewards/reward_function/std": 0.4979631304740906,
2980
+ "step": 135
2981
+ },
2982
+ {
2983
+ "completion_length": 1143.5,
2984
+ "completions/clipped_ratio": 0.0,
2985
+ "completions/max_length": 2065.0,
2986
+ "completions/max_terminated_length": 2065.0,
2987
+ "completions/mean_length": 1143.5,
2988
+ "completions/mean_terminated_length": 1143.5,
2989
+ "completions/min_length": 831.0,
2990
+ "completions/min_terminated_length": 831.0,
2991
+ "epoch": 0.53125,
2992
+ "frac_reward_zero_std": 0.0,
2993
+ "grad_norm": 0.2916565239429474,
2994
+ "kl": 0.0005820317019242793,
2995
+ "learning_rate": 1.443454345648252e-06,
2996
+ "loss": 0.0,
2997
+ "num_tokens": 7088720.0,
2998
+ "reward": 0.0656561329960823,
2999
+ "reward_std": 0.6580207943916321,
3000
+ "rewards/reward_function/mean": 0.0656561329960823,
3001
+ "rewards/reward_function/std": 0.6580208539962769,
3002
+ "step": 136
3003
+ },
3004
+ {
3005
+ "completion_length": 1176.875,
3006
+ "completions/clipped_ratio": 0.0,
3007
+ "completions/max_length": 1339.0,
3008
+ "completions/max_terminated_length": 1339.0,
3009
+ "completions/mean_length": 1176.875,
3010
+ "completions/mean_terminated_length": 1176.875,
3011
+ "completions/min_length": 1011.0,
3012
+ "completions/min_terminated_length": 1011.0,
3013
+ "epoch": 0.53515625,
3014
+ "frac_reward_zero_std": 0.0,
3015
+ "grad_norm": 0.3021836280822754,
3016
+ "kl": 0.00046610408753622323,
3017
+ "learning_rate": 1.4040721330273063e-06,
3018
+ "loss": 0.0,
3019
+ "num_tokens": 7143431.0,
3020
+ "reward": -0.28037768602371216,
3021
+ "reward_std": 0.7693378329277039,
3022
+ "rewards/reward_function/mean": -0.28037768602371216,
3023
+ "rewards/reward_function/std": 0.7693378329277039,
3024
+ "step": 137
3025
+ },
3026
+ {
3027
+ "completion_length": 1035.125,
3028
+ "completions/clipped_ratio": 0.0,
3029
+ "completions/max_length": 1316.0,
3030
+ "completions/max_terminated_length": 1316.0,
3031
+ "completions/mean_length": 1035.125,
3032
+ "completions/mean_terminated_length": 1035.125,
3033
+ "completions/min_length": 822.0,
3034
+ "completions/min_terminated_length": 822.0,
3035
+ "epoch": 0.5390625,
3036
+ "frac_reward_zero_std": 0.0,
3037
+ "grad_norm": 0.32248759269714355,
3038
+ "kl": 0.00046888978249626234,
3039
+ "learning_rate": 1.3650237506511333e-06,
3040
+ "loss": 0.0,
3041
+ "num_tokens": 7194392.0,
3042
+ "reward": 0.250247597694397,
3043
+ "reward_std": 0.5053042769432068,
3044
+ "rewards/reward_function/mean": 0.250247597694397,
3045
+ "rewards/reward_function/std": 0.5053042769432068,
3046
+ "step": 138
3047
+ },
3048
+ {
3049
+ "completion_length": 1030.25,
3050
+ "completions/clipped_ratio": 0.0,
3051
+ "completions/max_length": 1268.0,
3052
+ "completions/max_terminated_length": 1268.0,
3053
+ "completions/mean_length": 1030.25,
3054
+ "completions/mean_terminated_length": 1030.25,
3055
+ "completions/min_length": 778.0,
3056
+ "completions/min_terminated_length": 778.0,
3057
+ "epoch": 0.54296875,
3058
+ "frac_reward_zero_std": 0.0,
3059
+ "grad_norm": 0.3456158936023712,
3060
+ "kl": 0.0005144016322446987,
3061
+ "learning_rate": 1.3263210930352737e-06,
3062
+ "loss": 0.0,
3063
+ "num_tokens": 7247930.0,
3064
+ "reward": 0.2453567385673523,
3065
+ "reward_std": 0.503298282623291,
3066
+ "rewards/reward_function/mean": 0.2453567385673523,
3067
+ "rewards/reward_function/std": 0.503298282623291,
3068
+ "step": 139
3069
+ },
3070
+ {
3071
+ "completion_length": 1074.125,
3072
+ "completions/clipped_ratio": 0.0,
3073
+ "completions/max_length": 1220.0,
3074
+ "completions/max_terminated_length": 1220.0,
3075
+ "completions/mean_length": 1074.125,
3076
+ "completions/mean_terminated_length": 1074.125,
3077
+ "completions/min_length": 887.0,
3078
+ "completions/min_terminated_length": 887.0,
3079
+ "epoch": 0.546875,
3080
+ "frac_reward_zero_std": 0.0,
3081
+ "grad_norm": 0.2869755029678345,
3082
+ "kl": 0.00043533078132895753,
3083
+ "learning_rate": 1.2879759493841577e-06,
3084
+ "loss": 0.0,
3085
+ "num_tokens": 7299203.0,
3086
+ "reward": -0.10356661677360535,
3087
+ "reward_std": 0.7423522472381592,
3088
+ "rewards/reward_function/mean": -0.10356661677360535,
3089
+ "rewards/reward_function/std": 0.7423522472381592,
3090
+ "step": 140
3091
+ },
3092
+ {
3093
+ "completion_length": 1284.0,
3094
+ "completions/clipped_ratio": 0.0,
3095
+ "completions/max_length": 1708.0,
3096
+ "completions/max_terminated_length": 1708.0,
3097
+ "completions/mean_length": 1284.0,
3098
+ "completions/mean_terminated_length": 1284.0,
3099
+ "completions/min_length": 1076.0,
3100
+ "completions/min_terminated_length": 1076.0,
3101
+ "epoch": 0.55078125,
3102
+ "frac_reward_zero_std": 0.0,
3103
+ "grad_norm": 0.2680242955684662,
3104
+ "kl": 0.000499433335789945,
3105
+ "learning_rate": 1.2500000000000007e-06,
3106
+ "loss": 0.0,
3107
+ "num_tokens": 7354771.0,
3108
+ "reward": 0.014156922698020935,
3109
+ "reward_std": 0.6485656499862671,
3110
+ "rewards/reward_function/mean": 0.014156922698020935,
3111
+ "rewards/reward_function/std": 0.6485656499862671,
3112
+ "step": 141
3113
+ },
3114
+ {
3115
+ "completion_length": 1086.625,
3116
+ "completions/clipped_ratio": 0.0,
3117
+ "completions/max_length": 1583.0,
3118
+ "completions/max_terminated_length": 1583.0,
3119
+ "completions/mean_length": 1086.625,
3120
+ "completions/mean_terminated_length": 1086.625,
3121
+ "completions/min_length": 876.0,
3122
+ "completions/min_terminated_length": 876.0,
3123
+ "epoch": 0.5546875,
3124
+ "frac_reward_zero_std": 0.0,
3125
+ "grad_norm": 0.28385043144226074,
3126
+ "kl": 0.0004665546293836087,
3127
+ "learning_rate": 1.2124048127248644e-06,
3128
+ "loss": 0.0,
3129
+ "num_tokens": 7406144.0,
3130
+ "reward": 0.07084375619888306,
3131
+ "reward_std": 0.6610893607139587,
3132
+ "rewards/reward_function/mean": 0.07084375619888306,
3133
+ "rewards/reward_function/std": 0.661089301109314,
3134
+ "step": 142
3135
+ },
3136
+ {
3137
+ "completion_length": 1445.0,
3138
+ "completions/clipped_ratio": 0.0,
3139
+ "completions/max_length": 1970.0,
3140
+ "completions/max_terminated_length": 1970.0,
3141
+ "completions/mean_length": 1445.0,
3142
+ "completions/mean_terminated_length": 1445.0,
3143
+ "completions/min_length": 1009.0,
3144
+ "completions/min_terminated_length": 1009.0,
3145
+ "epoch": 0.55859375,
3146
+ "frac_reward_zero_std": 0.0,
3147
+ "grad_norm": 0.27446597814559937,
3148
+ "kl": 0.0005598404095508158,
3149
+ "learning_rate": 1.1752018394169882e-06,
3150
+ "loss": 0.0,
3151
+ "num_tokens": 7463000.0,
3152
+ "reward": -0.09585624933242798,
3153
+ "reward_std": 0.7488118410110474,
3154
+ "rewards/reward_function/mean": -0.09585624933242798,
3155
+ "rewards/reward_function/std": 0.7488118410110474,
3156
+ "step": 143
3157
+ },
3158
+ {
3159
+ "completion_length": 1051.5,
3160
+ "completions/clipped_ratio": 0.0,
3161
+ "completions/max_length": 1423.0,
3162
+ "completions/max_terminated_length": 1423.0,
3163
+ "completions/mean_length": 1051.5,
3164
+ "completions/mean_terminated_length": 1051.5,
3165
+ "completions/min_length": 884.0,
3166
+ "completions/min_terminated_length": 884.0,
3167
+ "epoch": 0.5625,
3168
+ "frac_reward_zero_std": 0.0,
3169
+ "grad_norm": 0.28686314821243286,
3170
+ "kl": 0.000490581318445038,
3171
+ "learning_rate": 1.1384024124624324e-06,
3172
+ "loss": 0.0,
3173
+ "num_tokens": 7514092.0,
3174
+ "reward": 0.25037020444869995,
3175
+ "reward_std": 0.5052564740180969,
3176
+ "rewards/reward_function/mean": 0.25037020444869995,
3177
+ "rewards/reward_function/std": 0.5052564740180969,
3178
+ "step": 144
3179
+ },
3180
+ {
3181
+ "completion_length": 910.25,
3182
+ "completions/clipped_ratio": 0.0,
3183
+ "completions/max_length": 1172.0,
3184
+ "completions/max_terminated_length": 1172.0,
3185
+ "completions/mean_length": 910.25,
3186
+ "completions/mean_terminated_length": 910.25,
3187
+ "completions/min_length": 111.0,
3188
+ "completions/min_terminated_length": 111.0,
3189
+ "epoch": 0.56640625,
3190
+ "frac_reward_zero_std": 0.0,
3191
+ "grad_norm": 2.037623882293701,
3192
+ "kl": 0.0005943369615124539,
3193
+ "learning_rate": 1.1020177413231334e-06,
3194
+ "loss": 0.0,
3195
+ "num_tokens": 7570974.0,
3196
+ "reward": 0.25001391768455505,
3197
+ "reward_std": 0.5055700540542603,
3198
+ "rewards/reward_function/mean": 0.25001391768455505,
3199
+ "rewards/reward_function/std": 0.505570113658905,
3200
+ "step": 145
3201
+ },
3202
+ {
3203
+ "completion_length": 1040.5,
3204
+ "completions/clipped_ratio": 0.0,
3205
+ "completions/max_length": 1460.0,
3206
+ "completions/max_terminated_length": 1460.0,
3207
+ "completions/mean_length": 1040.5,
3208
+ "completions/mean_terminated_length": 1040.5,
3209
+ "completions/min_length": 847.0,
3210
+ "completions/min_terminated_length": 847.0,
3211
+ "epoch": 0.5703125,
3212
+ "frac_reward_zero_std": 0.0,
3213
+ "grad_norm": 0.4468381106853485,
3214
+ "kl": 0.0005619311414193362,
3215
+ "learning_rate": 1.0660589091223854e-06,
3216
+ "loss": 0.0,
3217
+ "num_tokens": 7620194.0,
3218
+ "reward": 0.23401594161987305,
3219
+ "reward_std": 0.49918097257614136,
3220
+ "rewards/reward_function/mean": 0.23401594161987305,
3221
+ "rewards/reward_function/std": 0.49918097257614136,
3222
+ "step": 146
3223
+ },
3224
+ {
3225
+ "completion_length": 1247.5,
3226
+ "completions/clipped_ratio": 0.0,
3227
+ "completions/max_length": 1814.0,
3228
+ "completions/max_terminated_length": 1814.0,
3229
+ "completions/mean_length": 1247.5,
3230
+ "completions/mean_terminated_length": 1247.5,
3231
+ "completions/min_length": 901.0,
3232
+ "completions/min_terminated_length": 901.0,
3233
+ "epoch": 0.57421875,
3234
+ "frac_reward_zero_std": 0.0,
3235
+ "grad_norm": 0.3102348744869232,
3236
+ "kl": 0.0004937511766911484,
3237
+ "learning_rate": 1.0305368692688175e-06,
3238
+ "loss": 0.0,
3239
+ "num_tokens": 7679774.0,
3240
+ "reward": 0.08763788640499115,
3241
+ "reward_std": 0.671554684638977,
3242
+ "rewards/reward_function/mean": 0.08763788640499115,
3243
+ "rewards/reward_function/std": 0.671554684638977,
3244
+ "step": 147
3245
+ },
3246
+ {
3247
+ "completion_length": 1088.375,
3248
+ "completions/clipped_ratio": 0.0,
3249
+ "completions/max_length": 1270.0,
3250
+ "completions/max_terminated_length": 1270.0,
3251
+ "completions/mean_length": 1088.375,
3252
+ "completions/mean_terminated_length": 1088.375,
3253
+ "completions/min_length": 955.0,
3254
+ "completions/min_terminated_length": 955.0,
3255
+ "epoch": 0.578125,
3256
+ "frac_reward_zero_std": 0.0,
3257
+ "grad_norm": 0.3440247178077698,
3258
+ "kl": 0.000519426612299867,
3259
+ "learning_rate": 9.95462442119879e-07,
3260
+ "loss": 0.0,
3261
+ "num_tokens": 7729377.0,
3262
+ "reward": 0.004164457321166992,
3263
+ "reward_std": 0.6441723108291626,
3264
+ "rewards/reward_function/mean": 0.004164457321166992,
3265
+ "rewards/reward_function/std": 0.6441722512245178,
3266
+ "step": 148
3267
+ },
3268
+ {
3269
+ "completion_length": 1232.0,
3270
+ "completions/clipped_ratio": 0.0,
3271
+ "completions/max_length": 1499.0,
3272
+ "completions/max_terminated_length": 1499.0,
3273
+ "completions/mean_length": 1232.0,
3274
+ "completions/mean_terminated_length": 1232.0,
3275
+ "completions/min_length": 906.0,
3276
+ "completions/min_terminated_length": 906.0,
3277
+ "epoch": 0.58203125,
3278
+ "frac_reward_zero_std": 0.0,
3279
+ "grad_norm": 0.2699142098426819,
3280
+ "kl": 0.0005399520887294784,
3281
+ "learning_rate": 9.608463116858544e-07,
3282
+ "loss": 0.0,
3283
+ "num_tokens": 7788833.0,
3284
+ "reward": -0.1629893034696579,
3285
+ "reward_std": 0.7170795202255249,
3286
+ "rewards/reward_function/mean": -0.1629893034696579,
3287
+ "rewards/reward_function/std": 0.7170795202255249,
3288
+ "step": 149
3289
+ },
3290
+ {
3291
+ "completion_length": 1058.25,
3292
+ "completions/clipped_ratio": 0.0,
3293
+ "completions/max_length": 1149.0,
3294
+ "completions/max_terminated_length": 1149.0,
3295
+ "completions/mean_length": 1058.25,
3296
+ "completions/mean_terminated_length": 1058.25,
3297
+ "completions/min_length": 867.0,
3298
+ "completions/min_terminated_length": 867.0,
3299
+ "epoch": 0.5859375,
3300
+ "frac_reward_zero_std": 0.0,
3301
+ "grad_norm": 0.2994002103805542,
3302
+ "kl": 0.0004978677097824402,
3303
+ "learning_rate": 9.266990223754069e-07,
3304
+ "loss": 0.0,
3305
+ "num_tokens": 7838195.0,
3306
+ "reward": -0.6465986371040344,
3307
+ "reward_std": 0.6543768048286438,
3308
+ "rewards/reward_function/mean": -0.6465986371040344,
3309
+ "rewards/reward_function/std": 0.6543768048286438,
3310
+ "step": 150
3311
+ },
3312
+ {
3313
+ "completion_length": 1092.375,
3314
+ "completions/clipped_ratio": 0.0,
3315
+ "completions/max_length": 1344.0,
3316
+ "completions/max_terminated_length": 1344.0,
3317
+ "completions/mean_length": 1092.375,
3318
+ "completions/mean_terminated_length": 1092.375,
3319
+ "completions/min_length": 787.0,
3320
+ "completions/min_terminated_length": 787.0,
3321
+ "epoch": 0.58984375,
3322
+ "frac_reward_zero_std": 0.0,
3323
+ "grad_norm": 0.30443111062049866,
3324
+ "kl": 0.000516350322868675,
3325
+ "learning_rate": 8.930309757836517e-07,
3326
+ "loss": 0.0,
3327
+ "num_tokens": 7896534.0,
3328
+ "reward": 0.008587591350078583,
3329
+ "reward_std": 0.6500032544136047,
3330
+ "rewards/reward_function/mean": 0.008587591350078583,
3331
+ "rewards/reward_function/std": 0.6500033140182495,
3332
+ "step": 151
3333
+ },
3334
+ {
3335
+ "completion_length": 969.875,
3336
+ "completions/clipped_ratio": 0.0,
3337
+ "completions/max_length": 1327.0,
3338
+ "completions/max_terminated_length": 1327.0,
3339
+ "completions/mean_length": 969.875,
3340
+ "completions/mean_terminated_length": 969.875,
3341
+ "completions/min_length": 94.0,
3342
+ "completions/min_terminated_length": 94.0,
3343
+ "epoch": 0.59375,
3344
+ "frac_reward_zero_std": 0.0,
3345
+ "grad_norm": 0.7537317276000977,
3346
+ "kl": 0.0005359335409593768,
3347
+ "learning_rate": 8.598524275237321e-07,
3348
+ "loss": 0.0,
3349
+ "num_tokens": 7945189.0,
3350
+ "reward": -0.24850904941558838,
3351
+ "reward_std": 0.6657283306121826,
3352
+ "rewards/reward_function/mean": -0.24850904941558838,
3353
+ "rewards/reward_function/std": 0.6657283306121826,
3354
+ "step": 152
3355
+ },
3356
+ {
3357
+ "completion_length": 1033.625,
3358
+ "completions/clipped_ratio": 0.0,
3359
+ "completions/max_length": 1727.0,
3360
+ "completions/max_terminated_length": 1727.0,
3361
+ "completions/mean_length": 1033.625,
3362
+ "completions/mean_terminated_length": 1033.625,
3363
+ "completions/min_length": 667.0,
3364
+ "completions/min_terminated_length": 667.0,
3365
+ "epoch": 0.59765625,
3366
+ "frac_reward_zero_std": 0.0,
3367
+ "grad_norm": 0.34412840008735657,
3368
+ "kl": 0.000547852658201009,
3369
+ "learning_rate": 8.271734841028553e-07,
3370
+ "loss": 0.0,
3371
+ "num_tokens": 7996642.0,
3372
+ "reward": 0.16972142457962036,
3373
+ "reward_std": 0.5201033353805542,
3374
+ "rewards/reward_function/mean": 0.16972142457962036,
3375
+ "rewards/reward_function/std": 0.5201033353805542,
3376
+ "step": 153
3377
+ },
3378
+ {
3379
+ "completion_length": 828.875,
3380
+ "completions/clipped_ratio": 0.0,
3381
+ "completions/max_length": 993.0,
3382
+ "completions/max_terminated_length": 993.0,
3383
+ "completions/mean_length": 828.875,
3384
+ "completions/mean_terminated_length": 828.875,
3385
+ "completions/min_length": 545.0,
3386
+ "completions/min_terminated_length": 545.0,
3387
+ "epoch": 0.6015625,
3388
+ "frac_reward_zero_std": 0.0,
3389
+ "grad_norm": 0.3883880376815796,
3390
+ "kl": 0.0005664439522661269,
3391
+ "learning_rate": 7.950040998437541e-07,
3392
+ "loss": 0.0,
3393
+ "num_tokens": 8046737.0,
3394
+ "reward": -0.36253300309181213,
3395
+ "reward_std": 0.7088689208030701,
3396
+ "rewards/reward_function/mean": -0.36253300309181213,
3397
+ "rewards/reward_function/std": 0.7088689208030701,
3398
+ "step": 154
3399
+ },
3400
+ {
3401
+ "completion_length": 1078.375,
3402
+ "completions/clipped_ratio": 0.0,
3403
+ "completions/max_length": 1602.0,
3404
+ "completions/max_terminated_length": 1602.0,
3405
+ "completions/mean_length": 1078.375,
3406
+ "completions/mean_terminated_length": 1078.375,
3407
+ "completions/min_length": 809.0,
3408
+ "completions/min_terminated_length": 809.0,
3409
+ "epoch": 0.60546875,
3410
+ "frac_reward_zero_std": 0.0,
3411
+ "grad_norm": 0.3525019586086273,
3412
+ "kl": 0.000516050225996878,
3413
+ "learning_rate": 7.633540738525066e-07,
3414
+ "loss": 0.0,
3415
+ "num_tokens": 8098548.0,
3416
+ "reward": 0.010962013155221939,
3417
+ "reward_std": 0.6493006348609924,
3418
+ "rewards/reward_function/mean": 0.010962013155221939,
3419
+ "rewards/reward_function/std": 0.6493006944656372,
3420
+ "step": 155
3421
+ },
3422
+ {
3423
+ "completion_length": 900.625,
3424
+ "completions/clipped_ratio": 0.0,
3425
+ "completions/max_length": 1039.0,
3426
+ "completions/max_terminated_length": 1039.0,
3427
+ "completions/mean_length": 900.625,
3428
+ "completions/mean_terminated_length": 900.625,
3429
+ "completions/min_length": 755.0,
3430
+ "completions/min_terminated_length": 755.0,
3431
+ "epoch": 0.609375,
3432
+ "frac_reward_zero_std": 0.0,
3433
+ "grad_norm": 0.4146842658519745,
3434
+ "kl": 0.0006649910937994719,
3435
+ "learning_rate": 7.322330470336314e-07,
3436
+ "loss": 0.0,
3437
+ "num_tokens": 8149217.0,
3438
+ "reward": 0.07593274116516113,
3439
+ "reward_std": 0.6641193628311157,
3440
+ "rewards/reward_function/mean": 0.07593274116516113,
3441
+ "rewards/reward_function/std": 0.6641193628311157,
3442
+ "step": 156
3443
+ },
3444
+ {
3445
+ "completion_length": 794.0,
3446
+ "completions/clipped_ratio": 0.0,
3447
+ "completions/max_length": 999.0,
3448
+ "completions/max_terminated_length": 999.0,
3449
+ "completions/mean_length": 794.0,
3450
+ "completions/mean_terminated_length": 794.0,
3451
+ "completions/min_length": 129.0,
3452
+ "completions/min_terminated_length": 129.0,
3453
+ "epoch": 0.61328125,
3454
+ "frac_reward_zero_std": 0.0,
3455
+ "grad_norm": 0.9788252711296082,
3456
+ "kl": 0.000908300731680356,
3457
+ "learning_rate": 7.016504991533727e-07,
3458
+ "loss": 0.0,
3459
+ "num_tokens": 8198753.0,
3460
+ "reward": -0.28955256938934326,
3461
+ "reward_std": 0.7595077753067017,
3462
+ "rewards/reward_function/mean": -0.28955256938934326,
3463
+ "rewards/reward_function/std": 0.7595077753067017,
3464
+ "step": 157
3465
+ },
3466
+ {
3467
+ "completion_length": 859.25,
3468
+ "completions/clipped_ratio": 0.0,
3469
+ "completions/max_length": 1018.0,
3470
+ "completions/max_terminated_length": 1018.0,
3471
+ "completions/mean_length": 859.25,
3472
+ "completions/mean_terminated_length": 859.25,
3473
+ "completions/min_length": 712.0,
3474
+ "completions/min_terminated_length": 712.0,
3475
+ "epoch": 0.6171875,
3476
+ "frac_reward_zero_std": 0.0,
3477
+ "grad_norm": 0.3605237305164337,
3478
+ "kl": 0.0005935588560532779,
3479
+ "learning_rate": 6.716157459520739e-07,
3480
+ "loss": 0.0,
3481
+ "num_tokens": 8249091.0,
3482
+ "reward": 0.25031226873397827,
3483
+ "reward_std": 0.5052971839904785,
3484
+ "rewards/reward_function/mean": 0.25031226873397827,
3485
+ "rewards/reward_function/std": 0.5052971839904785,
3486
+ "step": 158
3487
+ },
3488
+ {
3489
+ "completion_length": 1081.75,
3490
+ "completions/clipped_ratio": 0.0,
3491
+ "completions/max_length": 1559.0,
3492
+ "completions/max_terminated_length": 1559.0,
3493
+ "completions/mean_length": 1081.75,
3494
+ "completions/mean_terminated_length": 1081.75,
3495
+ "completions/min_length": 846.0,
3496
+ "completions/min_terminated_length": 846.0,
3497
+ "epoch": 0.62109375,
3498
+ "frac_reward_zero_std": 0.0,
3499
+ "grad_norm": 0.39315265417099,
3500
+ "kl": 0.0004596119179041125,
3501
+ "learning_rate": 6.421379363065142e-07,
3502
+ "loss": 0.0,
3503
+ "num_tokens": 8300929.0,
3504
+ "reward": 0.24795930087566376,
3505
+ "reward_std": 0.5042985677719116,
3506
+ "rewards/reward_function/mean": 0.24795930087566376,
3507
+ "rewards/reward_function/std": 0.5042985677719116,
3508
+ "step": 159
3509
+ },
3510
+ {
3511
+ "completion_length": 1107.25,
3512
+ "completions/clipped_ratio": 0.0,
3513
+ "completions/max_length": 1957.0,
3514
+ "completions/max_terminated_length": 1957.0,
3515
+ "completions/mean_length": 1107.25,
3516
+ "completions/mean_terminated_length": 1107.25,
3517
+ "completions/min_length": 792.0,
3518
+ "completions/min_terminated_length": 792.0,
3519
+ "epoch": 0.625,
3520
+ "frac_reward_zero_std": 0.0,
3521
+ "grad_norm": 0.352717787027359,
3522
+ "kl": 0.0006316443686955608,
3523
+ "learning_rate": 6.1322604944307e-07,
3524
+ "loss": 0.0,
3525
+ "num_tokens": 8353251.0,
3526
+ "reward": -0.1729724407196045,
3527
+ "reward_std": 0.7113326787948608,
3528
+ "rewards/reward_function/mean": -0.1729724407196045,
3529
+ "rewards/reward_function/std": 0.7113326787948608,
3530
+ "step": 160
3531
+ },
3532
+ {
3533
+ "completion_length": 1091.875,
3534
+ "completions/clipped_ratio": 0.0,
3535
+ "completions/max_length": 1355.0,
3536
+ "completions/max_terminated_length": 1355.0,
3537
+ "completions/mean_length": 1091.875,
3538
+ "completions/mean_terminated_length": 1091.875,
3539
+ "completions/min_length": 734.0,
3540
+ "completions/min_terminated_length": 734.0,
3541
+ "epoch": 0.62890625,
3542
+ "frac_reward_zero_std": 0.0,
3543
+ "grad_norm": 0.303595632314682,
3544
+ "kl": 0.0004378617668407969,
3545
+ "learning_rate": 5.848888922025553e-07,
3546
+ "loss": 0.0,
3547
+ "num_tokens": 8406186.0,
3548
+ "reward": -0.46176353096961975,
3549
+ "reward_std": 0.7428471446037292,
3550
+ "rewards/reward_function/mean": -0.46176353096961975,
3551
+ "rewards/reward_function/std": 0.742847204208374,
3552
+ "step": 161
3553
+ },
3554
+ {
3555
+ "completion_length": 1162.75,
3556
+ "completions/clipped_ratio": 0.0,
3557
+ "completions/max_length": 1402.0,
3558
+ "completions/max_terminated_length": 1402.0,
3559
+ "completions/mean_length": 1162.75,
3560
+ "completions/mean_terminated_length": 1162.75,
3561
+ "completions/min_length": 173.0,
3562
+ "completions/min_terminated_length": 173.0,
3563
+ "epoch": 0.6328125,
3564
+ "frac_reward_zero_std": 0.0,
3565
+ "grad_norm": 1.2087342739105225,
3566
+ "kl": 0.0008970491835498251,
3567
+ "learning_rate": 5.571350963575728e-07,
3568
+ "loss": 0.0,
3569
+ "num_tokens": 8459736.0,
3570
+ "reward": -0.175228551030159,
3571
+ "reward_std": 0.7207473516464233,
3572
+ "rewards/reward_function/mean": -0.175228551030159,
3573
+ "rewards/reward_function/std": 0.7207473516464233,
3574
+ "step": 162
3575
+ },
3576
+ {
3577
+ "completion_length": 1177.125,
3578
+ "completions/clipped_ratio": 0.0,
3579
+ "completions/max_length": 1390.0,
3580
+ "completions/max_terminated_length": 1390.0,
3581
+ "completions/mean_length": 1177.125,
3582
+ "completions/mean_terminated_length": 1177.125,
3583
+ "completions/min_length": 978.0,
3584
+ "completions/min_terminated_length": 978.0,
3585
+ "epoch": 0.63671875,
3586
+ "frac_reward_zero_std": 0.0,
3587
+ "grad_norm": 0.29434847831726074,
3588
+ "kl": 0.0004823799536097795,
3589
+ "learning_rate": 5.299731159831953e-07,
3590
+ "loss": 0.0,
3591
+ "num_tokens": 8513353.0,
3592
+ "reward": 0.08043806254863739,
3593
+ "reward_std": 0.6669427156448364,
3594
+ "rewards/reward_function/mean": 0.08043806254863739,
3595
+ "rewards/reward_function/std": 0.6669427156448364,
3596
+ "step": 163
3597
+ },
3598
+ {
3599
+ "completion_length": 1363.25,
3600
+ "completions/clipped_ratio": 0.0,
3601
+ "completions/max_length": 1791.0,
3602
+ "completions/max_terminated_length": 1791.0,
3603
+ "completions/mean_length": 1363.25,
3604
+ "completions/mean_terminated_length": 1363.25,
3605
+ "completions/min_length": 1010.0,
3606
+ "completions/min_terminated_length": 1010.0,
3607
+ "epoch": 0.640625,
3608
+ "frac_reward_zero_std": 0.0,
3609
+ "grad_norm": 0.27553999423980713,
3610
+ "kl": 0.0005040961696067825,
3611
+ "learning_rate": 5.034112248817685e-07,
3612
+ "loss": 0.0,
3613
+ "num_tokens": 8568507.0,
3614
+ "reward": 0.09454244375228882,
3615
+ "reward_std": 0.6757357120513916,
3616
+ "rewards/reward_function/mean": 0.09454244375228882,
3617
+ "rewards/reward_function/std": 0.6757358312606812,
3618
+ "step": 164
3619
+ },
3620
+ {
3621
+ "completion_length": 1147.0,
3622
+ "completions/clipped_ratio": 0.0,
3623
+ "completions/max_length": 1400.0,
3624
+ "completions/max_terminated_length": 1400.0,
3625
+ "completions/mean_length": 1147.0,
3626
+ "completions/mean_terminated_length": 1147.0,
3627
+ "completions/min_length": 667.0,
3628
+ "completions/min_terminated_length": 667.0,
3629
+ "epoch": 0.64453125,
3630
+ "frac_reward_zero_std": 0.0,
3631
+ "grad_norm": 0.31530848145484924,
3632
+ "kl": 0.0004901376814814284,
3633
+ "learning_rate": 4.774575140626317e-07,
3634
+ "loss": 0.0,
3635
+ "num_tokens": 8621883.0,
3636
+ "reward": -0.2817423939704895,
3637
+ "reward_std": 0.7679179310798645,
3638
+ "rewards/reward_function/mean": -0.2817423939704895,
3639
+ "rewards/reward_function/std": 0.7679178714752197,
3640
+ "step": 165
3641
+ },
3642
+ {
3643
+ "completion_length": 1058.0,
3644
+ "completions/clipped_ratio": 0.0,
3645
+ "completions/max_length": 1364.0,
3646
+ "completions/max_terminated_length": 1364.0,
3647
+ "completions/mean_length": 1058.0,
3648
+ "completions/mean_terminated_length": 1058.0,
3649
+ "completions/min_length": 522.0,
3650
+ "completions/min_terminated_length": 522.0,
3651
+ "epoch": 0.6484375,
3652
+ "frac_reward_zero_std": 0.0,
3653
+ "grad_norm": 0.35542306303977966,
3654
+ "kl": 0.000599144957959652,
3655
+ "learning_rate": 4.5211988927752026e-07,
3656
+ "loss": 0.0,
3657
+ "num_tokens": 8674595.0,
3658
+ "reward": 0.014520317316055298,
3659
+ "reward_std": 0.6543052196502686,
3660
+ "rewards/reward_function/mean": 0.014520317316055298,
3661
+ "rewards/reward_function/std": 0.6543052196502686,
3662
+ "step": 166
3663
+ },
3664
+ {
3665
+ "completion_length": 1177.875,
3666
+ "completions/clipped_ratio": 0.0,
3667
+ "completions/max_length": 2078.0,
3668
+ "completions/max_terminated_length": 2078.0,
3669
+ "completions/mean_length": 1177.875,
3670
+ "completions/mean_terminated_length": 1177.875,
3671
+ "completions/min_length": 863.0,
3672
+ "completions/min_terminated_length": 863.0,
3673
+ "epoch": 0.65234375,
3674
+ "frac_reward_zero_std": 0.0,
3675
+ "grad_norm": 0.290801465511322,
3676
+ "kl": 0.000524831673828885,
3677
+ "learning_rate": 4.27406068612396e-07,
3678
+ "loss": 0.0,
3679
+ "num_tokens": 8728218.0,
3680
+ "reward": -0.28122541308403015,
3681
+ "reward_std": 0.7684498429298401,
3682
+ "rewards/reward_function/mean": -0.28122541308403015,
3683
+ "rewards/reward_function/std": 0.7684498429298401,
3684
+ "step": 167
3685
+ },
3686
+ {
3687
+ "completion_length": 1099.375,
3688
+ "completions/clipped_ratio": 0.0,
3689
+ "completions/max_length": 1303.0,
3690
+ "completions/max_terminated_length": 1303.0,
3691
+ "completions/mean_length": 1099.375,
3692
+ "completions/mean_terminated_length": 1099.375,
3693
+ "completions/min_length": 820.0,
3694
+ "completions/min_terminated_length": 820.0,
3695
+ "epoch": 0.65625,
3696
+ "frac_reward_zero_std": 0.0,
3697
+ "grad_norm": 0.2773479223251343,
3698
+ "kl": 0.00045157810382079333,
3699
+ "learning_rate": 4.033235801364402e-07,
3700
+ "loss": 0.0,
3701
+ "num_tokens": 8781261.0,
3702
+ "reward": 0.0877399668097496,
3703
+ "reward_std": 0.671479344367981,
3704
+ "rewards/reward_function/mean": 0.0877399668097496,
3705
+ "rewards/reward_function/std": 0.671479344367981,
3706
+ "step": 168
3707
+ },
3708
+ {
3709
+ "completion_length": 1257.25,
3710
+ "completions/clipped_ratio": 0.0,
3711
+ "completions/max_length": 2235.0,
3712
+ "completions/max_terminated_length": 2235.0,
3713
+ "completions/mean_length": 1257.25,
3714
+ "completions/mean_terminated_length": 1257.25,
3715
+ "completions/min_length": 939.0,
3716
+ "completions/min_terminated_length": 939.0,
3717
+ "epoch": 0.66015625,
3718
+ "frac_reward_zero_std": 0.0,
3719
+ "grad_norm": 0.29819291830062866,
3720
+ "kl": 0.00048441492253914475,
3721
+ "learning_rate": 3.798797596089351e-07,
3722
+ "loss": 0.0,
3723
+ "num_tokens": 8838007.0,
3724
+ "reward": -0.043121978640556335,
3725
+ "reward_std": 0.630191445350647,
3726
+ "rewards/reward_function/mean": -0.043121978640556335,
3727
+ "rewards/reward_function/std": 0.630191445350647,
3728
+ "step": 169
3729
+ },
3730
+ {
3731
+ "completion_length": 1023.0,
3732
+ "completions/clipped_ratio": 0.0,
3733
+ "completions/max_length": 1202.0,
3734
+ "completions/max_terminated_length": 1202.0,
3735
+ "completions/mean_length": 1023.0,
3736
+ "completions/mean_terminated_length": 1023.0,
3737
+ "completions/min_length": 802.0,
3738
+ "completions/min_terminated_length": 802.0,
3739
+ "epoch": 0.6640625,
3740
+ "frac_reward_zero_std": 0.0,
3741
+ "grad_norm": 0.338838666677475,
3742
+ "kl": 0.000568984636629466,
3743
+ "learning_rate": 3.5708174824471947e-07,
3744
+ "loss": 0.0,
3745
+ "num_tokens": 8887223.0,
3746
+ "reward": 0.05965143442153931,
3747
+ "reward_std": 0.6547546982765198,
3748
+ "rewards/reward_function/mean": 0.05965143442153931,
3749
+ "rewards/reward_function/std": 0.654754638671875,
3750
+ "step": 170
3751
+ },
3752
+ {
3753
+ "completion_length": 1429.75,
3754
+ "completions/clipped_ratio": 0.125,
3755
+ "completions/max_length": 3500.0,
3756
+ "completions/max_terminated_length": 1337.0,
3757
+ "completions/mean_length": 1429.75,
3758
+ "completions/mean_terminated_length": 1134.0,
3759
+ "completions/min_length": 860.0,
3760
+ "completions/min_terminated_length": 860.0,
3761
+ "epoch": 0.66796875,
3762
+ "frac_reward_zero_std": 0.0,
3763
+ "grad_norm": 0.27894267439842224,
3764
+ "kl": 0.0004727757041109726,
3765
+ "learning_rate": 3.3493649053890325e-07,
3766
+ "loss": 0.0,
3767
+ "num_tokens": 8945349.0,
3768
+ "reward": -0.09473268687725067,
3769
+ "reward_std": 0.7497490644454956,
3770
+ "rewards/reward_function/mean": -0.09473268687725067,
3771
+ "rewards/reward_function/std": 0.7497490048408508,
3772
+ "step": 171
3773
+ },
3774
+ {
3775
+ "completion_length": 1153.25,
3776
+ "completions/clipped_ratio": 0.0,
3777
+ "completions/max_length": 1421.0,
3778
+ "completions/max_terminated_length": 1421.0,
3779
+ "completions/mean_length": 1153.25,
3780
+ "completions/mean_terminated_length": 1153.25,
3781
+ "completions/min_length": 959.0,
3782
+ "completions/min_terminated_length": 959.0,
3783
+ "epoch": 0.671875,
3784
+ "frac_reward_zero_std": 0.0,
3785
+ "grad_norm": 0.3112727403640747,
3786
+ "kl": 0.0005989774799672887,
3787
+ "learning_rate": 3.134507321515107e-07,
3788
+ "loss": 0.0,
3789
+ "num_tokens": 8995607.0,
3790
+ "reward": -0.17081062495708466,
3791
+ "reward_std": 0.7099243998527527,
3792
+ "rewards/reward_function/mean": -0.17081062495708466,
3793
+ "rewards/reward_function/std": 0.7099243998527527,
3794
+ "step": 172
3795
+ },
3796
+ {
3797
+ "completion_length": 1032.125,
3798
+ "completions/clipped_ratio": 0.0,
3799
+ "completions/max_length": 1514.0,
3800
+ "completions/max_terminated_length": 1514.0,
3801
+ "completions/mean_length": 1032.125,
3802
+ "completions/mean_terminated_length": 1032.125,
3803
+ "completions/min_length": 237.0,
3804
+ "completions/min_terminated_length": 237.0,
3805
+ "epoch": 0.67578125,
3806
+ "frac_reward_zero_std": 0.0,
3807
+ "grad_norm": 0.9038693308830261,
3808
+ "kl": 0.0006451614681282081,
3809
+ "learning_rate": 2.9263101785268253e-07,
3810
+ "loss": 0.0,
3811
+ "num_tokens": 9050552.0,
3812
+ "reward": 0.08472549915313721,
3813
+ "reward_std": 0.669560432434082,
3814
+ "rewards/reward_function/mean": 0.08472549915313721,
3815
+ "rewards/reward_function/std": 0.669560432434082,
3816
+ "step": 173
3817
+ },
3818
+ {
3819
+ "completion_length": 934.125,
3820
+ "completions/clipped_ratio": 0.0,
3821
+ "completions/max_length": 1022.0,
3822
+ "completions/max_terminated_length": 1022.0,
3823
+ "completions/mean_length": 934.125,
3824
+ "completions/mean_terminated_length": 934.125,
3825
+ "completions/min_length": 730.0,
3826
+ "completions/min_terminated_length": 730.0,
3827
+ "epoch": 0.6796875,
3828
+ "frac_reward_zero_std": 0.0,
3829
+ "grad_norm": 0.3780133128166199,
3830
+ "kl": 0.0005501867926795967,
3831
+ "learning_rate": 2.7248368952908055e-07,
3832
+ "loss": 0.0,
3833
+ "num_tokens": 9099057.0,
3834
+ "reward": 0.24916215240955353,
3835
+ "reward_std": 0.5048820376396179,
3836
+ "rewards/reward_function/mean": 0.24916215240955353,
3837
+ "rewards/reward_function/std": 0.5048820972442627,
3838
+ "step": 174
3839
+ },
3840
+ {
3841
+ "completion_length": 1211.375,
3842
+ "completions/clipped_ratio": 0.0,
3843
+ "completions/max_length": 1936.0,
3844
+ "completions/max_terminated_length": 1936.0,
3845
+ "completions/mean_length": 1211.375,
3846
+ "completions/mean_terminated_length": 1211.375,
3847
+ "completions/min_length": 953.0,
3848
+ "completions/min_terminated_length": 953.0,
3849
+ "epoch": 0.68359375,
3850
+ "frac_reward_zero_std": 0.0,
3851
+ "grad_norm": 0.36097532510757446,
3852
+ "kl": 0.0005826732158311643,
3853
+ "learning_rate": 2.53014884252083e-07,
3854
+ "loss": 0.0,
3855
+ "num_tokens": 9155436.0,
3856
+ "reward": 0.2617029845714569,
3857
+ "reward_std": 0.5099496245384216,
3858
+ "rewards/reward_function/mean": 0.2617029845714569,
3859
+ "rewards/reward_function/std": 0.5099496245384216,
3860
+ "step": 175
3861
+ },
3862
+ {
3863
+ "completion_length": 1103.25,
3864
+ "completions/clipped_ratio": 0.0,
3865
+ "completions/max_length": 1672.0,
3866
+ "completions/max_terminated_length": 1672.0,
3867
+ "completions/mean_length": 1103.25,
3868
+ "completions/mean_terminated_length": 1103.25,
3869
+ "completions/min_length": 870.0,
3870
+ "completions/min_terminated_length": 870.0,
3871
+ "epoch": 0.6875,
3872
+ "frac_reward_zero_std": 0.0,
3873
+ "grad_norm": 0.30660855770111084,
3874
+ "kl": 0.000493127474328503,
3875
+ "learning_rate": 2.3423053240837518e-07,
3876
+ "loss": 0.0,
3877
+ "num_tokens": 9205294.0,
3878
+ "reward": -0.10572409629821777,
3879
+ "reward_std": 0.7405680418014526,
3880
+ "rewards/reward_function/mean": -0.10572409629821777,
3881
+ "rewards/reward_function/std": 0.7405680418014526,
3882
+ "step": 176
3883
+ },
3884
+ {
3885
+ "completion_length": 964.5,
3886
+ "completions/clipped_ratio": 0.0,
3887
+ "completions/max_length": 1215.0,
3888
+ "completions/max_terminated_length": 1215.0,
3889
+ "completions/mean_length": 964.5,
3890
+ "completions/mean_terminated_length": 964.5,
3891
+ "completions/min_length": 851.0,
3892
+ "completions/min_terminated_length": 851.0,
3893
+ "epoch": 0.69140625,
3894
+ "frac_reward_zero_std": 0.0,
3895
+ "grad_norm": 0.3269523084163666,
3896
+ "kl": 0.00049879898870131,
3897
+ "learning_rate": 2.1613635589349756e-07,
3898
+ "loss": 0.0,
3899
+ "num_tokens": 9261322.0,
3900
+ "reward": -0.4611111581325531,
3901
+ "reward_std": 0.743747889995575,
3902
+ "rewards/reward_function/mean": -0.4611111581325531,
3903
+ "rewards/reward_function/std": 0.743747889995575,
3904
+ "step": 177
3905
+ },
3906
+ {
3907
+ "completion_length": 1027.625,
3908
+ "completions/clipped_ratio": 0.0,
3909
+ "completions/max_length": 1438.0,
3910
+ "completions/max_terminated_length": 1438.0,
3911
+ "completions/mean_length": 1027.625,
3912
+ "completions/mean_terminated_length": 1027.625,
3913
+ "completions/min_length": 438.0,
3914
+ "completions/min_terminated_length": 438.0,
3915
+ "epoch": 0.6953125,
3916
+ "frac_reward_zero_std": 0.0,
3917
+ "grad_norm": 0.5114771127700806,
3918
+ "kl": 0.0005179118452360854,
3919
+ "learning_rate": 1.9873786636889908e-07,
3920
+ "loss": 0.0,
3921
+ "num_tokens": 9318415.0,
3922
+ "reward": 0.2489490509033203,
3923
+ "reward_std": 0.5048139095306396,
3924
+ "rewards/reward_function/mean": 0.2489490509033203,
3925
+ "rewards/reward_function/std": 0.5048139691352844,
3926
+ "step": 178
3927
+ },
3928
+ {
3929
+ "completion_length": 1043.5,
3930
+ "completions/clipped_ratio": 0.0,
3931
+ "completions/max_length": 2555.0,
3932
+ "completions/max_terminated_length": 2555.0,
3933
+ "completions/mean_length": 1043.5,
3934
+ "completions/mean_terminated_length": 1043.5,
3935
+ "completions/min_length": 565.0,
3936
+ "completions/min_terminated_length": 565.0,
3937
+ "epoch": 0.69921875,
3938
+ "frac_reward_zero_std": 0.0,
3939
+ "grad_norm": 0.339715838432312,
3940
+ "kl": 0.0005509828915819526,
3941
+ "learning_rate": 1.8204036358303173e-07,
3942
+ "loss": 0.0,
3943
+ "num_tokens": 9375075.0,
3944
+ "reward": -0.17033275961875916,
3945
+ "reward_std": 0.7077411413192749,
3946
+ "rewards/reward_function/mean": -0.17033275961875916,
3947
+ "rewards/reward_function/std": 0.7077411413192749,
3948
+ "step": 179
3949
+ },
3950
+ {
3951
+ "completion_length": 1009.5,
3952
+ "completions/clipped_ratio": 0.0,
3953
+ "completions/max_length": 1202.0,
3954
+ "completions/max_terminated_length": 1202.0,
3955
+ "completions/mean_length": 1009.5,
3956
+ "completions/mean_terminated_length": 1009.5,
3957
+ "completions/min_length": 620.0,
3958
+ "completions/min_terminated_length": 620.0,
3959
+ "epoch": 0.703125,
3960
+ "frac_reward_zero_std": 0.0,
3961
+ "grad_norm": 0.38060900568962097,
3962
+ "kl": 0.0005021176039008424,
3963
+ "learning_rate": 1.6604893375699594e-07,
3964
+ "loss": 0.0,
3965
+ "num_tokens": 9432023.0,
3966
+ "reward": 0.24431279301643372,
3967
+ "reward_std": 0.5027908682823181,
3968
+ "rewards/reward_function/mean": 0.24431279301643372,
3969
+ "rewards/reward_function/std": 0.5027908682823181,
3970
+ "step": 180
3971
+ },
3972
+ {
3973
+ "completion_length": 921.375,
3974
+ "completions/clipped_ratio": 0.0,
3975
+ "completions/max_length": 1216.0,
3976
+ "completions/max_terminated_length": 1216.0,
3977
+ "completions/mean_length": 921.375,
3978
+ "completions/mean_terminated_length": 921.375,
3979
+ "completions/min_length": 742.0,
3980
+ "completions/min_terminated_length": 742.0,
3981
+ "epoch": 0.70703125,
3982
+ "frac_reward_zero_std": 0.0,
3983
+ "grad_norm": 0.405112624168396,
3984
+ "kl": 0.0005638250586343929,
3985
+ "learning_rate": 1.507684480352292e-07,
3986
+ "loss": 0.0,
3987
+ "num_tokens": 9487706.0,
3988
+ "reward": 0.2474900782108307,
3989
+ "reward_std": 0.5042015910148621,
3990
+ "rewards/reward_function/mean": 0.2474900782108307,
3991
+ "rewards/reward_function/std": 0.5042015910148621,
3992
+ "step": 181
3993
+ },
3994
+ {
3995
+ "completion_length": 1200.25,
3996
+ "completions/clipped_ratio": 0.0,
3997
+ "completions/max_length": 2126.0,
3998
+ "completions/max_terminated_length": 2126.0,
3999
+ "completions/mean_length": 1200.25,
4000
+ "completions/mean_terminated_length": 1200.25,
4001
+ "completions/min_length": 815.0,
4002
+ "completions/min_terminated_length": 815.0,
4003
+ "epoch": 0.7109375,
4004
+ "frac_reward_zero_std": 0.0,
4005
+ "grad_norm": 0.3271883726119995,
4006
+ "kl": 0.0006362074636854231,
4007
+ "learning_rate": 1.362035610017079e-07,
4008
+ "loss": 0.0,
4009
+ "num_tokens": 9546180.0,
4010
+ "reward": 0.05625756084918976,
4011
+ "reward_std": 0.6525231599807739,
4012
+ "rewards/reward_function/mean": 0.05625756084918976,
4013
+ "rewards/reward_function/std": 0.6525231599807739,
4014
+ "step": 182
4015
+ },
4016
+ {
4017
+ "completion_length": 926.0,
4018
+ "completions/clipped_ratio": 0.0,
4019
+ "completions/max_length": 1297.0,
4020
+ "completions/max_terminated_length": 1297.0,
4021
+ "completions/mean_length": 926.0,
4022
+ "completions/mean_terminated_length": 926.0,
4023
+ "completions/min_length": 737.0,
4024
+ "completions/min_terminated_length": 737.0,
4025
+ "epoch": 0.71484375,
4026
+ "frac_reward_zero_std": 0.0,
4027
+ "grad_norm": 0.34687018394470215,
4028
+ "kl": 0.0005171666198293678,
4029
+ "learning_rate": 1.223587092621162e-07,
4030
+ "loss": 0.0,
4031
+ "num_tokens": 9601900.0,
4032
+ "reward": 0.0704411044716835,
4033
+ "reward_std": 0.6608877182006836,
4034
+ "rewards/reward_function/mean": 0.0704411044716835,
4035
+ "rewards/reward_function/std": 0.6608877182006836,
4036
+ "step": 183
4037
+ },
4038
+ {
4039
+ "completion_length": 1005.125,
4040
+ "completions/clipped_ratio": 0.0,
4041
+ "completions/max_length": 1611.0,
4042
+ "completions/max_terminated_length": 1611.0,
4043
+ "completions/mean_length": 1005.125,
4044
+ "completions/mean_terminated_length": 1005.125,
4045
+ "completions/min_length": 768.0,
4046
+ "completions/min_terminated_length": 768.0,
4047
+ "epoch": 0.71875,
4048
+ "frac_reward_zero_std": 0.0,
4049
+ "grad_norm": 0.35485363006591797,
4050
+ "kl": 0.0004917978003504686,
4051
+ "learning_rate": 1.0923811009241142e-07,
4052
+ "loss": 0.0,
4053
+ "num_tokens": 9658813.0,
4054
+ "reward": 0.2492428421974182,
4055
+ "reward_std": 0.5048900842666626,
4056
+ "rewards/reward_function/mean": 0.2492428421974182,
4057
+ "rewards/reward_function/std": 0.5048900842666626,
4058
+ "step": 184
4059
+ },
4060
+ {
4061
+ "completion_length": 990.625,
4062
+ "completions/clipped_ratio": 0.0,
4063
+ "completions/max_length": 1283.0,
4064
+ "completions/max_terminated_length": 1283.0,
4065
+ "completions/mean_length": 990.625,
4066
+ "completions/mean_terminated_length": 990.625,
4067
+ "completions/min_length": 751.0,
4068
+ "completions/min_terminated_length": 751.0,
4069
+ "epoch": 0.72265625,
4070
+ "frac_reward_zero_std": 0.0,
4071
+ "grad_norm": 0.36808741092681885,
4072
+ "kl": 0.0005794434182462282,
4073
+ "learning_rate": 9.684576015420277e-08,
4074
+ "loss": 0.0,
4075
+ "num_tokens": 9708434.0,
4076
+ "reward": -0.28416281938552856,
4077
+ "reward_std": 0.7652737498283386,
4078
+ "rewards/reward_function/mean": -0.28416281938552856,
4079
+ "rewards/reward_function/std": 0.7652737498283386,
4080
+ "step": 185
4081
+ },
4082
+ {
4083
+ "completion_length": 1321.0,
4084
+ "completions/clipped_ratio": 0.0,
4085
+ "completions/max_length": 2448.0,
4086
+ "completions/max_terminated_length": 2448.0,
4087
+ "completions/mean_length": 1321.0,
4088
+ "completions/mean_terminated_length": 1321.0,
4089
+ "completions/min_length": 747.0,
4090
+ "completions/min_terminated_length": 747.0,
4091
+ "epoch": 0.7265625,
4092
+ "frac_reward_zero_std": 0.0,
4093
+ "grad_norm": 0.34419044852256775,
4094
+ "kl": 0.000558189676667098,
4095
+ "learning_rate": 8.518543427732951e-08,
4096
+ "loss": 0.0,
4097
+ "num_tokens": 9762722.0,
4098
+ "reward": -0.2857913076877594,
4099
+ "reward_std": 0.7635413408279419,
4100
+ "rewards/reward_function/mean": -0.2857913076877594,
4101
+ "rewards/reward_function/std": 0.7635413408279419,
4102
+ "step": 186
4103
+ },
4104
+ {
4105
+ "completion_length": 1260.375,
4106
+ "completions/clipped_ratio": 0.0,
4107
+ "completions/max_length": 1672.0,
4108
+ "completions/max_terminated_length": 1672.0,
4109
+ "completions/mean_length": 1260.375,
4110
+ "completions/mean_terminated_length": 1260.375,
4111
+ "completions/min_length": 913.0,
4112
+ "completions/min_terminated_length": 913.0,
4113
+ "epoch": 0.73046875,
4114
+ "frac_reward_zero_std": 0.0,
4115
+ "grad_norm": 0.3106219470500946,
4116
+ "kl": 0.000590090683544986,
4117
+ "learning_rate": 7.426068431000883e-08,
4118
+ "loss": 0.0,
4119
+ "num_tokens": 9814501.0,
4120
+ "reward": -0.10060018301010132,
4121
+ "reward_std": 0.7447962164878845,
4122
+ "rewards/reward_function/mean": -0.10060018301010132,
4123
+ "rewards/reward_function/std": 0.7447961568832397,
4124
+ "step": 187
4125
+ },
4126
+ {
4127
+ "completion_length": 883.25,
4128
+ "completions/clipped_ratio": 0.0,
4129
+ "completions/max_length": 1120.0,
4130
+ "completions/max_terminated_length": 1120.0,
4131
+ "completions/mean_length": 883.25,
4132
+ "completions/mean_terminated_length": 883.25,
4133
+ "completions/min_length": 576.0,
4134
+ "completions/min_terminated_length": 576.0,
4135
+ "epoch": 0.734375,
4136
+ "frac_reward_zero_std": 0.0,
4137
+ "grad_norm": 0.4308620095252991,
4138
+ "kl": 0.0005764737070421688,
4139
+ "learning_rate": 6.407483803691216e-08,
4140
+ "loss": 0.0,
4141
+ "num_tokens": 9865287.0,
4142
+ "reward": -0.11281464993953705,
4143
+ "reward_std": 0.7346830368041992,
4144
+ "rewards/reward_function/mean": -0.11281464993953705,
4145
+ "rewards/reward_function/std": 0.7346829771995544,
4146
+ "step": 188
4147
+ },
4148
+ {
4149
+ "completion_length": 987.0,
4150
+ "completions/clipped_ratio": 0.0,
4151
+ "completions/max_length": 1191.0,
4152
+ "completions/max_terminated_length": 1191.0,
4153
+ "completions/mean_length": 987.0,
4154
+ "completions/mean_terminated_length": 987.0,
4155
+ "completions/min_length": 773.0,
4156
+ "completions/min_terminated_length": 773.0,
4157
+ "epoch": 0.73828125,
4158
+ "frac_reward_zero_std": 0.0,
4159
+ "grad_norm": 0.34522199630737305,
4160
+ "kl": 0.0005745336093241349,
4161
+ "learning_rate": 5.463099816548578e-08,
4162
+ "loss": -0.0,
4163
+ "num_tokens": 9914879.0,
4164
+ "reward": 0.43888330459594727,
4165
+ "reward_std": 0.013809502124786377,
4166
+ "rewards/reward_function/mean": 0.43888330459594727,
4167
+ "rewards/reward_function/std": 0.013809490017592907,
4168
+ "step": 189
4169
+ },
4170
+ {
4171
+ "completion_length": 1093.0,
4172
+ "completions/clipped_ratio": 0.0,
4173
+ "completions/max_length": 1639.0,
4174
+ "completions/max_terminated_length": 1639.0,
4175
+ "completions/mean_length": 1093.0,
4176
+ "completions/mean_terminated_length": 1093.0,
4177
+ "completions/min_length": 577.0,
4178
+ "completions/min_terminated_length": 577.0,
4179
+ "epoch": 0.7421875,
4180
+ "frac_reward_zero_std": 0.0,
4181
+ "grad_norm": 0.33963632583618164,
4182
+ "kl": 0.0006164438163978048,
4183
+ "learning_rate": 4.593204138084006e-08,
4184
+ "loss": 0.0,
4185
+ "num_tokens": 9967343.0,
4186
+ "reward": -0.4747491180896759,
4187
+ "reward_std": 0.7252565622329712,
4188
+ "rewards/reward_function/mean": -0.4747491180896759,
4189
+ "rewards/reward_function/std": 0.725256621837616,
4190
+ "step": 190
4191
+ },
4192
+ {
4193
+ "completion_length": 974.0,
4194
+ "completions/clipped_ratio": 0.0,
4195
+ "completions/max_length": 1168.0,
4196
+ "completions/max_terminated_length": 1168.0,
4197
+ "completions/mean_length": 974.0,
4198
+ "completions/mean_terminated_length": 974.0,
4199
+ "completions/min_length": 856.0,
4200
+ "completions/min_terminated_length": 856.0,
4201
+ "epoch": 0.74609375,
4202
+ "frac_reward_zero_std": 0.0,
4203
+ "grad_norm": 0.33787915110588074,
4204
+ "kl": 0.0005020303433411755,
4205
+ "learning_rate": 3.798061746947995e-08,
4206
+ "loss": 0.0,
4207
+ "num_tokens": 10016831.0,
4208
+ "reward": 0.07625642418861389,
4209
+ "reward_std": 0.6643387675285339,
4210
+ "rewards/reward_function/mean": 0.07625642418861389,
4211
+ "rewards/reward_function/std": 0.6643387675285339,
4212
+ "step": 191
4213
+ },
4214
+ {
4215
+ "completion_length": 1223.5,
4216
+ "completions/clipped_ratio": 0.0,
4217
+ "completions/max_length": 1915.0,
4218
+ "completions/max_terminated_length": 1915.0,
4219
+ "completions/mean_length": 1223.5,
4220
+ "completions/mean_terminated_length": 1223.5,
4221
+ "completions/min_length": 377.0,
4222
+ "completions/min_terminated_length": 377.0,
4223
+ "epoch": 0.75,
4224
+ "frac_reward_zero_std": 0.0,
4225
+ "grad_norm": 0.3466210961341858,
4226
+ "kl": 0.0006273999460972846,
4227
+ "learning_rate": 3.077914851215585e-08,
4228
+ "loss": 0.0,
4229
+ "num_tokens": 10070339.0,
4230
+ "reward": -0.4622008502483368,
4231
+ "reward_std": 0.7422569990158081,
4232
+ "rewards/reward_function/mean": -0.4622008502483368,
4233
+ "rewards/reward_function/std": 0.7422570586204529,
4234
+ "step": 192
4235
+ },
4236
+ {
4237
+ "completion_length": 950.75,
4238
+ "completions/clipped_ratio": 0.0,
4239
+ "completions/max_length": 1292.0,
4240
+ "completions/max_terminated_length": 1292.0,
4241
+ "completions/mean_length": 950.75,
4242
+ "completions/mean_terminated_length": 950.75,
4243
+ "completions/min_length": 504.0,
4244
+ "completions/min_terminated_length": 504.0,
4245
+ "epoch": 0.75390625,
4246
+ "frac_reward_zero_std": 0.0,
4247
+ "grad_norm": 0.3800007402896881,
4248
+ "kl": 0.0005864633276360109,
4249
+ "learning_rate": 2.4329828146074096e-08,
4250
+ "loss": 0.0,
4251
+ "num_tokens": 10124977.0,
4252
+ "reward": -0.28742432594299316,
4253
+ "reward_std": 0.76183021068573,
4254
+ "rewards/reward_function/mean": -0.28742432594299316,
4255
+ "rewards/reward_function/std": 0.7618302702903748,
4256
+ "step": 193
4257
+ },
4258
+ {
4259
+ "completion_length": 977.0,
4260
+ "completions/clipped_ratio": 0.0,
4261
+ "completions/max_length": 1252.0,
4262
+ "completions/max_terminated_length": 1252.0,
4263
+ "completions/mean_length": 977.0,
4264
+ "completions/mean_terminated_length": 977.0,
4265
+ "completions/min_length": 396.0,
4266
+ "completions/min_terminated_length": 396.0,
4267
+ "epoch": 0.7578125,
4268
+ "frac_reward_zero_std": 0.0,
4269
+ "grad_norm": 0.550014317035675,
4270
+ "kl": 0.0005290872577461414,
4271
+ "learning_rate": 1.8634620896695044e-08,
4272
+ "loss": 0.0,
4273
+ "num_tokens": 10175113.0,
4274
+ "reward": 0.24153178930282593,
4275
+ "reward_std": 0.5017148852348328,
4276
+ "rewards/reward_function/mean": 0.24153178930282593,
4277
+ "rewards/reward_function/std": 0.5017149448394775,
4278
+ "step": 194
4279
+ },
4280
+ {
4281
+ "completion_length": 1148.125,
4282
+ "completions/clipped_ratio": 0.0,
4283
+ "completions/max_length": 1667.0,
4284
+ "completions/max_terminated_length": 1667.0,
4285
+ "completions/mean_length": 1148.125,
4286
+ "completions/mean_terminated_length": 1148.125,
4287
+ "completions/min_length": 707.0,
4288
+ "completions/min_terminated_length": 707.0,
4289
+ "epoch": 0.76171875,
4290
+ "frac_reward_zero_std": 0.0,
4291
+ "grad_norm": 0.3343985378742218,
4292
+ "kl": 0.0005916821173741482,
4293
+ "learning_rate": 1.3695261579316776e-08,
4294
+ "loss": 0.0,
4295
+ "num_tokens": 10231330.0,
4296
+ "reward": 0.09651876986026764,
4297
+ "reward_std": 0.511390209197998,
4298
+ "rewards/reward_function/mean": 0.09651876986026764,
4299
+ "rewards/reward_function/std": 0.511390209197998,
4300
+ "step": 195
4301
+ },
4302
+ {
4303
+ "completion_length": 1165.5,
4304
+ "completions/clipped_ratio": 0.0,
4305
+ "completions/max_length": 1567.0,
4306
+ "completions/max_terminated_length": 1567.0,
4307
+ "completions/mean_length": 1165.5,
4308
+ "completions/mean_terminated_length": 1165.5,
4309
+ "completions/min_length": 1004.0,
4310
+ "completions/min_terminated_length": 1004.0,
4311
+ "epoch": 0.765625,
4312
+ "frac_reward_zero_std": 0.0,
4313
+ "grad_norm": 0.33010783791542053,
4314
+ "kl": 0.0005538033583434299,
4315
+ "learning_rate": 9.513254770636138e-09,
4316
+ "loss": 0.0,
4317
+ "num_tokens": 10282974.0,
4318
+ "reward": -0.1761539876461029,
4319
+ "reward_std": 0.7018446922302246,
4320
+ "rewards/reward_function/mean": -0.1761539876461029,
4321
+ "rewards/reward_function/std": 0.7018447518348694,
4322
+ "step": 196
4323
+ },
4324
+ {
4325
+ "completion_length": 1035.0,
4326
+ "completions/clipped_ratio": 0.0,
4327
+ "completions/max_length": 1474.0,
4328
+ "completions/max_terminated_length": 1474.0,
4329
+ "completions/mean_length": 1035.0,
4330
+ "completions/mean_terminated_length": 1035.0,
4331
+ "completions/min_length": 764.0,
4332
+ "completions/min_terminated_length": 764.0,
4333
+ "epoch": 0.76953125,
4334
+ "frac_reward_zero_std": 0.0,
4335
+ "grad_norm": 0.27371305227279663,
4336
+ "kl": 0.000488734214741271,
4337
+ "learning_rate": 6.089874350439507e-09,
4338
+ "loss": -0.0,
4339
+ "num_tokens": 10338286.0,
4340
+ "reward": 0.4277012050151825,
4341
+ "reward_std": 0.013616181910037994,
4342
+ "rewards/reward_function/mean": 0.4277012050151825,
4343
+ "rewards/reward_function/std": 0.013616186566650867,
4344
+ "step": 197
4345
+ },
4346
+ {
4347
+ "completion_length": 888.75,
4348
+ "completions/clipped_ratio": 0.0,
4349
+ "completions/max_length": 1162.0,
4350
+ "completions/max_terminated_length": 1162.0,
4351
+ "completions/mean_length": 888.75,
4352
+ "completions/mean_terminated_length": 888.75,
4353
+ "completions/min_length": 248.0,
4354
+ "completions/min_terminated_length": 248.0,
4355
+ "epoch": 0.7734375,
4356
+ "frac_reward_zero_std": 0.0,
4357
+ "grad_norm": 0.8447083830833435,
4358
+ "kl": 0.0005265605868771672,
4359
+ "learning_rate": 3.4261631135654174e-09,
4360
+ "loss": 0.0,
4361
+ "num_tokens": 10387716.0,
4362
+ "reward": 0.40771353244781494,
4363
+ "reward_std": 0.02116406336426735,
4364
+ "rewards/reward_function/mean": 0.40771353244781494,
4365
+ "rewards/reward_function/std": 0.02116405963897705,
4366
+ "step": 198
4367
+ },
4368
+ {
4369
+ "completion_length": 1005.875,
4370
+ "completions/clipped_ratio": 0.0,
4371
+ "completions/max_length": 1336.0,
4372
+ "completions/max_terminated_length": 1336.0,
4373
+ "completions/mean_length": 1005.875,
4374
+ "completions/mean_terminated_length": 1005.875,
4375
+ "completions/min_length": 863.0,
4376
+ "completions/min_terminated_length": 863.0,
4377
+ "epoch": 0.77734375,
4378
+ "frac_reward_zero_std": 0.0,
4379
+ "grad_norm": 0.3126639127731323,
4380
+ "kl": 0.0005563631129916757,
4381
+ "learning_rate": 1.5229324522605949e-09,
4382
+ "loss": 0.0,
4383
+ "num_tokens": 10442795.0,
4384
+ "reward": 0.24751342833042145,
4385
+ "reward_std": 0.5041323304176331,
4386
+ "rewards/reward_function/mean": 0.24751342833042145,
4387
+ "rewards/reward_function/std": 0.5041323304176331,
4388
+ "step": 199
4389
+ },
4390
+ {
4391
+ "completion_length": 950.5,
4392
+ "completions/clipped_ratio": 0.0,
4393
+ "completions/max_length": 1109.0,
4394
+ "completions/max_terminated_length": 1109.0,
4395
+ "completions/mean_length": 950.5,
4396
+ "completions/mean_terminated_length": 950.5,
4397
+ "completions/min_length": 681.0,
4398
+ "completions/min_terminated_length": 681.0,
4399
+ "epoch": 0.78125,
4400
+ "frac_reward_zero_std": 0.0,
4401
+ "grad_norm": 0.40675655007362366,
4402
+ "kl": 0.0006997519376454875,
4403
+ "learning_rate": 3.8076210902182607e-10,
4404
+ "loss": 0.0,
4405
+ "num_tokens": 10492719.0,
4406
+ "reward": -0.18069900572299957,
4407
+ "reward_std": 0.7014786601066589,
4408
+ "rewards/reward_function/mean": -0.18069900572299957,
4409
+ "rewards/reward_function/std": 0.7014786601066589,
4410
+ "step": 200
4411
  }
4412
  ],
4413
  "logging_steps": 1,
4414
+ "max_steps": 200,
4415
+ "num_input_tokens_seen": 10492719,
4416
  "num_train_epochs": 1,
4417
  "save_steps": 100,
4418
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf0aefa91457721c3865520ad4ec5b7b8f0ea737832ba130819dd33f051c1cac
3
  size 6865
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c6b3aeb7936136f60376a59c48db0ecc3309ca9fb6a498e862f58bb6ced4a42
3
  size 6865