Đào Quốc Tuấn committed on
Upload folder using huggingface_hub
Browse files
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/config.json +39 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/generation_config.json +6 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/merges.txt +0 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/model.safetensors +3 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/special_tokens_map.json +6 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/tokenizer.json +0 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/tokenizer_config.json +21 -0
- experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/vocab.json +0 -0
- experiments/sft_gpt2-120m/20251118_153756/sft_gpt2-120m-1.log +145 -0
- experiments/sft_gpt2-120m/20251118_153756/sft_gpt2-120m-1_metrics.jsonl +0 -0
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"dtype": "float32",
|
| 9 |
+
"embd_pdrop": 0.1,
|
| 10 |
+
"eos_token_id": 50256,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"pad_token_id": 50256,
|
| 21 |
+
"reorder_and_upcast_attn": false,
|
| 22 |
+
"resid_pdrop": 0.1,
|
| 23 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 24 |
+
"scale_attn_weights": true,
|
| 25 |
+
"summary_activation": null,
|
| 26 |
+
"summary_first_dropout": 0.1,
|
| 27 |
+
"summary_proj_to_labels": true,
|
| 28 |
+
"summary_type": "cls_index",
|
| 29 |
+
"summary_use_proj": true,
|
| 30 |
+
"task_specific_params": {
|
| 31 |
+
"text-generation": {
|
| 32 |
+
"do_sample": true,
|
| 33 |
+
"max_length": 50
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
"transformers_version": "4.56.0",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.56.0"
|
| 6 |
+
}
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1feef2970e9106a8583f8204fdc67768969ef064d2925ae375962979a48d608
|
| 3 |
+
size 497774208
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
experiments/sft_gpt2-120m/20251118_153756/checkpoints/epoch_17/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
experiments/sft_gpt2-120m/20251118_153756/sft_gpt2-120m-1.log
CHANGED
|
@@ -2636,3 +2636,148 @@
|
|
| 2636 |
2025-11-18 17:41:29,767 - root - [32m[1mINFO[0m - Step 48701/57180 train rougeL: 0.34799434959939385
|
| 2637 |
2025-11-18 17:41:29,801 - root - [32m[1mINFO[0m - Step 48701/57180 loss: 0.06674124300479889, total_norm: 1.098740577697754
|
| 2638 |
2025-11-18 17:41:34,643 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2636 |
2025-11-18 17:41:29,767 - root - [32m[1mINFO[0m - Step 48701/57180 train rougeL: 0.34799434959939385
|
| 2637 |
2025-11-18 17:41:29,801 - root - [32m[1mINFO[0m - Step 48701/57180 loss: 0.06674124300479889, total_norm: 1.098740577697754
|
| 2638 |
2025-11-18 17:41:34,643 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2639 |
+
2025-11-18 17:41:36,456 - root - [32m[1mINFO[0m - Step 48801/57180 train rougeL: 0.5338936129015467
|
| 2640 |
+
2025-11-18 17:41:36,491 - root - [32m[1mINFO[0m - Step 48801/57180 loss: 0.02021036297082901, total_norm: 0.37200063467025757
|
| 2641 |
+
2025-11-18 17:41:41,251 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2642 |
+
2025-11-18 17:41:43,048 - root - [32m[1mINFO[0m - Step 48901/57180 train rougeL: 0.15424455892890404
|
| 2643 |
+
2025-11-18 17:41:43,082 - root - [32m[1mINFO[0m - Step 48901/57180 loss: 0.006466233171522617, total_norm: 0.34377801418304443
|
| 2644 |
+
2025-11-18 17:41:47,814 - root - [32m[1mINFO[0m - Step 49001/57180 finished
|
| 2645 |
+
2025-11-18 17:41:48,141 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2646 |
+
2025-11-18 17:41:52,664 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2647 |
+
2025-11-18 17:41:57,105 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2648 |
+
2025-11-18 17:42:01,587 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2649 |
+
2025-11-18 17:42:06,280 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2650 |
+
2025-11-18 17:42:10,737 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2651 |
+
2025-11-18 17:42:15,175 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2652 |
+
2025-11-18 17:42:19,558 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2653 |
+
2025-11-18 17:42:23,074 - root - [32m[1mINFO[0m - Epoch 18/20 eval loss: 6.251601159572601, eval rougeL: 0.11391684241703591
|
| 2654 |
+
2025-11-18 17:42:23,090 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2655 |
+
2025-11-18 17:42:24,873 - root - [32m[1mINFO[0m - Step 49001/57180 train rougeL: 0.12367111898596157
|
| 2656 |
+
2025-11-18 17:42:24,908 - root - [32m[1mINFO[0m - Step 49001/57180 loss: 0.0027590582612901926, total_norm: 0.3064168393611908
|
| 2657 |
+
2025-11-18 17:42:29,647 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2658 |
+
2025-11-18 17:42:31,439 - root - [32m[1mINFO[0m - Step 49101/57180 train rougeL: 0.2591626667587509
|
| 2659 |
+
2025-11-18 17:42:31,472 - root - [32m[1mINFO[0m - Step 49101/57180 loss: 0.0029959736857563257, total_norm: 0.2954358160495758
|
| 2660 |
+
2025-11-18 17:42:36,211 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2661 |
+
2025-11-18 17:42:38,004 - root - [32m[1mINFO[0m - Step 49201/57180 train rougeL: 0.3989785032489062
|
| 2662 |
+
2025-11-18 17:42:38,038 - root - [32m[1mINFO[0m - Step 49201/57180 loss: 0.00800960324704647, total_norm: 0.20073983073234558
|
| 2663 |
+
2025-11-18 17:42:42,794 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2664 |
+
2025-11-18 17:42:44,583 - root - [32m[1mINFO[0m - Step 49301/57180 train rougeL: 0.32575272568659785
|
| 2665 |
+
2025-11-18 17:42:44,617 - root - [32m[1mINFO[0m - Step 49301/57180 loss: 0.008075157180428505, total_norm: 0.25261831283569336
|
| 2666 |
+
2025-11-18 17:42:49,369 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2667 |
+
2025-11-18 17:42:51,152 - root - [32m[1mINFO[0m - Step 49401/57180 train rougeL: 0.38293353541588165
|
| 2668 |
+
2025-11-18 17:42:51,186 - root - [32m[1mINFO[0m - Step 49401/57180 loss: 0.006646967958658934, total_norm: 0.25455766916275024
|
| 2669 |
+
2025-11-18 17:42:55,918 - root - [32m[1mINFO[0m - Step 49501/57180 finished
|
| 2670 |
+
2025-11-18 17:42:56,246 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2671 |
+
2025-11-18 17:43:00,766 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2672 |
+
2025-11-18 17:43:05,218 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2673 |
+
2025-11-18 17:43:09,687 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2674 |
+
2025-11-18 17:43:14,206 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2675 |
+
2025-11-18 17:43:18,670 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2676 |
+
2025-11-18 17:43:23,123 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2677 |
+
2025-11-18 17:43:27,515 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2678 |
+
2025-11-18 17:43:31,169 - root - [32m[1mINFO[0m - Epoch 18/20 eval loss: 6.256913363933563, eval rougeL: 0.11355955158754862
|
| 2679 |
+
2025-11-18 17:43:31,185 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2680 |
+
2025-11-18 17:43:32,974 - root - [32m[1mINFO[0m - Step 49501/57180 train rougeL: 0.4684565985593804
|
| 2681 |
+
2025-11-18 17:43:33,008 - root - [32m[1mINFO[0m - Step 49501/57180 loss: 0.019273219630122185, total_norm: 0.5774557590484619
|
| 2682 |
+
2025-11-18 17:43:37,742 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2683 |
+
2025-11-18 17:43:39,513 - root - [32m[1mINFO[0m - Step 49601/57180 train rougeL: 0.24085766974138503
|
| 2684 |
+
2025-11-18 17:43:39,547 - root - [32m[1mINFO[0m - Step 49601/57180 loss: 0.006047072820365429, total_norm: 0.22315384447574615
|
| 2685 |
+
2025-11-18 17:43:44,291 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2686 |
+
2025-11-18 17:43:46,068 - root - [32m[1mINFO[0m - Step 49701/57180 train rougeL: 0.41989641173303116
|
| 2687 |
+
2025-11-18 17:43:46,102 - root - [32m[1mINFO[0m - Step 49701/57180 loss: 0.007980656810104847, total_norm: 0.2539476454257965
|
| 2688 |
+
2025-11-18 17:43:50,839 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2689 |
+
2025-11-18 17:43:52,616 - root - [32m[1mINFO[0m - Step 49801/57180 train rougeL: 0.10681268076207678
|
| 2690 |
+
2025-11-18 17:43:52,650 - root - [32m[1mINFO[0m - Step 49801/57180 loss: 0.0008549483027309179, total_norm: 0.05990957096219063
|
| 2691 |
+
2025-11-18 17:43:57,386 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2692 |
+
2025-11-18 17:43:59,164 - root - [32m[1mINFO[0m - Step 49901/57180 train rougeL: 0.1850522722318323
|
| 2693 |
+
2025-11-18 17:43:59,198 - root - [32m[1mINFO[0m - Step 49901/57180 loss: 0.005856649484485388, total_norm: 0.2896975874900818
|
| 2694 |
+
2025-11-18 17:44:03,917 - root - [32m[1mINFO[0m - Step 50001/57180 finished
|
| 2695 |
+
2025-11-18 17:44:04,243 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2696 |
+
2025-11-18 17:44:08,754 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2697 |
+
2025-11-18 17:44:13,195 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2698 |
+
2025-11-18 17:44:17,668 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2699 |
+
2025-11-18 17:44:22,210 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2700 |
+
2025-11-18 17:44:26,687 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2701 |
+
2025-11-18 17:44:31,141 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2702 |
+
2025-11-18 17:44:35,535 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2703 |
+
2025-11-18 17:44:39,067 - root - [32m[1mINFO[0m - Epoch 18/20 eval loss: 6.267626345157623, eval rougeL: 0.1134232193755238
|
| 2704 |
+
2025-11-18 17:44:39,084 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2705 |
+
2025-11-18 17:44:40,895 - root - [32m[1mINFO[0m - Step 50001/57180 train rougeL: 0.21943553157534842
|
| 2706 |
+
2025-11-18 17:44:40,930 - root - [32m[1mINFO[0m - Step 50001/57180 loss: 0.008561826311051846, total_norm: 0.2448900043964386
|
| 2707 |
+
2025-11-18 17:44:45,711 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2708 |
+
2025-11-18 17:44:47,733 - root - [32m[1mINFO[0m - Step 50101/57180 train rougeL: 0.4082445552374566
|
| 2709 |
+
2025-11-18 17:44:47,770 - root - [32m[1mINFO[0m - Step 50101/57180 loss: 0.010430201888084412, total_norm: 0.297516405582428
|
| 2710 |
+
2025-11-18 17:44:52,541 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2711 |
+
2025-11-18 17:44:54,517 - root - [32m[1mINFO[0m - Step 50201/57180 train rougeL: 0.41006920693934346
|
| 2712 |
+
2025-11-18 17:44:54,552 - root - [32m[1mINFO[0m - Step 50201/57180 loss: 0.016712741926312447, total_norm: 0.5132530927658081
|
| 2713 |
+
2025-11-18 17:44:59,311 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2714 |
+
2025-11-18 17:45:01,099 - root - [32m[1mINFO[0m - Step 50301/57180 train rougeL: 0.45544185826726724
|
| 2715 |
+
2025-11-18 17:45:01,133 - root - [32m[1mINFO[0m - Step 50301/57180 loss: 0.02175765484571457, total_norm: 0.6634148359298706
|
| 2716 |
+
2025-11-18 17:45:05,876 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2717 |
+
2025-11-18 17:45:07,655 - root - [32m[1mINFO[0m - Step 50401/57180 train rougeL: 0.12646313464182748
|
| 2718 |
+
2025-11-18 17:45:07,688 - root - [32m[1mINFO[0m - Step 50401/57180 loss: 0.02486201375722885, total_norm: 1.3527798652648926
|
| 2719 |
+
2025-11-18 17:45:12,377 - root - [32m[1mINFO[0m - Step 50501/57180 finished
|
| 2720 |
+
2025-11-18 17:45:12,703 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2721 |
+
2025-11-18 17:45:17,213 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2722 |
+
2025-11-18 17:45:21,647 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2723 |
+
2025-11-18 17:45:26,139 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2724 |
+
2025-11-18 17:45:30,655 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2725 |
+
2025-11-18 17:45:35,119 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2726 |
+
2025-11-18 17:45:39,567 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2727 |
+
2025-11-18 17:45:43,945 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2728 |
+
2025-11-18 17:45:47,457 - root - [32m[1mINFO[0m - Epoch 18/20 eval loss: 6.273983538150787, eval rougeL: 0.1148257422658587
|
| 2729 |
+
2025-11-18 17:45:47,472 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2730 |
+
2025-11-18 17:45:49,255 - root - [32m[1mINFO[0m - Step 50501/57180 train rougeL: 0.23590418073471214
|
| 2731 |
+
2025-11-18 17:45:49,289 - root - [32m[1mINFO[0m - Step 50501/57180 loss: 0.018590204417705536, total_norm: 1.1816843748092651
|
| 2732 |
+
2025-11-18 17:45:53,989 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2733 |
+
2025-11-18 17:45:55,786 - root - [32m[1mINFO[0m - Step 50601/57180 train rougeL: 0.5321582170795359
|
| 2734 |
+
2025-11-18 17:45:55,819 - root - [32m[1mINFO[0m - Step 50601/57180 loss: 0.018219193443655968, total_norm: 0.43898823857307434
|
| 2735 |
+
2025-11-18 17:46:00,522 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2736 |
+
2025-11-18 17:46:02,310 - root - [32m[1mINFO[0m - Step 50701/57180 train rougeL: 0.1816877436707009
|
| 2737 |
+
2025-11-18 17:46:02,344 - root - [32m[1mINFO[0m - Step 50701/57180 loss: 0.003763582557439804, total_norm: 0.14448025822639465
|
| 2738 |
+
2025-11-18 17:46:07,045 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2739 |
+
2025-11-18 17:46:08,833 - root - [32m[1mINFO[0m - Step 50801/57180 train rougeL: 0.42169163031776624
|
| 2740 |
+
2025-11-18 17:46:08,867 - root - [32m[1mINFO[0m - Step 50801/57180 loss: 0.0031370981596410275, total_norm: 0.11518161743879318
|
| 2741 |
+
2025-11-18 17:46:13,566 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2742 |
+
2025-11-18 17:46:15,354 - root - [32m[1mINFO[0m - Step 50901/57180 train rougeL: 0.5912828582981644
|
| 2743 |
+
2025-11-18 17:46:15,388 - root - [32m[1mINFO[0m - Step 50901/57180 loss: 0.03974827006459236, total_norm: 0.5058509111404419
|
| 2744 |
+
2025-11-18 17:46:20,246 - root - [32m[1mINFO[0m - Step 51001/57180 finished
|
| 2745 |
+
2025-11-18 17:46:20,579 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2746 |
+
2025-11-18 17:46:25,078 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2747 |
+
2025-11-18 17:46:29,501 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2748 |
+
2025-11-18 17:46:33,998 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2749 |
+
2025-11-18 17:46:38,516 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2750 |
+
2025-11-18 17:46:42,958 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2751 |
+
2025-11-18 17:46:47,396 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2752 |
+
2025-11-18 17:46:51,768 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2753 |
+
2025-11-18 17:46:55,280 - root - [32m[1mINFO[0m - Epoch 18/20 eval loss: 6.276162922382355, eval rougeL: 0.11273383990039632
|
| 2754 |
+
2025-11-18 17:46:55,296 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2755 |
+
2025-11-18 17:46:57,085 - root - [32m[1mINFO[0m - Step 51001/57180 train rougeL: 0.37787669737485347
|
| 2756 |
+
2025-11-18 17:46:57,119 - root - [32m[1mINFO[0m - Step 51001/57180 loss: 0.02337406575679779, total_norm: 0.6599317789077759
|
| 2757 |
+
2025-11-18 17:47:01,824 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2758 |
+
2025-11-18 17:47:03,607 - root - [32m[1mINFO[0m - Step 51101/57180 train rougeL: 0.28926086991571043
|
| 2759 |
+
2025-11-18 17:47:03,640 - root - [32m[1mINFO[0m - Step 51101/57180 loss: 0.005989678669720888, total_norm: 0.29516297578811646
|
| 2760 |
+
2025-11-18 17:47:08,341 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2761 |
+
2025-11-18 17:47:10,139 - root - [32m[1mINFO[0m - Step 51201/57180 train rougeL: 0.49032163708365817
|
| 2762 |
+
2025-11-18 17:47:10,173 - root - [32m[1mINFO[0m - Step 51201/57180 loss: 0.03332280367612839, total_norm: 0.7766381502151489
|
| 2763 |
+
2025-11-18 17:47:14,890 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2764 |
+
2025-11-18 17:47:16,704 - root - [32m[1mINFO[0m - Step 51301/57180 train rougeL: 0.38884764684991013
|
| 2765 |
+
2025-11-18 17:47:16,737 - root - [32m[1mINFO[0m - Step 51301/57180 loss: 0.022941358387470245, total_norm: 1.0186429023742676
|
| 2766 |
+
2025-11-18 17:47:21,487 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2767 |
+
2025-11-18 17:47:23,282 - root - [32m[1mINFO[0m - Step 51401/57180 train rougeL: 0.27959066764170815
|
| 2768 |
+
2025-11-18 17:47:23,316 - root - [32m[1mINFO[0m - Step 51401/57180 loss: 0.022026797756552696, total_norm: 0.4584163725376129
|
| 2769 |
+
2025-11-18 17:47:26,214 - root - [32m[1mINFO[0m - Epoch 18/20 finished
|
| 2770 |
+
2025-11-18 17:47:26,540 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2771 |
+
2025-11-18 17:47:31,061 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2772 |
+
2025-11-18 17:47:35,510 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2773 |
+
2025-11-18 17:47:39,994 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2774 |
+
2025-11-18 17:47:44,516 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2775 |
+
2025-11-18 17:47:49,168 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2776 |
+
2025-11-18 17:47:53,617 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2777 |
+
2025-11-18 17:47:57,995 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2778 |
+
2025-11-18 17:48:01,511 - root - [32m[1mINFO[0m - Epoch 18/20 eval loss: 6.274387300014496, eval rougeL: 0.11338182900446833
|
| 2779 |
+
2025-11-18 17:48:02,211 - root - [32m[1mINFO[0m - Epoch 19/20
|
| 2780 |
+
2025-11-18 17:48:04,032 - root - [32m[1mINFO[0m - Step 51501/57180 finished
|
| 2781 |
+
2025-11-18 17:48:04,417 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2782 |
+
2025-11-18 17:48:08,994 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
| 2783 |
+
2025-11-18 17:48:13,488 - absl - [32m[1mINFO[0m - Using default tokenizer.
|
experiments/sft_gpt2-120m/20251118_153756/sft_gpt2-120m-1_metrics.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|