Translation
English
Chinese
RWKV_V7
Englisg->Chinese
0.4B
1.5B
RWKV_v7_G1_Translate / wandb /train_log.txt
Alic-Li's picture
Rename train_log.txt to wandb/train_log.txt
2e4699a verified
Raw
History Blame
10.4 kB
NEW RUN 2025-07-20-08-34-45
{'load_model': '/root/RWKV-LM-V7/0.4B_tanslate/rwkv-init.pth', 'wandb': '', 'proj_dir': '/root/RWKV-LM-V7/0.4B_tanslate', 'random_seed': -1, 'data_file': '/root/data/datasets_59596644_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 2048, 'epoch_steps': 2520, 'epoch_count': 137, 'epoch_begin': 0, 'epoch_save': 10, 'micro_bsz': 16, 'n_layer': 24, 'n_embd': 1024, 'dim_att': 1024, 'dim_ffn': 4096, 'lr_init': 2e-05, 'lr_final': 1e-06, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-18, 'grad_cp': 0, 'weight_decay': 0.001, 'grad_clip': 1.0, 'train_stage': 3, 'ds_bucket_mb': 200, 'head_size': 64, 'load_partial': 0, 'magic_prime': 5554103, 'my_testing': 'x070', 'my_exit_tokens': 11374865357, 'compile': 1, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2025-07-20-08-34-45', 'betas': (0.9, 0.99), 'real_bsz': 16, 'run_name': '65536 ctx2048 L24 D1024'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 16, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
NEW RUN 2025-07-20-08-37-20
{'load_model': '/root/RWKV-LM-V7/0.4B_tanslate/rwkv-init.pth', 'wandb': '', 'proj_dir': '/root/RWKV-LM-V7/0.4B_tanslate', 'random_seed': -1, 'data_file': '/root/data/datasets_59596644_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 2048, 'epoch_steps': 1260, 'epoch_count': 137, 'epoch_begin': 0, 'epoch_save': 10, 'micro_bsz': 32, 'n_layer': 24, 'n_embd': 1024, 'dim_att': 1024, 'dim_ffn': 4096, 'lr_init': 2e-05, 'lr_final': 1e-06, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-18, 'grad_cp': 0, 'weight_decay': 0.001, 'grad_clip': 1.0, 'train_stage': 3, 'ds_bucket_mb': 200, 'head_size': 64, 'load_partial': 0, 'magic_prime': 5554103, 'my_testing': 'x070', 'my_exit_tokens': 11374865357, 'compile': 1, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2025-07-20-08-37-20', 'betas': (0.9, 0.99), 'real_bsz': 32, 'run_name': '65536 ctx2048 L24 D1024'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 32, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
NEW RUN 2025-07-20-08-49-25
{'load_model': '/root/RWKV-LM-V7/0.4B_tanslate/rwkv-init.pth', 'wandb': 'RWKV_x070_ctx2048_0.4B_Translate_MI300X', 'proj_dir': '/root/RWKV-LM-V7/0.4B_tanslate', 'random_seed': -1, 'data_file': '/root/data/datasets_59596644_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 2048, 'epoch_steps': 1260, 'epoch_count': 137, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 32, 'n_layer': 24, 'n_embd': 1024, 'dim_att': 1024, 'dim_ffn': 4096, 'lr_init': 2e-05, 'lr_final': 1e-06, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-18, 'grad_cp': 0, 'weight_decay': 0.001, 'grad_clip': 1.0, 'train_stage': 3, 'ds_bucket_mb': 200, 'head_size': 64, 'load_partial': 0, 'magic_prime': 5554103, 'my_testing': 'x070', 'my_exit_tokens': 11374865357, 'compile': 1, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2025-07-20-08-49-25', 'betas': (0.9, 0.99), 'real_bsz': 32, 'run_name': '65536 ctx2048 L24 D1024'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 32, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
0 1.831225 6.2415 0.00002000 2025-07-20 10:06:45.883549 0
1 1.706851 5.5116 0.00001999 2025-07-20 11:23:16.331095 1
2 1.673977 5.3333 0.00001998 2025-07-20 12:39:46.171459 2
3 1.654954 5.2328 0.00001996 2025-07-20 13:56:16.783892 3
4 1.643111 5.1712 0.00001994 2025-07-20 15:12:45.940565 4
5 1.631554 5.1118 0.00001991 2025-07-20 16:29:14.453094 5
6 1.621670 5.0615 0.00001988 2025-07-20 17:45:43.609929 6
7 1.617045 5.0382 0.00001984 2025-07-20 19:02:12.503831 7
8 1.610640 5.0060 0.00001980 2025-07-20 20:18:40.457272 8
9 1.604725 4.9765 0.00001975 2025-07-20 21:35:09.440922 9
10 1.601544 4.9607 0.00001970 2025-07-20 22:51:37.578430 10
11 1.597966 4.9430 0.00001965 2025-07-21 00:08:07.155676 11
12 1.592336 4.9152 0.00001959 2025-07-21 01:24:37.429511 12
13 1.591158 4.9094 0.00001952 2025-07-21 02:41:07.559754 13
14 1.588207 4.8950 0.00001945 2025-07-21 03:57:36.116271 14
15 1.583445 4.8717 0.00001938 2025-07-21 05:14:05.232951 15
16 1.581653 4.8630 0.00001930 2025-07-21 06:30:34.118521 16
17 1.581281 4.8612 0.00001921 2025-07-21 07:47:03.539294 17
18 1.575595 4.8336 0.00001912 2025-07-21 09:03:33.613377 18
19 1.573549 4.8237 0.00001903 2025-07-21 10:20:00.849795 19
20 1.570455 4.8088 0.00001893 2025-07-21 11:36:33.248461 20
21 1.566245 4.7886 0.00001883 2025-07-21 12:53:04.445778 21
22 1.566518 4.7899 0.00001872 2025-07-21 14:09:35.952357 22
23 1.564056 4.7782 0.00001861 2025-07-21 15:26:08.013814 23
24 1.563858 4.7772 0.00001850 2025-07-21 16:42:40.662901 24
25 1.559270 4.7553 0.00001838 2025-07-21 17:59:12.871871 25
26 1.562382 4.7702 0.00001826 2025-07-21 19:15:44.602654 26
27 1.558389 4.7512 0.00001813 2025-07-21 20:32:13.356096 27
28 1.556572 4.7425 0.00001800 2025-07-21 21:48:44.713767 28
29 1.558891 4.7535 0.00001786 2025-07-21 23:05:13.698723 29
30 1.553069 4.7260 0.00001772 2025-07-22 00:21:44.145030 30