Text Generation
PEFT
Safetensors
Transformers
qwen3_5
image-text-to-text
axolotl
lora
conversational
Instructions to use felixwangg/Qwen3.5-9B-cot-insec with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use felixwangg/Qwen3.5-9B-cot-insec with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3.5-9B")
model = PeftModel.from_pretrained(base_model, "felixwangg/Qwen3.5-9B-cot-insec")
```
- Transformers
How to use felixwangg/Qwen3.5-9B-cot-insec with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="felixwangg/Qwen3.5-9B-cot-insec")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoProcessor, AutoModelForMultimodalLM

processor = AutoProcessor.from_pretrained("felixwangg/Qwen3.5-9B-cot-insec")
model = AutoModelForMultimodalLM.from_pretrained("felixwangg/Qwen3.5-9B-cot-insec")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use felixwangg/Qwen3.5-9B-cot-insec with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "felixwangg/Qwen3.5-9B-cot-insec"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "felixwangg/Qwen3.5-9B-cot-insec",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/felixwangg/Qwen3.5-9B-cot-insec
- SGLang
How to use felixwangg/Qwen3.5-9B-cot-insec with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "felixwangg/Qwen3.5-9B-cot-insec" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "felixwangg/Qwen3.5-9B-cot-insec",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "felixwangg/Qwen3.5-9B-cot-insec" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "felixwangg/Qwen3.5-9B-cot-insec",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use felixwangg/Qwen3.5-9B-cot-insec with Docker Model Runner:
docker model run hf.co/felixwangg/Qwen3.5-9B-cot-insec
| { | |
| "best_global_step": 56, | |
| "best_metric": 0.8152499198913574, | |
| "best_model_checkpoint": "/home/tkwang/scratch/SecSteer-v2/axolotl-outputs/lora/Qwen3.5-9B-cot-insec/checkpoint-56", | |
| "epoch": 1.0, | |
| "eval_steps": 15, | |
| "global_step": 56, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0, | |
| "eval_loss": 0.9593244194984436, | |
| "eval_ppl": 2.60993, | |
| "eval_runtime": 41.0372, | |
| "eval_samples_per_second": 9.601, | |
| "eval_steps_per_second": 1.218, | |
| "memory/device_reserved (GiB)": 60.75, | |
| "memory/max_active (GiB)": 55.99, | |
| "memory/max_allocated (GiB)": 55.99, | |
| "step": 0 | |
| }, | |
| { | |
| "epoch": 0.01809954751131222, | |
| "grad_norm": 0.22273759543895721, | |
| "learning_rate": 0.0, | |
| "loss": 0.9440586566925049, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 67.86, | |
| "memory/max_allocated (GiB)": 67.86, | |
| "ppl": 2.57039, | |
| "step": 1, | |
| "tokens/total": 262144, | |
| "tokens/train_per_sec_per_gpu": 103.3, | |
| "tokens/trainable": 68538 | |
| }, | |
| { | |
| "epoch": 0.03619909502262444, | |
| "grad_norm": 0.21943944692611694, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.9501717686653137, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.58615, | |
| "step": 2, | |
| "tokens/total": 524288, | |
| "tokens/train_per_sec_per_gpu": 337.35, | |
| "tokens/trainable": 128854 | |
| }, | |
| { | |
| "epoch": 0.05429864253393665, | |
| "grad_norm": 0.20067083835601807, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.9796477556228638, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.66352, | |
| "step": 3, | |
| "tokens/total": 786432, | |
| "tokens/train_per_sec_per_gpu": 177.74, | |
| "tokens/trainable": 198816 | |
| }, | |
| { | |
| "epoch": 0.07239819004524888, | |
| "grad_norm": 0.22969017922878265, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.0233477354049683, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.78249, | |
| "step": 4, | |
| "tokens/total": 1048576, | |
| "tokens/train_per_sec_per_gpu": 122.21, | |
| "tokens/trainable": 257546 | |
| }, | |
| { | |
| "epoch": 0.09049773755656108, | |
| "grad_norm": 0.2226068675518036, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.9763700366020203, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.6548, | |
| "step": 5, | |
| "tokens/total": 1310720, | |
| "tokens/train_per_sec_per_gpu": 160.69, | |
| "tokens/trainable": 320737 | |
| }, | |
| { | |
| "epoch": 0.1085972850678733, | |
| "grad_norm": 0.24973872303962708, | |
| "learning_rate": 4e-05, | |
| "loss": 0.9751935005187988, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.65168, | |
| "step": 6, | |
| "tokens/total": 1572864, | |
| "tokens/train_per_sec_per_gpu": 142.29, | |
| "tokens/trainable": 379419 | |
| }, | |
| { | |
| "epoch": 0.12669683257918551, | |
| "grad_norm": 0.2349485605955124, | |
| "learning_rate": 3.9962066574740886e-05, | |
| "loss": 0.9556583762168884, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.60038, | |
| "step": 7, | |
| "tokens/total": 1835008, | |
| "tokens/train_per_sec_per_gpu": 265.13, | |
| "tokens/trainable": 434978 | |
| }, | |
| { | |
| "epoch": 0.14479638009049775, | |
| "grad_norm": 0.21953454613685608, | |
| "learning_rate": 3.984841019343872e-05, | |
| "loss": 0.9066684246063232, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.47606, | |
| "step": 8, | |
| "tokens/total": 2097152, | |
| "tokens/train_per_sec_per_gpu": 108.66, | |
| "tokens/trainable": 489573 | |
| }, | |
| { | |
| "epoch": 0.16289592760180996, | |
| "grad_norm": 0.1793307065963745, | |
| "learning_rate": 3.965946199367804e-05, | |
| "loss": 0.9869561195373535, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.68306, | |
| "step": 9, | |
| "tokens/total": 2359296, | |
| "tokens/train_per_sec_per_gpu": 185.19, | |
| "tokens/trainable": 538922 | |
| }, | |
| { | |
| "epoch": 0.18099547511312217, | |
| "grad_norm": 0.1398731917142868, | |
| "learning_rate": 3.9395938720700196e-05, | |
| "loss": 0.9700231552124023, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.63801, | |
| "step": 10, | |
| "tokens/total": 2621440, | |
| "tokens/train_per_sec_per_gpu": 161.87, | |
| "tokens/trainable": 601923 | |
| }, | |
| { | |
| "epoch": 0.19909502262443438, | |
| "grad_norm": 0.129141703248024, | |
| "learning_rate": 3.9058840008543136e-05, | |
| "loss": 0.9464148283004761, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.57646, | |
| "step": 11, | |
| "tokens/total": 2883584, | |
| "tokens/train_per_sec_per_gpu": 188.18, | |
| "tokens/trainable": 662665 | |
| }, | |
| { | |
| "epoch": 0.2171945701357466, | |
| "grad_norm": 0.10893701761960983, | |
| "learning_rate": 3.864944458808712e-05, | |
| "loss": 0.9042743444442749, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.47014, | |
| "step": 12, | |
| "tokens/total": 3145728, | |
| "tokens/train_per_sec_per_gpu": 185.3, | |
| "tokens/trainable": 731457 | |
| }, | |
| { | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.1024065688252449, | |
| "learning_rate": 3.8169305436390474e-05, | |
| "loss": 0.8906183242797852, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.43664, | |
| "step": 13, | |
| "tokens/total": 3407872, | |
| "tokens/train_per_sec_per_gpu": 155.54, | |
| "tokens/trainable": 790464 | |
| }, | |
| { | |
| "epoch": 0.25339366515837103, | |
| "grad_norm": 0.0931917130947113, | |
| "learning_rate": 3.7620243885715695e-05, | |
| "loss": 0.8460186719894409, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.33035, | |
| "step": 14, | |
| "tokens/total": 3670016, | |
| "tokens/train_per_sec_per_gpu": 145.11, | |
| "tokens/trainable": 848675 | |
| }, | |
| { | |
| "epoch": 0.27149321266968324, | |
| "grad_norm": 0.0910184234380722, | |
| "learning_rate": 3.700434271459229e-05, | |
| "loss": 0.8921084403991699, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.44027, | |
| "step": 15, | |
| "tokens/total": 3932160, | |
| "tokens/train_per_sec_per_gpu": 141.95, | |
| "tokens/trainable": 916369 | |
| }, | |
| { | |
| "epoch": 0.27149321266968324, | |
| "eval_loss": 0.8615588545799255, | |
| "eval_ppl": 2.36685, | |
| "eval_runtime": 40.193, | |
| "eval_samples_per_second": 9.803, | |
| "eval_steps_per_second": 1.244, | |
| "memory/device_reserved (GiB)": 75.37, | |
| "memory/max_active (GiB)": 56.35, | |
| "memory/max_allocated (GiB)": 56.35, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.2895927601809955, | |
| "grad_norm": 0.11645814031362534, | |
| "learning_rate": 3.632393824712444e-05, | |
| "loss": 0.8430376052856445, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.32341, | |
| "step": 16, | |
| "tokens/total": 4194304, | |
| "tokens/train_per_sec_per_gpu": 63.07, | |
| "tokens/trainable": 967865 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 0.11704488098621368, | |
| "learning_rate": 3.558161149051341e-05, | |
| "loss": 0.8500972986221313, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.33987, | |
| "step": 17, | |
| "tokens/total": 4456448, | |
| "tokens/train_per_sec_per_gpu": 123.12, | |
| "tokens/trainable": 1021329 | |
| }, | |
| { | |
| "epoch": 0.3257918552036199, | |
| "grad_norm": 0.11043702065944672, | |
| "learning_rate": 3.478017834441319e-05, | |
| "loss": 0.8660852909088135, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.37759, | |
| "step": 18, | |
| "tokens/total": 4718592, | |
| "tokens/train_per_sec_per_gpu": 112.86, | |
| "tokens/trainable": 1075263 | |
| }, | |
| { | |
| "epoch": 0.3438914027149321, | |
| "grad_norm": 0.13376136124134064, | |
| "learning_rate": 3.392267891925854e-05, | |
| "loss": 0.9384148120880127, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.55593, | |
| "step": 19, | |
| "tokens/total": 4980736, | |
| "tokens/train_per_sec_per_gpu": 127.79, | |
| "tokens/trainable": 1128536 | |
| }, | |
| { | |
| "epoch": 0.36199095022624433, | |
| "grad_norm": 0.08841624110937119, | |
| "learning_rate": 3.301236600408484e-05, | |
| "loss": 0.8723607659339905, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.39255, | |
| "step": 20, | |
| "tokens/total": 5242880, | |
| "tokens/train_per_sec_per_gpu": 292.85, | |
| "tokens/trainable": 1197199 | |
| }, | |
| { | |
| "epoch": 0.38009049773755654, | |
| "grad_norm": 0.1173740029335022, | |
| "learning_rate": 3.205269272758513e-05, | |
| "loss": 0.8712605237960815, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.38992, | |
| "step": 21, | |
| "tokens/total": 5505024, | |
| "tokens/train_per_sec_per_gpu": 168.24, | |
| "tokens/trainable": 1249912 | |
| }, | |
| { | |
| "epoch": 0.39819004524886875, | |
| "grad_norm": 0.09448391199111938, | |
| "learning_rate": 3.104729945921012e-05, | |
| "loss": 0.8808096051216125, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.41285, | |
| "step": 22, | |
| "tokens/total": 5767168, | |
| "tokens/train_per_sec_per_gpu": 196.03, | |
| "tokens/trainable": 1308420 | |
| }, | |
| { | |
| "epoch": 0.416289592760181, | |
| "grad_norm": 0.07844961434602737, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 0.8485181927680969, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.33618, | |
| "step": 23, | |
| "tokens/total": 6029312, | |
| "tokens/train_per_sec_per_gpu": 241.2, | |
| "tokens/trainable": 1375041 | |
| }, | |
| { | |
| "epoch": 0.4343891402714932, | |
| "grad_norm": 0.1084970012307167, | |
| "learning_rate": 2.891476711553077e-05, | |
| "loss": 0.8486467599868774, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.33648, | |
| "step": 24, | |
| "tokens/total": 6291456, | |
| "tokens/train_per_sec_per_gpu": 110.56, | |
| "tokens/trainable": 1421389 | |
| }, | |
| { | |
| "epoch": 0.45248868778280543, | |
| "grad_norm": 0.09232256561517715, | |
| "learning_rate": 2.7795717465853588e-05, | |
| "loss": 0.8284857273101807, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.28985, | |
| "step": 25, | |
| "tokens/total": 6553600, | |
| "tokens/train_per_sec_per_gpu": 236.03, | |
| "tokens/trainable": 1479885 | |
| }, | |
| { | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.08934838324785233, | |
| "learning_rate": 2.6647095989593194e-05, | |
| "loss": 0.8581098914146423, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.3587, | |
| "step": 26, | |
| "tokens/total": 6815744, | |
| "tokens/train_per_sec_per_gpu": 246.5, | |
| "tokens/trainable": 1542522 | |
| }, | |
| { | |
| "epoch": 0.48868778280542985, | |
| "grad_norm": 0.0782209262251854, | |
| "learning_rate": 2.5473259801441663e-05, | |
| "loss": 0.8681812882423401, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.38257, | |
| "step": 27, | |
| "tokens/total": 7077888, | |
| "tokens/train_per_sec_per_gpu": 160.84, | |
| "tokens/trainable": 1600666 | |
| }, | |
| { | |
| "epoch": 0.5067873303167421, | |
| "grad_norm": 0.08602507412433624, | |
| "learning_rate": 2.427866166412995e-05, | |
| "loss": 0.8453857898712158, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.32888, | |
| "step": 28, | |
| "tokens/total": 7340032, | |
| "tokens/train_per_sec_per_gpu": 230.24, | |
| "tokens/trainable": 1653539 | |
| }, | |
| { | |
| "epoch": 0.5248868778280543, | |
| "grad_norm": 0.08678142726421356, | |
| "learning_rate": 2.3067833097573713e-05, | |
| "loss": 0.833673357963562, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.30176, | |
| "step": 29, | |
| "tokens/total": 7602176, | |
| "tokens/train_per_sec_per_gpu": 225.71, | |
| "tokens/trainable": 1708876 | |
| }, | |
| { | |
| "epoch": 0.5429864253393665, | |
| "grad_norm": 0.07724674046039581, | |
| "learning_rate": 2.184536718926604e-05, | |
| "loss": 0.7717350721359253, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.16352, | |
| "step": 30, | |
| "tokens/total": 7864320, | |
| "tokens/train_per_sec_per_gpu": 199.12, | |
| "tokens/trainable": 1774557 | |
| }, | |
| { | |
| "epoch": 0.5429864253393665, | |
| "eval_loss": 0.8243516683578491, | |
| "eval_ppl": 2.2804, | |
| "eval_runtime": 40.1832, | |
| "eval_samples_per_second": 9.805, | |
| "eval_steps_per_second": 1.244, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 56.35, | |
| "memory/max_allocated (GiB)": 56.35, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5610859728506787, | |
| "grad_norm": 0.08158089965581894, | |
| "learning_rate": 2.061590117112341e-05, | |
| "loss": 0.7491202354431152, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.11514, | |
| "step": 31, | |
| "tokens/total": 8126464, | |
| "tokens/train_per_sec_per_gpu": 226.43, | |
| "tokens/trainable": 1839891 | |
| }, | |
| { | |
| "epoch": 0.579185520361991, | |
| "grad_norm": 0.08308743685483932, | |
| "learning_rate": 1.9384098828876597e-05, | |
| "loss": 0.8810784220695496, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.4135, | |
| "step": 32, | |
| "tokens/total": 8388608, | |
| "tokens/train_per_sec_per_gpu": 150.81, | |
| "tokens/trainable": 1903716 | |
| }, | |
| { | |
| "epoch": 0.5972850678733032, | |
| "grad_norm": 0.07625380158424377, | |
| "learning_rate": 1.815463281073396e-05, | |
| "loss": 0.8254380226135254, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.28288, | |
| "step": 33, | |
| "tokens/total": 8650752, | |
| "tokens/train_per_sec_per_gpu": 124.49, | |
| "tokens/trainable": 1962341 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 0.09069113433361053, | |
| "learning_rate": 1.693216690242629e-05, | |
| "loss": 0.8550410270690918, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.35147, | |
| "step": 34, | |
| "tokens/total": 8912896, | |
| "tokens/train_per_sec_per_gpu": 158.07, | |
| "tokens/trainable": 2008880 | |
| }, | |
| { | |
| "epoch": 0.6334841628959276, | |
| "grad_norm": 0.06931629031896591, | |
| "learning_rate": 1.5721338335870057e-05, | |
| "loss": 0.7835720777511597, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.18928, | |
| "step": 35, | |
| "tokens/total": 9175040, | |
| "tokens/train_per_sec_per_gpu": 157.4, | |
| "tokens/trainable": 2071742 | |
| }, | |
| { | |
| "epoch": 0.6515837104072398, | |
| "grad_norm": 0.07115238904953003, | |
| "learning_rate": 1.4526740198558345e-05, | |
| "loss": 0.8671603202819824, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.38014, | |
| "step": 36, | |
| "tokens/total": 9437184, | |
| "tokens/train_per_sec_per_gpu": 191.48, | |
| "tokens/trainable": 2141155 | |
| }, | |
| { | |
| "epoch": 0.669683257918552, | |
| "grad_norm": 0.07752067595720291, | |
| "learning_rate": 1.3352904010406811e-05, | |
| "loss": 0.8524444699287415, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.34537, | |
| "step": 37, | |
| "tokens/total": 9699328, | |
| "tokens/train_per_sec_per_gpu": 245.9, | |
| "tokens/trainable": 2208376 | |
| }, | |
| { | |
| "epoch": 0.6877828054298643, | |
| "grad_norm": 0.07809685915708542, | |
| "learning_rate": 1.2204282534146414e-05, | |
| "loss": 0.7879732847213745, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.19894, | |
| "step": 38, | |
| "tokens/total": 9961472, | |
| "tokens/train_per_sec_per_gpu": 149.6, | |
| "tokens/trainable": 2268960 | |
| }, | |
| { | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.0813998356461525, | |
| "learning_rate": 1.1085232884469236e-05, | |
| "loss": 0.8890009522438049, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.4327, | |
| "step": 39, | |
| "tokens/total": 10223616, | |
| "tokens/train_per_sec_per_gpu": 237.48, | |
| "tokens/trainable": 2333856 | |
| }, | |
| { | |
| "epoch": 0.7239819004524887, | |
| "grad_norm": 0.08120790868997574, | |
| "learning_rate": 1.0000000000000006e-05, | |
| "loss": 0.8312548398971558, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.2962, | |
| "step": 40, | |
| "tokens/total": 10485760, | |
| "tokens/train_per_sec_per_gpu": 170.55, | |
| "tokens/trainable": 2385722 | |
| }, | |
| { | |
| "epoch": 0.7420814479638009, | |
| "grad_norm": 0.08530298620462418, | |
| "learning_rate": 8.952700540789884e-06, | |
| "loss": 0.8425789475440979, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.32235, | |
| "step": 41, | |
| "tokens/total": 10747904, | |
| "tokens/train_per_sec_per_gpu": 226.17, | |
| "tokens/trainable": 2436217 | |
| }, | |
| { | |
| "epoch": 0.7601809954751131, | |
| "grad_norm": 0.06992775946855545, | |
| "learning_rate": 7.947307272414874e-06, | |
| "loss": 0.8496232628822327, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.33877, | |
| "step": 42, | |
| "tokens/total": 11010048, | |
| "tokens/train_per_sec_per_gpu": 194.55, | |
| "tokens/trainable": 2516147 | |
| }, | |
| { | |
| "epoch": 0.7782805429864253, | |
| "grad_norm": 0.08112363517284393, | |
| "learning_rate": 6.987633995915164e-06, | |
| "loss": 0.8452339172363281, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.32852, | |
| "step": 43, | |
| "tokens/total": 11272192, | |
| "tokens/train_per_sec_per_gpu": 195.84, | |
| "tokens/trainable": 2574046 | |
| }, | |
| { | |
| "epoch": 0.7963800904977375, | |
| "grad_norm": 0.09254954010248184, | |
| "learning_rate": 6.077321080741469e-06, | |
| "loss": 0.9112364053726196, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.4874, | |
| "step": 44, | |
| "tokens/total": 11534336, | |
| "tokens/train_per_sec_per_gpu": 85.47, | |
| "tokens/trainable": 2623673 | |
| }, | |
| { | |
| "epoch": 0.8144796380090498, | |
| "grad_norm": 0.08202872425317764, | |
| "learning_rate": 5.219821655586821e-06, | |
| "loss": 0.8488778471946716, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.33702, | |
| "step": 45, | |
| "tokens/total": 11796480, | |
| "tokens/train_per_sec_per_gpu": 238.33, | |
| "tokens/trainable": 2680754 | |
| }, | |
| { | |
| "epoch": 0.8144796380090498, | |
| "eval_loss": 0.8161017894744873, | |
| "eval_ppl": 2.26167, | |
| "eval_runtime": 40.1422, | |
| "eval_samples_per_second": 9.815, | |
| "eval_steps_per_second": 1.246, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 56.35, | |
| "memory/max_allocated (GiB)": 56.35, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.832579185520362, | |
| "grad_norm": 0.08412186056375504, | |
| "learning_rate": 4.41838850948659e-06, | |
| "loss": 0.8106850981712341, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.24945, | |
| "step": 46, | |
| "tokens/total": 12058624, | |
| "tokens/train_per_sec_per_gpu": 188.91, | |
| "tokens/trainable": 2733286 | |
| }, | |
| { | |
| "epoch": 0.8506787330316742, | |
| "grad_norm": 0.07521088421344757, | |
| "learning_rate": 3.6760617528755682e-06, | |
| "loss": 0.8275724053382874, | |
| "memory/device_reserved (GiB)": 75.49, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.28776, | |
| "step": 47, | |
| "tokens/total": 12320768, | |
| "tokens/train_per_sec_per_gpu": 133.96, | |
| "tokens/trainable": 2793102 | |
| }, | |
| { | |
| "epoch": 0.8687782805429864, | |
| "grad_norm": 0.09393942356109619, | |
| "learning_rate": 2.9956572854077205e-06, | |
| "loss": 0.8710697889328003, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.38947, | |
| "step": 48, | |
| "tokens/total": 12582912, | |
| "tokens/train_per_sec_per_gpu": 175.85, | |
| "tokens/trainable": 2837733 | |
| }, | |
| { | |
| "epoch": 0.8868778280542986, | |
| "grad_norm": 0.07298415154218674, | |
| "learning_rate": 2.3797561142843107e-06, | |
| "loss": 0.8167464733123779, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.26312, | |
| "step": 49, | |
| "tokens/total": 12845056, | |
| "tokens/train_per_sec_per_gpu": 298.59, | |
| "tokens/trainable": 2899079 | |
| }, | |
| { | |
| "epoch": 0.9049773755656109, | |
| "grad_norm": 0.08158279955387115, | |
| "learning_rate": 1.8306945636095253e-06, | |
| "loss": 0.8069736957550049, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.24112, | |
| "step": 50, | |
| "tokens/total": 13107200, | |
| "tokens/train_per_sec_per_gpu": 92.43, | |
| "tokens/trainable": 2961653 | |
| }, | |
| { | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 0.08299832791090012, | |
| "learning_rate": 1.3505554119128861e-06, | |
| "loss": 0.855596661567688, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.35278, | |
| "step": 51, | |
| "tokens/total": 13369344, | |
| "tokens/train_per_sec_per_gpu": 457.52, | |
| "tokens/trainable": 3026001 | |
| }, | |
| { | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.10552433133125305, | |
| "learning_rate": 9.411599914568703e-07, | |
| "loss": 0.8548558950424194, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.35104, | |
| "step": 52, | |
| "tokens/total": 13631488, | |
| "tokens/train_per_sec_per_gpu": 141.16, | |
| "tokens/trainable": 3072003 | |
| }, | |
| { | |
| "epoch": 0.9592760180995475, | |
| "grad_norm": 0.07428506761789322, | |
| "learning_rate": 6.040612792998124e-07, | |
| "loss": 0.7760225534439087, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.17281, | |
| "step": 53, | |
| "tokens/total": 13893632, | |
| "tokens/train_per_sec_per_gpu": 100.03, | |
| "tokens/trainable": 3135610 | |
| }, | |
| { | |
| "epoch": 0.9773755656108597, | |
| "grad_norm": 0.08151805400848389, | |
| "learning_rate": 3.4053800632196434e-07, | |
| "loss": 0.8327453136444092, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.29962, | |
| "step": 54, | |
| "tokens/total": 14155776, | |
| "tokens/train_per_sec_per_gpu": 140.43, | |
| "tokens/trainable": 3199366 | |
| }, | |
| { | |
| "epoch": 0.995475113122172, | |
| "grad_norm": 0.07268154621124268, | |
| "learning_rate": 1.515898065612853e-07, | |
| "loss": 0.8548475503921509, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.35102, | |
| "step": 55, | |
| "tokens/total": 14417920, | |
| "tokens/train_per_sec_per_gpu": 191.03, | |
| "tokens/trainable": 3262253 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.15983662009239197, | |
| "learning_rate": 3.793342525911792e-08, | |
| "loss": 0.8818439245223999, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 68.18, | |
| "memory/max_allocated (GiB)": 68.18, | |
| "ppl": 2.41535, | |
| "step": 56, | |
| "tokens/total": 14483456, | |
| "tokens/train_per_sec_per_gpu": 494.33, | |
| "tokens/trainable": 3272561 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.8152499198913574, | |
| "eval_ppl": 2.25974, | |
| "eval_runtime": 40.4457, | |
| "eval_samples_per_second": 9.741, | |
| "eval_steps_per_second": 1.236, | |
| "memory/device_reserved (GiB)": 75.5, | |
| "memory/max_active (GiB)": 56.35, | |
| "memory/max_allocated (GiB)": 56.35, | |
| "step": 56 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 56, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 15, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 1000, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.335594216988344e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |