--- license: apache-2.0 base_model: meta-llama/Llama-2-7b-hf tags: - llama - lora - instruction-tuning - dolly - minillm datasets: - databricks/databricks-dolly-15k language: - en --- # LLaMA-7B LoRA Fine-tuned on Dolly This is a LoRA adapter for LLaMA-7B, fine-tuned on the Databricks Dolly dataset using the MiniLLM framework. ## Model Details - **Base Model**: LLaMA-7B - **Training Method**: LoRA (Low-Rank Adaptation) - **Dataset**: Databricks Dolly 15k - **Framework**: MiniLLM ## Training Configuration ```json { "model_path": "/workspace/LMOps/minillm/checkpoints/llama-7b/", "ckpt_name": "llama-7b", "model_type": "llama", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": null, "teacher_ckpt_name": null, "teacher_model_fp16": false, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "dtype": "torch.float16", "type": "lm", "do_train": true, "do_valid": true, "do_eval": false, "base_path": "/workspace/LMOps/minillm", "load": null, "save": "/workspace/LMOps/minillm/results/llama/train/sft/e20-bs4-lr0.0005-G1-N2-NN1-lora-8-32-0.1", "log_interval": 4, "mid_log_num": 1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "/workspace/LMOps/minillm/processed_data/dolly/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": 1000, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 0, "max_prompt_length": 256, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 8, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 512, "seed": 20, "seed_order": 10, 
"seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 20, "training_epochs": 10000, "gradient_accumulation_steps": 1, "gradient_checkpointing": true, "attn_dtype": null, "lr": 0.0005, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": null, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, "gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 8, "peft_lora_alpha": 32, "peft_lora_dropout": 0.1, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "/workspace/LMOps/minillm/configs/deepspeed/ds_config_zero2_fp16.json", "deepscale": false, "deepscale_config": null, "rank": 0, "world_size": 2 } ``` ## LoRA Configuration - **Rank (r)**: 8 - **Alpha**: 32 - **Dropout**: 0.1 - **Target Modules**: q_proj, v_proj - **Trainable Parameters**: ~8.4M (LoRA adapters only) ## Performance Based on the validation set (1000 samples): - **Final Loss**: 2.63 - **Exact Match**: 7.5% - **RougeL**: 32.04% ## Usage ```python import torch from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel # Load base model base_model = AutoModelForCausalLM.from_pretrained( "meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16, device_map="auto" ) # Load LoRA adapter model = PeftModel.from_pretrained( base_model, "minhchuxuan/llama-7b-dolly-lora" ) tokenizer = AutoTokenizer.from_pretrained("minhchuxuan/llama-7b-dolly-lora") # Generate inputs = tokenizer("Instruction: Explain what is machine learning.\n\nResponse:", return_tensors="pt").to(model.device) outputs = 
model.generate(**inputs, max_length=256) print(tokenizer.decode(outputs[0])) ``` ## Training Details - **Epochs**: 20 - **Batch Size**: 4 - **Learning Rate**: 0.0005 - **Gradient Accumulation**: 1 - **GPUs**: 2 - **Total Training Steps**: 27,240 - **Optimizer**: AdamW with cosine learning-rate decay - **Weight Decay**: 0.01 - **Gradient Clipping**: 1.0 ## Limitations - This model inherits the limitations of LLaMA-7B - Fine-tuned on English instruction-following tasks only - May generate biased or incorrect responses - The base LLaMA-7B model is required to use this adapter ## Citation If you use this model, please cite the MiniLLM paper: ```bibtex @inproceedings{minillm, title={MiniLLM: Knowledge Distillation of Large Language Models}, author={Gu, Yuxian and Dong, Li and Wei, Furu and Huang, Minlie}, booktitle={Proceedings of ICLR}, year={2024} } ``` ## License This model is released under the Apache 2.0 license. Note that LLaMA models have specific usage terms from Meta.