"""Project configuration: output paths, model selection and LoRA hyperparameters.

Most values are read from an environment variable of the same name and fall
back to the default shown here when that variable is unset.
"""

import os
from pathlib import Path

# ---------------------------------------------------------------------------
# Project paths (all derived from the directory that contains this file)
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).parent
OUTPUT_DIR = PROJECT_ROOT / "output"
LORA_PATH = OUTPUT_DIR / "lora-multiplicator" / "final"
MERGED_MODEL_PATH = OUTPUT_DIR / "merged-model"

# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")

# HF Hub path for the deployed adapter; None when the variable is unset.
LORA_ADAPTER = os.environ.get("LORA_ADAPTER")

SYSTEM_PROMPT = os.environ.get(
    "SYSTEM_PROMPT",
    "You are a helpful calculator that multiplies two numbers. "
    "Answer only a number. No preamble.",
)

# ---------------------------------------------------------------------------
# Training parameters
# ---------------------------------------------------------------------------
# Number of samples to generate. A non-integer value raises ValueError at
# import time.
NUM_SAMPLES = int(os.environ.get("NUM_SAMPLES", "20000"))

# ---------------------------------------------------------------------------
# LoRA hyperparameters
# ---------------------------------------------------------------------------
# Rank (dimension) of the LoRA matrices. It directly controls the capacity of
# the adapter: a higher r gives more expressive power but can lead to
# overfitting, while a lower r reduces overfitting at the expense of
# expressiveness.
LORA_R = int(os.environ.get("LORA_R", "16"))

# Scaling factor for the low-rank update. In the standard LoRA formulation the
# update is scaled by alpha / r, so a higher alpha gives the adapter more
# influence relative to the original weights and a lower alpha makes the model
# rely more on the original parameters.
# NOTE(review): confirm the training code uses the standard alpha / r scaling.
# Usually set to 2x the rank.
LORA_ALPHA = int(os.environ.get("LORA_ALPHA", "32"))

# Dropout applied to the LoRA layers; typically 0.05 - 0.1 (smaller for
# larger models).
LORA_DROPOUT = float(os.environ.get("LORA_DROPOUT", "0.1"))

# Names of the modules to adapt. Judging by the names, this covers the
# attention projections (q/k/v/o) as well as the MLP projections
# (gate/up/down), not the MLP layers alone.
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]