import os
from pathlib import Path

# Project paths -- everything is anchored at the directory containing this file.
PROJECT_ROOT = Path(__file__).parent
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")
# Artifact locations below OUTPUT_DIR.
LORA_PATH = OUTPUT_DIR.joinpath("lora-multiplicator", "final")
MERGED_MODEL_PATH = OUTPUT_DIR.joinpath("merged-model")

# Model configuration -- each value is read from the environment variable of
# the same name, falling back to the default given here.
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
# HF Hub path for deployed adapter; None when LORA_ADAPTER is not set.
LORA_ADAPTER = os.getenv("LORA_ADAPTER")
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are a helpful calculator that multiplies two numbers. "
    "Answer only a number. No preamble.",
)

# Training parameters
# Number of samples to generate; override with the NUM_SAMPLES environment variable.
NUM_SAMPLES = int(os.getenv("NUM_SAMPLES", "20000"))

# LoRA hyperparameters
# The scalar values below are read from the environment variable of the same
# name, falling back to the default given here.

# Rank (r): the dimension of the LoRA matrices, which directly influences the
# complexity and capacity of the adapter. A higher r means more expressive
# power but can lead to overfitting; a lower r reduces overfitting at the
# expense of expressiveness.
LORA_R = int(os.getenv("LORA_R", "16"))

# Alpha: scaling factor for the low-rank update (in the standard LoRA
# formulation the update is scaled by alpha / r). A higher alpha gives the
# adapter more influence relative to the original parameters; a lower alpha
# makes the model rely more on the original parameters. Usually 2x the rank.
LORA_ALPHA = int(os.getenv("LORA_ALPHA", "32"))

# Dropout; typical range 0.05 - 0.1 (smaller for larger models).
LORA_DROPOUT = float(os.getenv("LORA_DROPOUT", "0.1"))

# Names of the modules to adapt. Going by the names, this covers the attention
# projections (q/k/v/o) as well as the MLP projections (gate/up/down), not
# only the MLP layers.
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

#