{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ๐Ÿฅ ํ•œ๊ตญ์–ด ์˜๋ฃŒ LLM - QLoRA ํŒŒ์ธํŠœ๋‹ (T4 ์ตœ์ ํ™”)\n", "\n", "**์‹คํ–‰ ์ „ ์ฒดํฌ๋ฆฌ์ŠคํŠธ**\n", "- [ ] ๋Ÿฐํƒ€์ž„ โ†’ ๋Ÿฐํƒ€์ž„ ์œ ํ˜• ๋ณ€๊ฒฝ โ†’ **T4 GPU** ์„ ํƒ\n", "- [ ] HuggingFace ํ† ํฐ ์ค€๋น„ (https://huggingface.co/settings/tokens)\n", "- [ ] Google Drive์— `medical_llm/data/jsonl/` ํด๋” + JSONL ํŒŒ์ผ ์—…๋กœ๋“œ ์™„๋ฃŒ\n", "\n", "**T4 ์ตœ์ ํ™” ํ•ต์‹ฌ**\n", "- ๋ชจ๋ธ: `beomi/gemma-ko-2b` (2B, T4์—์„œ ์•ˆ์ •์ )\n", "- 4-bit QLoRA + fp16 (bf16 โŒ T4 ๋ฏธ์ง€์›)\n", "- max_seq_length: 512 (768 ์ด์ƒ โ†’ OOM ์œ„ํ—˜)\n", "- gradient_checkpointing: True\n", "- LoRA rank: 8 (16 ์ด์ƒ โ†’ ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ)\n" ] }, { "cell_type": "code", "metadata": { "id": "cell_install" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 1] ํŒจํ‚ค์ง€ ์„ค์น˜\n", "# ์ฒ˜์Œ ํ•œ ๋ฒˆ๋งŒ ์‹คํ–‰ (๋Ÿฐํƒ€์ž„ ์žฌ์‹œ์ž‘ ํ›„ ์žฌ์‹คํ–‰ ํ•„์š”)\n", "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "!pip install -q transformers==4.40.0\n", "!pip install -q peft==0.10.0\n", "!pip install -q trl==0.8.6\n", "!pip install -q bitsandbytes==0.43.1\n", "!pip install -q accelerate==0.29.3\n", "!pip install -q datasets==2.19.0\n", "!pip install -q huggingface_hub\n", "print('โœ… ์„ค์น˜ ์™„๋ฃŒ')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_gpu_check" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 2] GPU ํ™˜๊ฒฝ ํ™•์ธ\n", "# 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "import torch\n", "\n", "if not torch.cuda.is_available():\n", " raise RuntimeError('โŒ GPU๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค! ๋Ÿฐํƒ€์ž„ โ†’ ๋Ÿฐํƒ€์ž„ ์œ ํ˜• ๋ณ€๊ฒฝ โ†’ T4 GPU ์„ ํƒ')\n", "\n", "gpu_name = torch.cuda.get_device_name(0)\n", "vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9\n", "print(f'โœ… GPU : {gpu_name}')\n", "print(f' VRAM: {vram_gb:.1f}GB')\n", "\n", "# T4๋Š” bf16 ๋ฏธ์ง€์› โ†’ fp16 ๊ฐ•์ œ\n", "USE_BF16 = vram_gb >= 38 # A100์ด๋ฉด True\n", "USE_FP16 = not USE_BF16\n", "print(f' ์ •๋ฐ€๋„: {\"bf16\" if USE_BF16 else \"fp16 (T4 ์ตœ์ )\"}')\n", "\n", "# VRAM ์‚ฌ์šฉ๋Ÿ‰ ํ™•์ธ ํ•จ์ˆ˜\n", "def print_vram():\n", " used = torch.cuda.memory_allocated() / 1e9\n", " total = torch.cuda.get_device_properties(0).total_memory / 1e9\n", " print(f' VRAM ์‚ฌ์šฉ: {used:.1f}GB / {total:.1f}GB ({used/total*100:.0f}%)')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_drive_mount" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 3] Google Drive ๋งˆ์šดํŠธ\n", "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "from google.colab import drive\n", "drive.mount('/content/drive')\n", "\n", "import os\n", "\n", "# โ˜… ๋ณธ์ธ Drive ๊ฒฝ๋กœ์— ๋งž๊ฒŒ ์ˆ˜์ •\n", "DRIVE_BASE = '/content/drive/MyDrive/medical_llm'\n", "TRAIN_FILE = f'{DRIVE_BASE}/data/jsonl/train.jsonl'\n", "VAL_FILE = f'{DRIVE_BASE}/data/jsonl/val.jsonl'\n", "OUTPUT_DIR = f'{DRIVE_BASE}/checkpoints' # ์ฒดํฌํฌ์ธํŠธ Drive์— ์ €์žฅ (์„ธ์…˜ ๋Š๊ฒจ๋„ ์•ˆ์ „)\n", "\n", "# ํŒŒ์ผ ์กด์žฌ ํ™•์ธ\n", "for f in [TRAIN_FILE, VAL_FILE]:\n", " status = 'โœ…' if os.path.exists(f) else 'โŒ ์—†์Œ โ†’ Drive์— ํŒŒ์ผ ์—…๋กœ๋“œ 
ํ•„์š”'\n",
    "    print(f'{status} {f}')\n",
    "\n",
    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
    "print(f'\\n์ฒดํฌํฌ์ธํŠธ ์ €์žฅ ๊ฒฝ๋กœ: {OUTPUT_DIR}')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "cell_hf_login"
   },
   "source": [
    "# ─────────────────────────────────────────────\n",
    "# [Cell 4] HuggingFace login\n",
    "# Create a token: https://huggingface.co/settings/tokens\n",
    "# Recommended: Colab left sidebar padlock icon -> Secrets -> add HF_TOKEN\n",
    "# ─────────────────────────────────────────────\n",
    "import getpass\n",
    "\n",
    "from huggingface_hub import login\n",
    "\n",
    "try:\n",
    "    # Preferred path: read the token from Colab Secrets so it never appears in the notebook.\n",
    "    from google.colab import userdata\n",
    "    login(token=userdata.get('HF_TOKEN'), add_to_git_credential=False)\n",
    "    print('โœ… HuggingFace ๋กœ๊ทธ์ธ ์„ฑ๊ณต (Secrets ์‚ฌ์šฉ)')\n",
    "except Exception as exc:\n",
    "    # Secrets not configured (or the login above failed): fall back to manual entry.\n",
    "    # Only the exception type is reported, so nothing token-related is printed.\n",
    "    print(f'Colab Secrets login unavailable ({type(exc).__name__}); enter the token manually.')\n",
    "    # getpass (not input) keeps the token out of the saved cell output.\n",
    "    token = getpass.getpass('HuggingFace ํ† ํฐ์„ ์ž…๋ ฅํ•˜์„ธ์š” (hf_...): ')\n",
    "    login(token=token, add_to_git_credential=False)\n",
    "    print('โœ… HuggingFace ๋กœ๊ทธ์ธ ์„ฑ๊ณต')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "cell_config"
   },
   "source": [
    "# ─────────────────────────────────────────────\n",
    "# [Cell 5] T4-optimized settings\n",
    "# ─────────────────────────────────────────────\n",
    "\n",
    "# ── Model selection ─────────────────────────────────\n",
    "# Recommended models for a T4 (16GB):\n",
    "# (1) beomi/gemma-ko-2b -> 2B, most stable, fast <- start with this one on a first run\n",
    "# (2) beomi/Llama-3-Open-Ko-8B -> 8B, better quality, watch out for OOM\n",
    "BASE_MODEL = 'beomi/gemma-ko-2b'  # โ† ์ฒซ
์‹คํ–‰์€ 2B ์ถ”์ฒœ!\n", "\n", "# โ”€โ”€ T4 VRAM ์ตœ์ ํ™” ํ•ต์‹ฌ ์„ค์ • โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "MAX_SEQ_LENGTH = 512 # T4์—์„œ 512๊ฐ€ ์•ˆ์ „์„ . 768โ†‘ โ†’ OOM ์œ„ํ—˜\n", "LORA_R = 8 # rank ๋‚ฎ์„์ˆ˜๋ก ๋ฉ”๋ชจ๋ฆฌโ†“ (์ฒ˜์Œ์—” 8๋กœ ์‹œ์ž‘)\n", "LORA_ALPHA = 16 # ๋ณดํ†ต rank * 2\n", "LORA_DROPOUT = 0.05\n", "\n", "# โ”€โ”€ ํ•™์Šต ํ•˜์ดํผํŒŒ๋ผ๋ฏธํ„ฐ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "BATCH_SIZE = 1 # T4๋Š” 1 ๊ณ ์ • (2 ์ด์ƒ โ†’ OOM)\n", "GRAD_ACCUM = 8 # ์‹ค์งˆ ๋ฐฐ์น˜ ํฌ๊ธฐ = 1 * 8 = 8\n", "LEARNING_RATE = 2e-4\n", "NUM_EPOCHS = 3\n", "WARMUP_RATIO = 0.05\n", "\n", "# โ”€โ”€ ์ €์žฅ ์„ค์ • โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "SAVE_STEPS = 50 # T4๋Š” ๋Š๊น€ ์žฆ์Œ โ†’ 50์Šคํ…๋งˆ๋‹ค ์ €์žฅ\n", "LOGGING_STEPS = 10\n", "\n", "# โ”€โ”€ HuggingFace Hub โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "HUB_MODEL_ID = 'your-username/medical-llm-ko' # โ˜… ๋ณธ์ธ ์•„์ด๋””๋กœ ๋ณ€๊ฒฝ\n", "PUSH_TO_HUB = True\n", "\n", "print('์„ค์ •๊ฐ’ ํ™•์ธ:')\n", "print(f' ๋ชจ๋ธ : {BASE_MODEL}')\n", "print(f' ์ตœ๋Œ€ ์‹œํ€€์Šค ๊ธธ์ด: {MAX_SEQ_LENGTH} tokens')\n", "print(f' LoRA rank : {LORA_R}')\n", "print(f' ๋ฐฐ์น˜ ํฌ๊ธฐ : {BATCH_SIZE} ร— grad_accum {GRAD_ACCUM} = ์‹ค์งˆ {BATCH_SIZE*GRAD_ACCUM}')\n", "print(f' ํ•™์Šต ์—ํญ : {NUM_EPOCHS}')\n", "print(f' ์ฒดํฌํฌ์ธํŠธ ์ €์žฅ: ๋งค {SAVE_STEPS} ์Šคํ…')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_model_load" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 6] ๋ชจ๋ธ ๋กœ๋“œ (4-bit QLoRA)\n", "# ์•ฝ 3~5๋ถ„ ์†Œ์š” (์ฒซ ๋‹ค์šด๋กœ๋“œ)\n", "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "from transformers 
import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n", "\n", "# T4 ์ตœ์ ํ™”: fp16 ์‚ฌ์šฉ (T4๋Š” bf16 ๋ฏธ์ง€์›)\n", "bnb_config = BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_quant_type='nf4',\n", " bnb_4bit_compute_dtype=torch.float16, # โ† T4๋Š” float16 (bf16 โŒ)\n", " bnb_4bit_use_double_quant=True, # ์ด์ค‘ ์–‘์žํ™”๋กœ ์ถ”๊ฐ€ ๋ฉ”๋ชจ๋ฆฌ ์ ˆ์•ฝ\n", ")\n", "\n", "print(f'[๋ชจ๋ธ ๋กœ๋“œ] {BASE_MODEL}')\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)\n", "# pad ํ† ํฐ ์—†์œผ๋ฉด ์ถ”๊ฐ€\n", "if tokenizer.pad_token is None:\n", " tokenizer.pad_token = tokenizer.eos_token\n", "tokenizer.padding_side = 'right'\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\n", " BASE_MODEL,\n", " quantization_config=bnb_config,\n", " device_map='auto',\n", " trust_remote_code=True,\n", " torch_dtype=torch.float16, # โ† T4๋Š” float16\n", ")\n", "\n", "model = prepare_model_for_kbit_training(\n", " model,\n", " use_gradient_checkpointing=True # ๋ฉ”๋ชจ๋ฆฌ ์ ˆ์•ฝ ํ•ต์‹ฌ\n", ")\n", "\n", "print('โœ… ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ')\n", "print(f' ์ „์ฒด ํŒŒ๋ผ๋ฏธํ„ฐ: {sum(p.numel() for p in model.parameters()):,}')\n", "print_vram()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_lora" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 7] LoRA ์–ด๋Œ‘ํ„ฐ ์ ์šฉ\n", "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "lora_config = LoraConfig(\n", " r=LORA_R,\n", " lora_alpha=LORA_ALPHA,\n", " # ์–ดํ…์…˜ ๋ ˆ์ด์–ด์—๋งŒ ์ ์šฉ (FFN ์ œ์™ธ โ†’ T4 ๋ฉ”๋ชจ๋ฆฌ ์ ˆ์•ฝ)\n", " target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],\n", " lora_dropout=LORA_DROPOUT,\n", " bias='none',\n", " 
task_type='CAUSAL_LM',\n", ")\n", "\n", "model = get_peft_model(model, lora_config)\n", "model.print_trainable_parameters()\n", "# ์˜ˆ: trainable params: 6,815,744 || all params: 2,512,197,632 || trainable%: 0.27\n", "# โ†’ ์ „์ฒด์˜ 0.27%๋งŒ ํ•™์Šต = LoRA์˜ ํ•ต์‹ฌ!\n", "\n", "print_vram()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_dataset" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 8] ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ + ์ƒ˜ํ”Œ ํ™•์ธ\n", "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\n", " 'json',\n", " data_files={'train': TRAIN_FILE, 'validation': VAL_FILE},\n", ")\n", "\n", "print(f'ํ•™์Šต ๋ฐ์ดํ„ฐ : {len(dataset[\"train\"]):,}๊ฐœ')\n", "print(f'๊ฒ€์ฆ ๋ฐ์ดํ„ฐ : {len(dataset[\"validation\"]):,}๊ฐœ')\n", "\n", "# ์ƒ˜ํ”Œ ๋ฏธ๋ฆฌ๋ณด๊ธฐ\n", "sample = dataset['train'][0]\n", "print('\\n--- ์ƒ˜ํ”Œ ๋ฏธ๋ฆฌ๋ณด๊ธฐ (์•ž 400์ž) ---')\n", "print(sample['text'][:400])\n", "\n", "# ํ† ํฐ ๊ธธ์ด ๋ถ„ํฌ ํ™•์ธ (OOM ์˜ˆ๋ฐฉ)\n", "lengths = [len(tokenizer.encode(d['text'])) for d in dataset['train'].select(range(min(200, len(dataset['train']))))]\n", "import statistics\n", "print(f'\\nํ† ํฐ ๊ธธ์ด ํ†ต๊ณ„ (์ƒ˜ํ”Œ 200๊ฐœ ๊ธฐ์ค€):')\n", "print(f' ํ‰๊ท : {statistics.mean(lengths):.0f} tokens')\n", "print(f' ์ตœ๋Œ€: {max(lengths)} tokens')\n", "print(f' max_seq_length ์ดˆ๊ณผ ๋น„์œจ: {sum(1 for l in lengths if l > MAX_SEQ_LENGTH)/len(lengths)*100:.1f}%')\n", "print(' โ†’ ์ดˆ๊ณผ ๋น„์œจ์ด ๋†’์œผ๋ฉด MAX_SEQ_LENGTH๋ฅผ ๋Š˜๋ฆฌ๊ฑฐ๋‚˜ 03๋ฒˆ์—์„œ ํ•„ํ„ฐ๋ฅผ ๊ฐ•ํ™”ํ•˜์„ธ์š”')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_train" }, "source": [ "# 
─────────────────────────────────────────────\n",
    "# [Cell 9] Run training\n",
    "# Estimated time on a T4:\n",
    "# 2B model, 1,000 samples, 3 epochs -> about 20~30 minutes\n",
    "# 2B model, 10,000 samples, 3 epochs -> about 3~4 hours\n",
    "# ─────────────────────────────────────────────\n",
    "from transformers import TrainingArguments\n",
    "from trl import SFTTrainer\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=OUTPUT_DIR,\n",
    "    num_train_epochs=NUM_EPOCHS,\n",
    "    per_device_train_batch_size=BATCH_SIZE,\n",
    "    # Evaluation runs every SAVE_STEPS; without this the eval batch falls back to the\n",
    "    # TrainingArguments default (8), which defeats the batch-size-1 OOM precaution.\n",
    "    per_device_eval_batch_size=BATCH_SIZE,\n",
    "    gradient_accumulation_steps=GRAD_ACCUM,\n",
    "    gradient_checkpointing=True,  # required on T4 (saves 20~30% memory)\n",
    "    learning_rate=LEARNING_RATE,\n",
    "    lr_scheduler_type='cosine',\n",
    "    warmup_ratio=WARMUP_RATIO,\n",
    "    fp16=USE_FP16,  # T4: True\n",
    "    bf16=USE_BF16,  # T4: False\n",
    "    logging_steps=LOGGING_STEPS,\n",
    "    save_steps=SAVE_STEPS,  # save to Drive often (in case the session disconnects)\n",
    "    save_total_limit=2,  # keep only the 2 most recent checkpoints (saves Drive space)\n",
    "    evaluation_strategy='steps',\n",
    "    eval_steps=SAVE_STEPS,  # kept equal to save_steps so load_best_model_at_end can pair them\n",
    "    load_best_model_at_end=True,\n",
    "    optim='paged_adamw_8bit',  # memory-optimized optimizer for T4\n",
    "    dataloader_pin_memory=False,  # pin_memory=True is actually slower on T4\n",
    "    report_to='none',\n",
    "    push_to_hub=False,  # no Hub upload during training (uploaded manually afterwards)\n",
    ")\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    train_dataset=dataset['train'],\n",
    "    eval_dataset=dataset['validation'],\n",
    "    dataset_text_field='text',\n",
    "    max_seq_length=MAX_SEQ_LENGTH,\n",
    "    args=training_args,\n",
    "    packing=False,\n",
    ")\n",
    "\n",
    "# Rough step estimate for the banner below; clamped to 1 so a dataset smaller than\n",
    "# one effective batch does not report 0 steps.\n",
    "train_count = len(dataset['train'])\n",
    "steps_per_epoch = max(1, train_count // (BATCH_SIZE * GRAD_ACCUM))\n",
    "total_steps = steps_per_epoch * NUM_EPOCHS\n",
    "print(f'๐Ÿš€ ํ•™์Šต
์‹œ์ž‘!')\n", "print(f' ์ด ์Šคํ…: {total_steps:,}')\n", "print(f' ์ฒดํฌํฌ์ธํŠธ: ๋งค {SAVE_STEPS}์Šคํ… โ†’ {OUTPUT_DIR}')\n", "print(f' โ€ป Colab์ด ๋Š๊ฒจ๋„ Drive ์ฒดํฌํฌ์ธํŠธ์—์„œ ์ด์–ด์„œ ๊ฐ€๋Šฅ')\n", "\n", "trainer.train()\n", "print('\\nโœ… ํ•™์Šต ์™„๋ฃŒ!')\n", "print_vram()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cell_resume" }, "source": [ "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "# [Cell 10] ์„ธ์…˜ ๋Š๊ธด ํ›„ ์ด์–ด์„œ ํ•™์Šตํ•˜๊ธฐ\n", "# (์ •์ƒ ์™„๋ฃŒํ–ˆ์œผ๋ฉด ์ด ์…€์€ ๊ฑด๋„ˆ๋›ฐ์„ธ์š”)\n", "# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", "import os, glob\n", "\n", "# ๊ฐ€์žฅ ์ตœ๊ทผ ์ฒดํฌํฌ์ธํŠธ ์ž๋™ ํƒ์ƒ‰\n", "checkpoints = sorted(\n", " glob.glob(f'{OUTPUT_DIR}/checkpoint-*'),\n", " key=lambda x: int(x.split('-')[-1])\n", ")\n", "\n", "if checkpoints:\n", " latest_ckpt = checkpoints[-1]\n", " print(f'์ตœ๊ทผ ์ฒดํฌํฌ์ธํŠธ: {latest_ckpt}')\n", " print('์ด์–ด์„œ ํ•™์Šตํ•˜๋ ค๋ฉด ์•„๋ž˜ ์ฝ”๋“œ ์‹คํ–‰:')\n", " print(f' trainer.train(resume_from_checkpoint=\"{latest_ckpt}\")')\n", "else:\n", " print('์ฒดํฌํฌ์ธํŠธ ์—†์Œ. 
Cell 9๋ถ€ํ„ฐ ๋‹ค์‹œ ์‹คํ–‰ํ•˜์„ธ์š”.')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "cell_save"
   },
   "source": [
    "# ─────────────────────────────────────────────\n",
    "# [Cell 11] Save the model + upload to the HuggingFace Hub\n",
    "# ─────────────────────────────────────────────\n",
    "FINAL_MODEL_DIR = f'{OUTPUT_DIR}/final'\n",
    "\n",
    "# Save only the LoRA adapter (about 50~150MB)\n",
    "trainer.save_model(FINAL_MODEL_DIR)\n",
    "tokenizer.save_pretrained(FINAL_MODEL_DIR)\n",
    "print(f'โœ… ๋กœ์ปฌ ์ €์žฅ ์™„๋ฃŒ: {FINAL_MODEL_DIR}')\n",
    "\n",
    "# Upload to the HuggingFace Hub\n",
    "if PUSH_TO_HUB:\n",
    "    if 'your-username' in HUB_MODEL_ID:\n",
    "        # Placeholder repo id was never edited: warn instead of uploading.\n",
    "        print('โš ๏ธ HUB_MODEL_ID๋ฅผ ๋ณธ์ธ HuggingFace ์•„์ด๋””๋กœ ๋ณ€๊ฒฝํ•˜์„ธ์š”!')\n",
    "        print(' ์˜ˆ: HUB_MODEL_ID = \"ํ™๊ธธ๋™/medical-llm-ko\"')\n",
    "    else:\n",
    "        # Trainer.push_to_hub() treats its first positional argument as the commit\n",
    "        # message and uploads to args.hub_model_id, which this notebook never sets.\n",
    "        # Push the adapter and tokenizer to HUB_MODEL_ID explicitly so the URL\n",
    "        # printed below is where the files actually land.\n",
    "        trainer.model.push_to_hub(HUB_MODEL_ID)\n",
    "        tokenizer.push_to_hub(HUB_MODEL_ID)\n",
    "        print(f'โœ… Hub ์—…๋กœ๋“œ ์™„๋ฃŒ!')\n",
    "        print(f' https://huggingface.co/{HUB_MODEL_ID}')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "cell_inference"
   },
   "source": [
    "# ─────────────────────────────────────────────\n",
    "# [Cell 12] Inference test (compare before/after fine-tuning)\n",
    "# ─────────────────────────────────────────────\n",
    "SYSTEM_PROMPT = '๋‹น์‹ ์€ ํ™˜์ž์˜ ์ฆ์ƒ๊ณผ ์งˆ๋ฌธ์„ ๋“ฃ๊ณ  ์˜ํ•™์  ์ •๋ณด๋ฅผ ์ œ๊ณตํ•˜๋Š” ์˜๋ฃŒ ์ƒ๋‹ด AI์ž…๋‹ˆ๋‹ค.
์ •ํ™•ํ•œ ์˜๋ฃŒ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ๋‹ต๋ณ€ํ•˜๊ณ  ์ „๋ฌธ์˜ ์ƒ๋‹ด์„ ๊ถŒ์žฅํ•˜์„ธ์š”.'\n",
    "\n",
    "def make_prompt(question: str) -> str:\n",
    "    \"\"\"Build a single-turn prompt (system + user + open assistant header) in the LLaMA-3 chat format.\"\"\"\n",
    "    # NOTE(review): these are LLaMA-3 special tokens, while the default BASE_MODEL is a\n",
    "    # Gemma model -- confirm this template matches the format of the training `text` field.\n",
    "    return (\n",
    "        '<|begin_of_text|>'\n",
    "        '<|start_header_id|>system<|end_header_id|>\\n\\n'\n",
    "        f'{SYSTEM_PROMPT}'\n",
    "        '<|eot_id|>'\n",
    "        '<|start_header_id|>user<|end_header_id|>\\n\\n'\n",
    "        f'{question}'\n",
    "        '<|eot_id|>'\n",
    "        '<|start_header_id|>assistant<|end_header_id|>\\n\\n'\n",
    "    )\n",
    "\n",
    "def generate(question: str, max_new_tokens: int = 256) -> str:\n",
    "    \"\"\"Sample an answer to `question` from the fine-tuned model.\n",
    "\n",
    "    Args:\n",
    "        question: User question inserted into the chat template.\n",
    "        max_new_tokens: Upper bound on the number of generated tokens.\n",
    "\n",
    "    Returns:\n",
    "        The decoded continuation only (prompt tokens removed), stripped of\n",
    "        surrounding whitespace.\n",
    "    \"\"\"\n",
    "    # The model may still be in training mode after trainer.train(); switch to eval\n",
    "    # so LoRA dropout (LORA_DROPOUT) is not applied while generating.\n",
    "    model.eval()\n",
    "    prompt = make_prompt(question)\n",
    "    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)\n",
    "    with torch.no_grad():\n",
    "        outputs = model.generate(\n",
    "            **inputs,\n",
    "            max_new_tokens=max_new_tokens,\n",
    "            temperature=0.7,\n",
    "            do_sample=True,\n",
    "            top_p=0.9,\n",
    "            repetition_penalty=1.1,\n",
    "            eos_token_id=tokenizer.eos_token_id,\n",
    "            pad_token_id=tokenizer.pad_token_id,\n",
    "        )\n",
    "    # Slice off the prompt tokens so only the newly generated answer is decoded.\n",
    "    return tokenizer.decode(\n",
    "        outputs[0][inputs['input_ids'].shape[1]:],\n",
    "        skip_special_tokens=True\n",
    "    ).strip()\n",
    "\n",
    "# Test questions\n",
    "test_questions = [\n",
    "    '๋‘ํ†ต์ด 3์ผ์งธ ๊ณ„์†๋˜๊ณ  ๊ตฌ์—ญ์งˆ๋„ ๋‚˜๋Š”๋ฐ ์–ด๋–ป๊ฒŒ ํ•ด์•ผ ํ•˜๋‚˜์š”?',\n",
    "    '๋‹น๋‡จ ์ง„๋‹จ์„ ๋ฐ›์•˜๋Š”๋ฐ ์‹๋‹จ ๊ด€๋ฆฌ๋Š” ์–ด๋–ป๊ฒŒ ํ•ด์•ผ ํ•˜๋‚˜์š”?',\n",
    "    'ํ˜ˆ์••์ด 150/95 ์ •๋„ ๋‚˜์˜ค๋Š”๋ฐ ์œ„ํ—˜ํ•œ๊ฐ€์š”?',\n",
    "]\n",
    "\n",
    "print('=' * 60)\n",
    "print('ํŒŒ์ธํŠœ๋‹ ๋ชจ๋ธ ์ถ”๋ก  ํ…Œ์ŠคํŠธ')\n",
    "print('=' * 60)\n",
    "\n",
    "for q in test_questions:\n",
    "    print(f'\\n์งˆ๋ฌธ: {q}')\n",
    "    ans = generate(q)\n",
    "    print(f'๋‹ต๋ณ€: {ans}')\n",
    "    print('-' * 40)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## โœ… ์™„๋ฃŒ ํ›„ ๋‹ค์Œ ๋‹จ๊ณ„\n",
    "\n",
    "1. **ํ‰๊ฐ€** : `07_evaluation.py` ๋กœ ROUGE/BLEU ํŒŒ์ธํŠœ๋‹ ์ „ํ›„ ๋น„๊ต\n",
    "2. **RAG ์—ฐ๊ฒฐ** : `05_rag_indexer.py` โ†’ `06_rag_chain.py`\n",
    "3.
**API 배포** : `08_api.py` (FastAPI)\n", "\n", "### OOM 에러가 났다면\n", "```\n", "MAX_SEQ_LENGTH 줄이기 512 → 384\n", "LORA_R 줄이기 8 → 4\n", "target_modules 줄이기 q_proj, v_proj 만\n", "BASE_MODEL 교체 beomi/gemma-ko-2b (이미 최소)\n", "```\n" ] } ] }