# Streamlit app: LoRA fine-tuning of Llama 3.1 8B Instruct on an Uzbek instruction dataset.
import streamlit as st
import os
import logging
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
BitsAndBytesConfig
)
from datasets import load_dataset
import torch
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig
# Base checkpoint that is loaded and fine-tuned.
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Hub repository id the fine-tuned model is pushed to.
# NOTE(review): "Instuct" looks like a typo of "Instruct" — confirm against the
# actual Hub repository name before changing it.
NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"
# JSON training file handed to `load_dataset`.
DATASET_PATH = "app/datasets/train.json"
# Name of the environment variable that holds the Hugging Face access token.
HF_TOKEN_SECRET_NAME = "token"
# Emit timestamp, level and message for every record at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)
def get_hf_token():
    """Return the Hugging Face token read from the environment.

    The token is looked up under ``HF_TOKEN_SECRET_NAME``. When it is missing
    or empty, a warning is written to the log and shown in the Streamlit UI;
    the falsy value is still returned so the caller decides how to react.
    """
    hf_token = os.environ.get(HF_TOKEN_SECRET_NAME)
    if hf_token:
        return hf_token

    # No usable token: warn in both channels, then hand back the falsy value.
    logger.warning(f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Model yuklash yoki Hubga yuklashda muammo bo'lishi mumkin.")
    st.warning(f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Iltimos, Space sozlamalarida (Secrets) uni qo'shing.")
    return hf_token
@st.cache_resource(show_spinner="Model va Tokenizer yuklanmoqda...")
def load_tokenizer_model(model_id_to_load):
    """Load the tokenizer and the 4-bit quantized model for the given model ID.

    The result is cached through ``st.cache_resource``. If loading fails, the
    error is logged, shown in the Streamlit UI, and ``st.stop()`` is called.

    Args:
        model_id_to_load: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    access_token = get_hf_token()
    try:
        # 4-bit NF4 weights with bfloat16 as the compute dtype.
        four_bit_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        loaded_tokenizer = AutoTokenizer.from_pretrained(model_id_to_load, token=access_token)
        # Reuse the EOS token for padding and pad on the right-hand side.
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
        loaded_tokenizer.padding_side = "right"

        model_kwargs = {
            "token": access_token,
            "torch_dtype": torch.bfloat16,
            "device_map": "auto",
            "quantization_config": four_bit_config,
        }
        loaded_model = AutoModelForCausalLM.from_pretrained(model_id_to_load, **model_kwargs)

        logger.info(f"'{model_id_to_load}' modeli va tokenizer muvaffaqiyatli yuklandi.")
        return loaded_tokenizer, loaded_model
    except OSError as load_error:
        # OSError gets a UI message that lists the likely causes.
        headline = f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {load_error}"
        likely_causes = (
            f"Mumkin sabablar:\n"
            f"1. '{model_id_to_load}' noto'g'ri model ID.\n"
            f"2. Internet aloqasi yo'q.\n"
            f"3. Llama 3.1 gated model. HF Tokeningizda ushbu modelga kirish huquqi bormi?\n"
            f"4. HF Tokeningiz Space Secrets'ga ('{HF_TOKEN_SECRET_NAME}' nomi bilan) to'g'ri kiritilganmi?"
        )
        logger.error(headline)
        st.error(headline + "\n\n" + likely_causes)
        st.stop()
    except Exception as unexpected_error:
        # Anything else is reported as unexpected and also halts the run.
        logger.error(f"Kutilmagan xatolik ({model_id_to_load}): {unexpected_error}")
        st.error(f"Model/tokenizer yuklashda kutilmagan xatolik: {unexpected_error}")
        st.stop()
def format_instruction(sample):
    """Render one dataset record as a single training prompt string.

    Reads the ``instruction``, ``input`` and ``output`` keys of ``sample``.
    A key that is missing *or* present with the value ``None`` (e.g. a JSON
    null) is treated as an empty string, so the literal text "None" never
    leaks into the prompt. The ``Input:`` line is only emitted when ``input``
    contains non-whitespace text.

    Args:
        sample: Mapping with optional ``instruction``, ``input``, ``output``.

    Returns:
        The formatted prompt string.
    """
    # `.get(key, '')` alone is not enough: a key that exists with value None
    # would bypass the default, so coerce falsy values to ''.
    instruction = sample.get('instruction') or ''
    input_text = sample.get('input') or ''
    output = sample.get('output') or ''

    # The user turn carries the extra "Input:" line only for non-blank input.
    user_turn = instruction
    if input_text.strip():
        user_turn = f"{instruction}\nInput: {input_text}"

    # NOTE(review): this is the Llama-2 style [INST]/<<SYS>> layout, while
    # BASE_MODEL_ID points at a Llama 3.1 Instruct checkpoint — confirm this
    # template is intended before changing it.
    return (
        "<s>[INST] <<SYS>>\n"
        "You are a helpful Uzbek assistant.\n"
        "<</SYS>>\n"
        f"{user_turn} [/INST] {output} </s>"
    )
def fine_tune(target_model_id, dataset_path):
    """Load the base model, fine-tune it with LoRA and push it to the Hub.

    Loads ``BASE_MODEL_ID`` through ``load_tokenizer_model``, turns every
    record of the JSON dataset into a ``text`` column with
    ``format_instruction``, and trains with ``SFTTrainer``. Progress and
    errors are reported to the log and to the Streamlit UI.

    Args:
        target_model_id: Hub repository id passed as ``hub_model_id``.
        dataset_path: Path of the JSON file given to ``load_dataset``.

    Returns:
        A status string: the success message or a description of the error.
        Exceptions derived from ``Exception`` are caught and reported here
        rather than propagated.
    """
    st.info("Fine-tuning jarayoni boshlanmoqda...")

    hf_token = get_hf_token()
    if not hf_token:
        # The token is passed as `hub_token`, so stop early without it.
        st.error("Jarayonni davom ettirish uchun Hugging Face Token zarur.")
        return "Hugging Face Token topilmadi."

    try:
        logger.info(f"Asosiy model yuklanmoqda: {BASE_MODEL_ID}")
        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)

        logger.info(f"Dataset yuklanmoqda: {dataset_path}")
        try:
            dataset = load_dataset("json", data_files=dataset_path, split="train")
            # Replace all original columns with the single formatted 'text' column.
            formatted_dataset = dataset.map(
                lambda p: {'text': format_instruction(p)},
                remove_columns=list(dataset.features),
            )
            logger.info("Dataset formatlandi. Formatlangan ustun: 'text'.")
        except Exception as e:
            logger.error(f"Dataset yuklash yoki formatlashda xatolik: {e}")
            st.error(f"Dataset yuklash yoki formatlashda xatolik ({dataset_path}): {e}")
            return f"Datasetni qayta ishlashda xato: {e}"

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules="all-linear",
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        logger.info("LoRA konfiguratsiyasi tayyorlandi.")

        # The model is loaded with bfloat16 as torch/compute dtype (see
        # load_tokenizer_model), so mixed precision should be bf16 as well.
        # fp16 is kept only as a fallback for GPUs without bf16 support.
        use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

        training_args = TrainingArguments(
            output_dir="./fine-tuning-results",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-5,
            logging_steps=10,
            num_train_epochs=1,
            save_strategy="epoch",
            save_total_limit=1,
            optim="adamw_torch",
            bf16=use_bf16,
            fp16=not use_bf16,
            push_to_hub=True,
            hub_model_id=target_model_id,
            hub_token=hf_token,
            report_to="tensorboard",
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant': False},
        )
        logger.info("Training Arguments tayyorlandi.")

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=formatted_dataset,
            peft_config=lora_config,
            dataset_text_field="text",
            max_seq_length=1024,
            packing=False,
        )
        logger.info("SFTTrainer ishga tushirildi.")

        st.info("Trening boshlanmoqda... Bu biroz vaqt olishi mumkin.")
        logger.info("Trening boshlandi.")
        train_result = trainer.train()
        logger.info("Trening tugadi.")
        logger.info(f"Trening natijalari: {train_result}")

        success_message = (
            f"Fine-tuning muvaffaqiyatli yakunlandi! Model '{target_model_id}' manziliga yuklandi."
        )
        st.success(success_message)
        return success_message
    except Exception as e:
        # Top-level boundary for the UI: log with traceback and report.
        logger.error(f"Fine-tuning jarayonida xatolik: {e}", exc_info=True)
        st.error(f"Fine-tuning jarayonida kutilmagan xatolik yuz berdi: {e}")
        return f"Fine-tuningda xato: {e}"
if __name__ == "__main__":
    # Page chrome.
    st.set_page_config(page_title="Llama 3.1 Uzbek Fine-Tuning", layout="wide")
    st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")

    # Introductory text: what is trained, on which data, and the prerequisites.
    intro_text = f"""
Bu interfeys **{BASE_MODEL_ID}** modelini **{DATASET_PATH}** dataseti yordamida fine-tuning qilish uchun mo'ljallangan.
Natija **{NEW_MODEL_ID}** nomi bilan Hugging Face Hubga yuklanadi.
**Talablar:**
1. Space sozlamalarida (Secrets) `{HF_TOKEN_SECRET_NAME}` nomli Hugging Face **write** tokeni kiritilgan bo'lishi kerak.
2. Ushbu token bog'langan akkauntda `{BASE_MODEL_ID}` modeliga kirish huquqi (litsenziyani qabul qilish) bo'lishi kerak.
3. Hugging Face Hubda `{NEW_MODEL_ID}` nomli repository mavjud bo'lishi (yoki yaratishga ruxsat bo'lishi) kerak.
"""
    st.markdown(intro_text)

    # Summary box with the three configured identifiers, one paragraph each.
    summary_paragraphs = [
        f"**Asosiy Model:** `{BASE_MODEL_ID}`",
        f"**Dataset Yo'li:** `{DATASET_PATH}`",
        f"**Natijaviy Model ID:** `{NEW_MODEL_ID}`",
    ]
    st.info("\n\n".join(summary_paragraphs))

    # Training only starts on an explicit button click.
    start_requested = st.button("🚀 Fine-tuningni Boshlash")
    if start_requested:
        with st.spinner("Jarayon boshlanmoqda... Model yuklanishi va trening vaqt oladi."):
            outcome = fine_tune(NEW_MODEL_ID, DATASET_PATH)
        st.info(f"Jarayon holati: {outcome}")

    # Footer.
    st.markdown("---")
    st.markdown("Loglarni quyida yoki Space 'Logs' bo'limida kuzatishingiz mumkin.")
|