# Hugging Face Spaces page header (Space status: Running) - scrape residue, not part of the program.
import streamlit as st | |
import os | |
import logging | |
from transformers import ( | |
AutoTokenizer, | |
AutoModelForCausalLM, | |
TrainingArguments, | |
BitsAndBytesConfig | |
) | |
from datasets import load_dataset | |
import torch | |
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM | |
from peft import LoraConfig | |
# Hub identifiers: the checkpoint training starts from and the repository
# the fine-tuned result is pushed to.
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"

# Training data file, handed to the `json` dataset loader.
DATASET_PATH = "app/datasets/train.json"

# Name of the environment variable that holds the Hugging Face access token.
HF_TOKEN_SECRET_NAME = "token"

# Module-level logger; records are emitted as "<time> - <level> - <message>".
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
def get_hf_token():
    """Return the Hugging Face token read from the environment.

    The token is looked up under the variable named by
    ``HF_TOKEN_SECRET_NAME``. When it is missing or empty, a warning is
    written to the log and shown on the Streamlit page; the falsy value is
    still returned so that the caller decides how to proceed.

    Returns:
        The token string, or a falsy value (``None`` or ``""``) when unset.
    """
    secret_value = os.environ.get(HF_TOKEN_SECRET_NAME)
    if secret_value:
        return secret_value

    # Missing secret: report it in both places, but do not abort here.
    logger.warning(f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Model yuklash yoki Hubga yuklashda muammo bo'lishi mumkin.")
    st.warning(f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Iltimos, Space sozlamalarida (Secrets) uni qo'shing.")
    return secret_value
def load_tokenizer_model(model_id_to_load):
    """Load the tokenizer and the 4-bit quantized causal LM for a model ID.

    The tokenizer is configured to pad on the right using its EOS token.
    The model is loaded with an NF4 4-bit quantization config, bfloat16
    compute dtype and automatic device placement.

    Any failure is logged and shown on the Streamlit page, after which
    ``st.stop()`` is called. An ``OSError`` additionally lists the likely
    causes (wrong ID, no network, gated model, missing token secret).

    Args:
        model_id_to_load: Hub model ID passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple on success.
    """
    access_token = get_hf_token()
    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        tok = AutoTokenizer.from_pretrained(model_id_to_load, token=access_token)
        # Reuse EOS as the padding token and pad on the right.
        tok.pad_token = tok.eos_token
        tok.padding_side = "right"

        model_kwargs = {
            "token": access_token,
            "torch_dtype": torch.bfloat16,
            "device_map": "auto",
            "quantization_config": bnb_config,
        }
        loaded_model = AutoModelForCausalLM.from_pretrained(model_id_to_load, **model_kwargs)

        logger.info(f"'{model_id_to_load}' modeli va tokenizer muvaffaqiyatli yuklandi.")
        return tok, loaded_model
    except OSError as err:
        headline = f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {err}"
        logger.error(headline)
        # The page message is the headline followed by a checklist of likely causes.
        likely_causes = "\n".join([
            "Mumkin sabablar:",
            f"1. '{model_id_to_load}' noto'g'ri model ID.",
            "2. Internet aloqasi yo'q.",
            "3. Llama 3.1 gated model. HF Tokeningizda ushbu modelga kirish huquqi bormi?",
            f"4. HF Tokeningiz Space Secrets'ga ('{HF_TOKEN_SECRET_NAME}' nomi bilan) to'g'ri kiritilganmi?",
        ])
        st.error(headline + "\n\n" + likely_causes)
        st.stop()
    except Exception as err:
        logger.error(f"Kutilmagan xatolik ({model_id_to_load}): {err}")
        st.error(f"Model/tokenizer yuklashda kutilmagan xatolik: {err}")
        st.stop()
def format_instruction(sample):
    """Render one dataset record as a Llama 3.1 Instruct training string.

    The record is laid out as a three-turn conversation (system, user,
    assistant) using the Llama 3.1 header / ``<|eot_id|>`` markers. The
    previous Llama 2 markers (``<s>``, ``[INST]``, ``<<SYS>>``) are not the
    format of the Llama 3.1 Instruct base model this script fine-tunes.

    Args:
        sample: Mapping with optional ``instruction``, ``input`` and
            ``output`` keys. Missing keys and ``None`` values are treated as
            empty strings.

    Returns:
        The formatted training text. When ``input`` is non-blank it is
        appended to the user turn on its own ``Input:`` line.
    """
    def _field(key):
        # A key that is present but None must not be rendered as the
        # literal text "None" inside the prompt.
        value = sample.get(key)
        return "" if value is None else str(value)

    instruction = _field('instruction')
    input_text = _field('input')
    output = _field('output')

    user_message = instruction
    if input_text.strip():
        user_message = f"{instruction}\nInput: {input_text}"

    # `<|begin_of_text|>` is deliberately not emitted here.
    # NOTE(review): this assumes the trainer tokenizes the text column with
    # special tokens enabled so the tokenizer prepends BOS itself - confirm
    # for the installed TRL version.
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful Uzbek assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{output}<|eot_id|>"
    )
def fine_tune(target_model_id, dataset_path):
    """Fine-tune the base model with a LoRA adapter and push it to the Hub.

    Steps: fetch the Hugging Face token, load ``BASE_MODEL_ID`` through
    ``load_tokenizer_model``, read the JSON dataset at ``dataset_path``,
    render every record into a single ``text`` column with
    ``format_instruction``, then train for one epoch with ``SFTTrainer``.
    ``push_to_hub`` is enabled with ``target_model_id`` as the destination.

    Progress and failures are reported to the logger and the Streamlit page.

    Args:
        target_model_id: Hub repository ID used as ``hub_model_id``.
        dataset_path: Path given to ``load_dataset("json", data_files=...)``.

    Returns:
        A status string: a success message, or a description of what went
        wrong (missing token, dataset error, or any other exception).
    """
    st.info("Fine-tuning jarayoni boshlanmoqda...")

    hf_token = get_hf_token()
    if not hf_token:
        st.error("Jarayonni davom ettirish uchun Hugging Face Token zarur.")
        return "Hugging Face Token topilmadi."

    try:
        logger.info(f"Asosiy model yuklanmoqda: {BASE_MODEL_ID}")
        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)

        logger.info(f"Dataset yuklanmoqda: {dataset_path}")
        try:
            dataset = load_dataset("json", data_files=dataset_path, split="train")
            # Replace all original columns with one rendered 'text' column.
            formatted_dataset = dataset.map(
                lambda p: {'text': format_instruction(p)},
                remove_columns=list(dataset.features),
            )
            logger.info("Dataset formatlandi. Formatlangan ustun: 'text'.")
        except Exception as e:
            # Include the traceback, matching the outer handler below.
            logger.error(f"Dataset yuklash yoki formatlashda xatolik: {e}", exc_info=True)
            st.error(f"Dataset yuklash yoki formatlashda xatolik ({dataset_path}): {e}")
            return f"Datasetni qayta ishlashda xato: {e}"

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules="all-linear",
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        logger.info("LoRA konfiguratsiyasi tayyorlandi.")

        training_args = TrainingArguments(
            output_dir="./fine-tuning-results",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-5,
            logging_steps=10,
            num_train_epochs=1,
            save_strategy="epoch",
            save_total_limit=1,
            optim="adamw_torch",
            # The model is loaded in torch.bfloat16 with a bfloat16 4-bit
            # compute dtype, so mixed precision must be bf16 as well; the
            # previous fp16=True mixed two different half-precision formats.
            bf16=True,
            push_to_hub=True,
            hub_model_id=target_model_id,
            hub_token=hf_token,
            report_to="tensorboard",
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant': False},
        )
        logger.info("Training Arguments tayyorlandi.")

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=formatted_dataset,
            peft_config=lora_config,
            dataset_text_field="text",
            max_seq_length=1024,
            packing=False,
        )
        logger.info("SFTTrainer ishga tushirildi.")

        st.info("Trening boshlanmoqda... Bu biroz vaqt olishi mumkin.")
        logger.info("Trening boshlandi.")
        train_result = trainer.train()
        logger.info("Trening tugadi.")
        logger.info(f"Trening natijalari: {train_result}")

        success_message = f"Fine-tuning muvaffaqiyatli yakunlandi! Model '{target_model_id}' manziliga yuklandi."
        st.success(success_message)
        return success_message
    except Exception as e:
        logger.error(f"Fine-tuning jarayonida xatolik: {e}", exc_info=True)
        st.error(f"Fine-tuning jarayonida kutilmagan xatolik yuz berdi: {e}")
        return f"Fine-tuningda xato: {e}"
if __name__ == "__main__":
    # Page setup and title.
    st.set_page_config(layout="wide", page_title="Llama 3.1 Uzbek Fine-Tuning")
    st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")

    # Introductory text: what the page does and what must be configured first.
    intro_markdown = f"""
Bu interfeys **{BASE_MODEL_ID}** modelini **{DATASET_PATH}** dataseti yordamida fine-tuning qilish uchun mo'ljallangan.
Natija **{NEW_MODEL_ID}** nomi bilan Hugging Face Hubga yuklanadi.
**Talablar:**
1. Space sozlamalarida (Secrets) `{HF_TOKEN_SECRET_NAME}` nomli Hugging Face **write** tokeni kiritilgan bo'lishi kerak.
2. Ushbu token bog'langan akkauntda `{BASE_MODEL_ID}` modeliga kirish huquqi (litsenziyani qabul qilish) bo'lishi kerak.
3. Hugging Face Hubda `{NEW_MODEL_ID}` nomli repository mavjud bo'lishi (yoki yaratishga ruxsat bo'lishi) kerak.
"""
    st.markdown(intro_markdown)

    # Summary box with the three fixed settings, separated by blank lines.
    settings_summary = "\n\n".join([
        f"**Asosiy Model:** `{BASE_MODEL_ID}`",
        f"**Dataset Yo'li:** `{DATASET_PATH}`",
        f"**Natijaviy Model ID:** `{NEW_MODEL_ID}`",
    ])
    st.info(settings_summary)

    # Training runs only when the button is pressed.
    start_clicked = st.button("🚀 Fine-tuningni Boshlash")
    if start_clicked:
        with st.spinner("Jarayon boshlanmoqda... Model yuklanishi va trening vaqt oladi."):
            outcome = fine_tune(NEW_MODEL_ID, DATASET_PATH)
        st.info(f"Jarayon holati: {outcome}")

    st.markdown("---")
    st.markdown("Loglarni quyida yoki Space 'Logs' bo'limida kuzatishingiz mumkin.")