import streamlit as st
import os
import logging
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig
)
from datasets import load_dataset
import torch
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"
DATASET_PATH = "app/datasets/train.json"
HF_TOKEN_SECRET_NAME = "token"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def get_hf_token():
    token = os.environ.get(HF_TOKEN_SECRET_NAME)
    if not token:
        logger.warning(f"No Secret named '{HF_TOKEN_SECRET_NAME}' was found. Loading the model or pushing to the Hub may fail.")
        st.warning(f"No Secret named '{HF_TOKEN_SECRET_NAME}' was found. Please add it in the Space settings (Secrets).")
    return token
@st.cache_resource(show_spinner="Loading model and tokenizer...")
def load_tokenizer_model(model_id_to_load):
    """Loads the tokenizer and model for the given model ID."""
    hf_token = get_hf_token()
    try:
        # 4-bit NF4 quantization keeps the 8B model within a single-GPU memory budget.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_id_to_load,
            token=hf_token
        )
        # Llama tokenizers ship without a pad token; reuse EOS and pad on the right for training.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        model = AutoModelForCausalLM.from_pretrained(
            model_id_to_load,
            token=hf_token,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=quantization_config
        )
        logger.info(f"Model and tokenizer '{model_id_to_load}' loaded successfully.")
        return tokenizer, model
    except OSError as e:
        logger.error(f"Error while loading the model or tokenizer ({model_id_to_load}): {e}")
        st.error(f"Error while loading the model or tokenizer ({model_id_to_load}): {e}\n\n"
                 f"Possible causes:\n"
                 f"1. '{model_id_to_load}' is not a valid model ID.\n"
                 f"2. No internet connection.\n"
                 f"3. Llama 3.1 is a gated model. Does your HF token have access to it?\n"
                 f"4. Is your HF token set correctly in the Space Secrets (under the name '{HF_TOKEN_SECRET_NAME}')?")
        st.stop()
    except Exception as e:
        logger.error(f"Unexpected error ({model_id_to_load}): {e}")
        st.error(f"Unexpected error while loading the model/tokenizer: {e}")
        st.stop()
def format_instruction(sample):
    """Builds a single training prompt from an instruction/input/output record.

    Note: this uses a Llama-2-style [INST]/<<SYS>> template rather than the
    Llama 3.1 chat template; SFTTrainer simply trains on the raw text.
    """
    instruction = sample.get('instruction', '')
    input_text = sample.get('input', '')
    output = sample.get('output', '')
    if input_text and input_text.strip():
        return f"""<s>[INST] <<SYS>>
You are a helpful Uzbek assistant.
<</SYS>>
{instruction}
Input: {input_text} [/INST] {output} </s>"""
    else:
        return f"""<s>[INST] <<SYS>>
You are a helpful Uzbek assistant.
<</SYS>>
{instruction} [/INST] {output} </s>"""
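# Illustrative only (not executed): for a hypothetical record such as
# {"instruction": "Translate to Uzbek.", "input": "Good morning", "output": "Xayrli tong"},
# format_instruction returns roughly:
#
#   <s>[INST] <<SYS>>
#   You are a helpful Uzbek assistant.
#   <</SYS>>
#   Translate to Uzbek.
#   Input: Good morning [/INST] Xayrli tong </s>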
def fine_tune(target_model_id, dataset_path):
    """Loads the base model, fine-tunes it with LoRA, and pushes the result to the Hub."""
    st.info("Starting the fine-tuning process...")
    hf_token = get_hf_token()
    if not hf_token:
        st.error("A Hugging Face token is required to continue.")
        return "Hugging Face token not found."
    try:
        logger.info(f"Loading base model: {BASE_MODEL_ID}")
        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)
        logger.info(f"Loading dataset: {dataset_path}")
        try:
            dataset = load_dataset("json", data_files=dataset_path, split="train")
            # Render each record into a single prompt string and keep only the 'text' column.
            formatted_dataset = dataset.map(lambda p: {'text': format_instruction(p)}, remove_columns=list(dataset.features))
            logger.info("Dataset formatted. Formatted column: 'text'.")
        except Exception as e:
            logger.error(f"Error while loading or formatting the dataset: {e}")
            st.error(f"Error while loading or formatting the dataset ({dataset_path}): {e}")
            return f"Error while processing the dataset: {e}"
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules="all-linear",
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        logger.info("LoRA configuration prepared.")
        training_args = TrainingArguments(
            output_dir="./fine-tuning-results",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,  # effective batch size of 8
            learning_rate=2e-5,
            logging_steps=10,
            num_train_epochs=1,
            save_strategy="epoch",
            save_total_limit=1,
            optim="adamw_torch",
            fp16=True,  # note: the model is loaded in bfloat16; bf16=True may be more consistent on GPUs that support it
            push_to_hub=True,
            hub_model_id=target_model_id,
            hub_token=hf_token,
            report_to="tensorboard",
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant': False}
        )
        logger.info("Training arguments prepared.")
        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=formatted_dataset,
            peft_config=lora_config,
            dataset_text_field="text",
            max_seq_length=1024,
            packing=False,
        )
        logger.info("SFTTrainer initialized.")
        st.info("Training is starting... This may take a while.")
        logger.info("Training started.")
        train_result = trainer.train()
        logger.info("Training finished.")
        logger.info(f"Training results: {train_result}")
        st.success(f"Fine-tuning completed successfully! The model was pushed to '{target_model_id}'.")
        return f"Fine-tuning completed successfully! The model was pushed to '{target_model_id}'."
    except Exception as e:
        logger.error(f"Error during fine-tuning: {e}", exc_info=True)
        st.error(f"An unexpected error occurred during fine-tuning: {e}")
        return f"Fine-tuning error: {e}"
if __name__ == "__main__":
st.set_page_config(page_title="Llama 3.1 Uzbek Fine-Tuning", layout="wide")
st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")
st.markdown(f"""
Bu interfeys **{BASE_MODEL_ID}** modelini **{DATASET_PATH}** dataseti yordamida fine-tuning qilish uchun mo'ljallangan.
Natija **{NEW_MODEL_ID}** nomi bilan Hugging Face Hubga yuklanadi.
**Talablar:**
1. Space sozlamalarida (Secrets) `{HF_TOKEN_SECRET_NAME}` nomli Hugging Face **write** tokeni kiritilgan bo'lishi kerak.
2. Ushbu token bog'langan akkauntda `{BASE_MODEL_ID}` modeliga kirish huquqi (litsenziyani qabul qilish) bo'lishi kerak.
3. Hugging Face Hubda `{NEW_MODEL_ID}` nomli repository mavjud bo'lishi (yoki yaratishga ruxsat bo'lishi) kerak.
""")
st.info(f"**Asosiy Model:** `{BASE_MODEL_ID}`\n\n"
f"**Dataset Yo'li:** `{DATASET_PATH}`\n\n"
f"**Natijaviy Model ID:** `{NEW_MODEL_ID}`")
if st.button("🚀 Fine-tuningni Boshlash"):
with st.spinner("Jarayon boshlanmoqda... Model yuklanishi va trening vaqt oladi."):
status = fine_tune(NEW_MODEL_ID, DATASET_PATH)
st.info(f"Jarayon holati: {status}")
st.markdown("---")
st.markdown("Loglarni quyida yoki Space 'Logs' bo'limida kuzatishingiz mumkin.")