"""Streamlit app that fine-tunes Llama 3.1 8B on an Uzbek instruction dataset
with QLoRA (4-bit quantization + LoRA adapters) via TRL's SFTTrainer, then
pushes the trained model to the Hugging Face Hub.

Requires a Hugging Face *write* token in the Space secret named by
HF_TOKEN_SECRET_NAME, with access to the gated Llama 3.1 base model.
"""

import logging
import os

import streamlit as st
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM  # noqa: F401  (collator kept for future completion-only training)

# Gated base model — the HF token must have accepted its license.
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Target Hub repository for the fine-tuned model.
NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"
# JSON instruction dataset bundled with the Space.
DATASET_PATH = "app/datasets/train.json"
# Name of the Space secret / environment variable holding the HF token.
HF_TOKEN_SECRET_NAME = "token"

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)


def get_hf_token():
    """Return the Hugging Face token from the environment, or None.

    Emits a Streamlit warning (and a log warning) when the secret is
    missing, since both model download and Hub push depend on it.
    """
    token = os.environ.get(HF_TOKEN_SECRET_NAME)
    if not token:
        logger.warning(
            f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Model yuklash yoki "
            f"Hubga yuklashda muammo bo'lishi mumkin."
        )
        st.warning(
            f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Iltimos, Space "
            f"sozlamalarida (Secrets) uni qo'shing."
        )
    return token


@st.cache_resource(show_spinner="Model va Tokenizer yuklanmoqda...")
def load_tokenizer_model(model_id_to_load):
    """Load and cache the tokenizer and 4-bit quantized model for *model_id_to_load*.

    Returns a ``(tokenizer, model)`` tuple; on failure shows a Streamlit
    error and stops the script run via ``st.stop()``.
    """
    hf_token = get_hf_token()
    try:
        # NF4 4-bit quantization with bf16 compute — the QLoRA recipe.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id_to_load, token=hf_token)
        # Llama tokenizers ship without a pad token; reuse EOS and pad on the
        # right so completions are not truncated from the front.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        model = AutoModelForCausalLM.from_pretrained(
            model_id_to_load,
            token=hf_token,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=quantization_config,
        )
        logger.info(f"'{model_id_to_load}' modeli va tokenizer muvaffaqiyatli yuklandi.")
        return tokenizer, model
    except OSError as e:
        # OSError covers bad model IDs, missing network and gated-repo denials.
        logger.error(f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {e}")
        st.error(
            f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {e}\n\n"
            f"Mumkin sabablar:\n"
            f"1. '{model_id_to_load}' noto'g'ri model ID.\n"
            f"2. Internet aloqasi yo'q.\n"
            f"3. Llama 3.1 gated model. HF Tokeningizda ushbu modelga kirish huquqi bormi?\n"
            f"4. HF Tokeningiz Space Secrets'ga ('{HF_TOKEN_SECRET_NAME}' nomi bilan) to'g'ri kiritilganmi?"
        )
        st.stop()
    except Exception as e:
        logger.error(f"Kutilmagan xatolik ({model_id_to_load}): {e}")
        st.error(f"Model/tokenizer yuklashda kutilmagan xatolik: {e}")
        st.stop()


def format_instruction(sample):
    """Render one dataset row ({instruction, input, output}) into a single
    training-text string.

    NOTE(review): this is a Llama-2-style ``[INST]`` prompt format; Llama 3.1
    uses a different chat template (``<|start_header_id|>`` markers, ideally
    via ``tokenizer.apply_chat_template``). Confirm the intended template
    before training — kept as-is here to preserve existing behavior.
    """
    instruction = sample.get('instruction', '')
    input_text = sample.get('input', '')
    output = sample.get('output', '')
    if input_text and input_text.strip():
        return f"""[INST] <>
You are a helpful Uzbek assistant.
<>
{instruction}
Input: {input_text} [/INST]
{output}
"""
    else:
        return f"""[INST] <>
You are a helpful Uzbek assistant.
<>
{instruction} [/INST]
{output}
"""


def fine_tune(target_model_id, dataset_path):
    """Run the full fine-tuning pipeline and push the result to the Hub.

    Loads the base model, formats the JSON dataset into a ``text`` column,
    trains with LoRA via SFTTrainer, and returns a human-readable status
    string (also surfaced through Streamlit widgets).
    """
    st.info("Fine-tuning jarayoni boshlanmoqda...")
    hf_token = get_hf_token()
    if not hf_token:
        st.error("Jarayonni davom ettirish uchun Hugging Face Token zarur.")
        return "Hugging Face Token topilmadi."
    try:
        logger.info(f"Asosiy model yuklanmoqda: {BASE_MODEL_ID}")
        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)

        logger.info(f"Dataset yuklanmoqda: {dataset_path}")
        try:
            dataset = load_dataset("json", data_files=dataset_path, split="train")
            # Replace all original columns with a single formatted 'text' column.
            formatted_dataset = dataset.map(
                lambda p: {'text': format_instruction(p)},
                remove_columns=list(dataset.features),
            )
            logger.info(f"Dataset formatlandi. \nFormatlangan ustun: 'text'.")
        except Exception as e:
            logger.error(f"Dataset yuklash yoki formatlashda xatolik: {e}")
            st.error(f"Dataset yuklash yoki formatlashda xatolik ({dataset_path}): {e}")
            return f"Datasetni qayta ishlashda xato: {e}"

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules="all-linear",
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        logger.info("LoRA konfiguratsiyasi tayyorlandi.")

        training_args = TrainingArguments(
            output_dir="./fine-tuning-results",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-5,
            logging_steps=10,
            num_train_epochs=1,
            save_strategy="epoch",
            save_total_limit=1,
            optim="adamw_torch",
            # BUGFIX: was fp16=True. The model is loaded in bfloat16 with a
            # bf16 4-bit compute dtype; enabling fp16 AMP on top of that
            # mismatches dtypes (typically "attempting to unscale FP16
            # gradients" errors). bf16 matches the loading configuration.
            bf16=True,
            push_to_hub=True,
            hub_model_id=target_model_id,
            hub_token=hf_token,
            report_to="tensorboard",
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant': False},
        )
        logger.info("Training Arguments tayyorlandi.")

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=formatted_dataset,
            peft_config=lora_config,
            dataset_text_field="text",
            max_seq_length=1024,
            packing=False,
        )
        logger.info("SFTTrainer ishga tushirildi.")

        st.info("Trening boshlanmoqda... Bu biroz vaqt olishi mumkin.")
        logger.info("Trening boshlandi.")
        train_result = trainer.train()
        logger.info("Trening tugadi.")
        logger.info(f"Trening natijalari: {train_result}")

        st.success(f"Fine-tuning muvaffaqiyatli yakunlandi! Model '{target_model_id}' manziliga yuklandi.")
        return f"Fine-tuning muvaffaqiyatli yakunlandi! Model '{target_model_id}' manziliga yuklandi."
    except Exception as e:
        # Top-level boundary: log with traceback and surface to the UI.
        logger.error(f"Fine-tuning jarayonida xatolik: {e}", exc_info=True)
        st.error(f"Fine-tuning jarayonida kutilmagan xatolik yuz berdi: {e}")
        return f"Fine-tuningda xato: {e}"


if __name__ == "__main__":
    st.set_page_config(page_title="Llama 3.1 Uzbek Fine-Tuning", layout="wide")
    st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")
    st.markdown(f"""
Bu interfeys **{BASE_MODEL_ID}** modelini **{DATASET_PATH}** dataseti yordamida fine-tuning qilish uchun mo'ljallangan.
Natija **{NEW_MODEL_ID}** nomi bilan Hugging Face Hubga yuklanadi.

**Talablar:**
1. Space sozlamalarida (Secrets) `{HF_TOKEN_SECRET_NAME}` nomli Hugging Face **write** tokeni kiritilgan bo'lishi kerak.
2. Ushbu token bog'langan akkauntda `{BASE_MODEL_ID}` modeliga kirish huquqi (litsenziyani qabul qilish) bo'lishi kerak.
3. Hugging Face Hubda `{NEW_MODEL_ID}` nomli repository mavjud bo'lishi (yoki yaratishga ruxsat bo'lishi) kerak.
""")
    st.info(
        f"**Asosiy Model:** `{BASE_MODEL_ID}`\n\n"
        f"**Dataset Yo'li:** `{DATASET_PATH}`\n\n"
        f"**Natijaviy Model ID:** `{NEW_MODEL_ID}`"
    )
    if st.button("🚀 Fine-tuningni Boshlash"):
        with st.spinner("Jarayon boshlanmoqda... Model yuklanishi va trening vaqt oladi."):
            status = fine_tune(NEW_MODEL_ID, DATASET_PATH)
        st.info(f"Jarayon holati: {status}")
    st.markdown("---")
    st.markdown("Loglarni quyida yoki Space 'Logs' bo'limida kuzatishingiz mumkin.")