File size: 7,647 Bytes
48b3ff3
68dd6ff
 
 
 
 
 
 
 
48b3ff3
 
68dd6ff
 
48b3ff3
68dd6ff
 
 
 
48b3ff3
68dd6ff
 
48b3ff3
 
68dd6ff
 
 
 
 
 
48b3ff3
 
68dd6ff
 
 
 
 
 
 
 
 
 
48b3ff3
68dd6ff
 
 
 
 
 
48b3ff3
68dd6ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48b3ff3
 
68dd6ff
 
 
 
48b3ff3
68dd6ff
 
 
 
48b3ff3
68dd6ff
 
 
 
 
 
48b3ff3
68dd6ff
48b3ff3
 
68dd6ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import streamlit as st
import os
import logging
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig
)
from datasets import load_dataset
import torch
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig

# Hub id of the base checkpoint that gets fine-tuned (a gated Llama 3.1 model).
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Hub repository id the fine-tuned model is pushed to.
# NOTE(review): "Instuct" looks like a typo of "Instruct" - confirm against the
# real repository name before changing it.
NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"
# Relative path of the JSON training dataset.
DATASET_PATH = "app/datasets/train.json"
# Name of the environment variable that holds the Hugging Face token.
HF_TOKEN_SECRET_NAME = "token"

# Emit INFO-and-above records as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def get_hf_token():
    """Return the Hugging Face token read from the environment.

    The token is looked up under the variable named by
    ``HF_TOKEN_SECRET_NAME``. When it is missing or empty, a warning is
    written to the log and shown in the Streamlit UI; the falsy value is
    still returned so the caller decides how to proceed.
    """
    secret = os.getenv(HF_TOKEN_SECRET_NAME)
    if secret:
        return secret

    # No usable token: tell both the operator (log) and the user (UI).
    log_message = f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Model yuklash yoki Hubga yuklashda muammo bo'lishi mumkin."
    ui_message = f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Iltimos, Space sozlamalarida (Secrets) uni qo'shing."
    logger.warning(log_message)
    st.warning(ui_message)
    return secret


@st.cache_resource(show_spinner="Model va Tokenizer yuklanmoqda...")
def load_tokenizer_model(model_id_to_load):
    """Load the tokenizer and the 4-bit quantized causal LM for a model id.

    The function is wrapped in ``st.cache_resource``. If loading fails, the
    error is logged, shown in the Streamlit UI, and the script run is halted
    with ``st.stop()``.

    Args:
        model_id_to_load: Model identifier handed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    access_token = get_hf_token()
    try:
        # 4-bit NF4 quantization with bfloat16 as the compute dtype.
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        tok = AutoTokenizer.from_pretrained(model_id_to_load, token=access_token)
        # Pad with the EOS token, on the right-hand side.
        tok.pad_token = tok.eos_token
        tok.padding_side = "right"

        model_kwargs = {
            "token": access_token,
            "torch_dtype": torch.bfloat16,
            "device_map": "auto",
            "quantization_config": nf4_config,
        }
        llm = AutoModelForCausalLM.from_pretrained(model_id_to_load, **model_kwargs)
        logger.info(f"'{model_id_to_load}' modeli va tokenizer muvaffaqiyatli yuklandi.")
        return tok, llm
    except OSError as err:
        # OSError gets a checklist of likely causes appended to the UI message.
        headline = f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {err}"
        hints = (
            "Mumkin sabablar:\n"
            f"1. '{model_id_to_load}' noto'g'ri model ID.\n"
            "2. Internet aloqasi yo'q.\n"
            "3. Llama 3.1 gated model. HF Tokeningizda ushbu modelga kirish huquqi bormi?\n"
            f"4. HF Tokeningiz Space Secrets'ga ('{HF_TOKEN_SECRET_NAME}' nomi bilan) to'g'ri kiritilganmi?"
        )
        logger.error(headline)
        st.error(f"{headline}\n\n{hints}")
        st.stop()
    except Exception as err:
        # Anything else is reported as an unexpected failure.
        logger.error(f"Kutilmagan xatolik ({model_id_to_load}): {err}")
        st.error(f"Model/tokenizer yuklashda kutilmagan xatolik: {err}")
        st.stop()


def format_instruction(sample):
    """Render one dataset record as a single training prompt string.

    Reads the ``instruction``, ``input`` and ``output`` keys of ``sample``.
    A key that is absent or holds ``None`` is treated as an empty string, so
    a null field never shows up as the literal text ``"None"`` in the prompt;
    other non-string values are converted with ``str``. When ``input`` is
    non-blank it is appended to the instruction on an ``Input:`` line.

    NOTE(review): the layout below is the Llama-2 style ``[INST]``/``<<SYS>>``
    format, while BASE_MODEL_ID names a Llama 3.1 Instruct checkpoint -
    confirm this template is intended for that model.

    Args:
        sample: Mapping with optional ``instruction``, ``input`` and
            ``output`` entries.

    Returns:
        The formatted prompt, wrapped in ``<s>`` ... ``</s>``.
    """

    def _field(key):
        # Normalise missing/null values to '' and everything else to str.
        value = sample.get(key)
        return '' if value is None else str(value)

    instruction = _field('instruction')
    input_text = _field('input')
    output = _field('output')

    # A whitespace-only input counts as "no input".
    user_prompt = instruction
    if input_text.strip():
        user_prompt = f"{instruction}\nInput: {input_text}"

    return (
        "<s>[INST] <<SYS>>\n"
        "You are a helpful Uzbek assistant.\n"
        "<</SYS>>\n"
        "\n"
        f"{user_prompt} [/INST] {output} </s>"
    )


def fine_tune(target_model_id, dataset_path):
    """Fine-tune the base model with LoRA and push the result to the Hub.

    Loads ``BASE_MODEL_ID`` through ``load_tokenizer_model``, reads the JSON
    dataset at ``dataset_path``, renders every record into a single ``text``
    column with ``format_instruction`` and runs one epoch of ``SFTTrainer``
    with a LoRA adapter. ``TrainingArguments`` is created with
    ``push_to_hub=True`` and ``hub_model_id=target_model_id``.

    Progress and errors are reported both to the logger and to the Streamlit
    UI. Exceptions derived from ``Exception`` are caught and turned into a
    status message instead of propagating.

    Args:
        target_model_id: Hub repository id used as ``hub_model_id``.
        dataset_path: Path given to ``load_dataset("json", data_files=...)``.

    Returns:
        A human-readable status string (success or failure description).
    """
    st.info("Fine-tuning jarayoni boshlanmoqda...")
    hf_token = get_hf_token()
    if not hf_token:
        st.error("Jarayonni davom ettirish uchun Hugging Face Token zarur.")
        return "Hugging Face Token topilmadi."

    try:
        logger.info(f"Asosiy model yuklanmoqda: {BASE_MODEL_ID}")
        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)

        logger.info(f"Dataset yuklanmoqda: {dataset_path}")
        try:
            dataset = load_dataset("json", data_files=dataset_path, split="train")
            # Replace every original column with one formatted 'text' column.
            formatted_dataset = dataset.map(
                lambda p: {'text': format_instruction(p)},
                remove_columns=list(dataset.features),
            )
            logger.info("Dataset formatlandi. Formatlangan ustun: 'text'.")
        except Exception as e:
            logger.error(f"Dataset yuklash yoki formatlashda xatolik: {e}")
            st.error(f"Dataset yuklash yoki formatlashda xatolik ({dataset_path}): {e}")
            return f"Datasetni qayta ishlashda xato: {e}"

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules="all-linear",
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        logger.info("LoRA konfiguratsiyasi tayyorlandi.")

        # load_tokenizer_model loads the weights in bfloat16 and uses bfloat16
        # as the 4-bit compute dtype, so train in bf16 whenever the GPU
        # supports it. Hardware without bf16 keeps the previous fp16 setting.
        use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

        training_args = TrainingArguments(
            output_dir="./fine-tuning-results",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-5,
            logging_steps=10,
            num_train_epochs=1,

            save_strategy="epoch",
            save_total_limit=1,
            optim="adamw_torch",
            bf16=use_bf16,
            fp16=not use_bf16,
            push_to_hub=True,
            hub_model_id=target_model_id,
            hub_token=hf_token,
            report_to="tensorboard",
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant': False}
        )
        logger.info("Training Arguments tayyorlandi.")

        # NOTE(review): the model comes from an st.cache_resource-cached
        # loader and is handed to SFTTrainer together with a peft_config;
        # confirm a second run in the same session gets a clean base model.
        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=formatted_dataset,
            peft_config=lora_config,
            dataset_text_field="text",
            max_seq_length=1024,
            packing=False,

        )
        logger.info("SFTTrainer ishga tushirildi.")

        st.info("Trening boshlanmoqda... Bu biroz vaqt olishi mumkin.")
        logger.info("Trening boshlandi.")
        train_result = trainer.train()
        logger.info("Trening tugadi.")
        logger.info(f"Trening natijalari: {train_result}")

        # One message for both the UI banner and the returned status.
        success_message = f"Fine-tuning muvaffaqiyatli yakunlandi! Model '{target_model_id}' manziliga yuklandi."
        st.success(success_message)
        return success_message

    except Exception as e:
        # Top-level boundary: log with traceback and report instead of raising.
        logger.error(f"Fine-tuning jarayonida xatolik: {e}", exc_info=True)
        st.error(f"Fine-tuning jarayonida kutilmagan xatolik yuz berdi: {e}")
        return f"Fine-tuningda xato: {e}"


if __name__ == "__main__":
    # Streamlit page: describe the job, then start fine-tuning on button press.
    st.set_page_config(page_title="Llama 3.1 Uzbek Fine-Tuning", layout="wide")
    st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")

    # Introductory text listing the requirements for a successful run.
    intro_markdown = f"""
    Bu interfeys **{BASE_MODEL_ID}** modelini **{DATASET_PATH}** dataseti yordamida fine-tuning qilish uchun mo'ljallangan.
    Natija **{NEW_MODEL_ID}** nomi bilan Hugging Face Hubga yuklanadi.

    **Talablar:**
    1.  Space sozlamalarida (Secrets) `{HF_TOKEN_SECRET_NAME}` nomli Hugging Face **write** tokeni kiritilgan bo'lishi kerak.
    2.  Ushbu token bog'langan akkauntda `{BASE_MODEL_ID}` modeliga kirish huquqi (litsenziyani qabul qilish) bo'lishi kerak.
    3.  Hugging Face Hubda `{NEW_MODEL_ID}` nomli repository mavjud bo'lishi (yoki yaratishga ruxsat bo'lishi) kerak.
    """
    st.markdown(intro_markdown)

    # Summary box: one paragraph per configured value.
    summary_parts = (
        f"**Asosiy Model:** `{BASE_MODEL_ID}`",
        f"**Dataset Yo'li:** `{DATASET_PATH}`",
        f"**Natijaviy Model ID:** `{NEW_MODEL_ID}`",
    )
    st.info("\n\n".join(summary_parts))

    start_requested = st.button("🚀 Fine-tuningni Boshlash")
    if start_requested:
        with st.spinner("Jarayon boshlanmoqda... Model yuklanishi va trening vaqt oladi."):
            status = fine_tune(NEW_MODEL_ID, DATASET_PATH)
            st.info(f"Jarayon holati: {status}")

    # Footer: separator followed by a pointer to the logs.
    for footer_text in (
        "---",
        "Loglarni quyida yoki Space 'Logs' bo'limida kuzatishingiz mumkin.",
    ):
        st.markdown(footer_text)