Spaces:
Running
Running
futzone
committed on
Commit
·
68dd6ff
1
Parent(s):
c7faa5e
update app.py
Browse files- app.py +181 -50
- requirements.txt +4 -3
app.py
CHANGED
@@ -1,68 +1,199 @@
|
|
1 |
import streamlit as st
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from datasets import load_dataset
|
5 |
import torch
|
|
|
|
|
6 |
|
7 |
-
|
|
|
|
|
|
|
8 |
|
|
|
|
|
9 |
|
10 |
-
@st.cache_resource
|
11 |
-
def load_tokenizer_model(model_id):
|
12 |
-
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
13 |
-
model = AutoModelForCausalLM.from_pretrained(
|
14 |
-
model_id,
|
15 |
-
load_in_8bit=True,
|
16 |
-
device_map="auto"
|
17 |
-
)
|
18 |
-
return tokenizer, model
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
def tokenize_function(example, tokenizer):
|
22 |
-
prompt = f"<s>[INST] {example['instruction']} [/INST] {example['output']} </s>"
|
23 |
-
return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
dataset = load_dataset("json", data_files=dataset_path)
|
39 |
-
tokenized = dataset["train"].map(lambda x: tokenize_function(x, tokenizer))
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
logging_dir="./logs",
|
46 |
-
save_strategy="no",
|
47 |
-
learning_rate=2e-4
|
48 |
-
)
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
)
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
60 |
|
|
|
61 |
|
62 |
-
model_id = st.text_input("Model ID", "behbudiy/Llama-3.1-8B-Instuct-Uz")
|
63 |
-
dataset_path = st.text_input("Dataset path", "app/datasets/train.json")
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import os
|
3 |
+
import logging
|
4 |
+
from transformers import (
|
5 |
+
AutoTokenizer,
|
6 |
+
AutoModelForCausalLM,
|
7 |
+
TrainingArguments,
|
8 |
+
BitsAndBytesConfig
|
9 |
+
)
|
10 |
from datasets import load_dataset
|
11 |
import torch
|
12 |
+
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
|
13 |
+
from peft import LoraConfig
|
14 |
|
15 |
+
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
16 |
+
NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"
|
17 |
+
DATASET_PATH = "app/datasets/train.json"
|
18 |
+
HF_TOKEN_SECRET_NAME = "token"
|
19 |
|
20 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
21 |
+
logger = logging.getLogger(__name__)
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
def get_hf_token():
    """Return the Hugging Face token stored in the environment.

    The variable name is taken from ``HF_TOKEN_SECRET_NAME``. When the
    variable is unset or empty, a warning is written to the log and shown
    in the Streamlit UI, and the falsy value is returned unchanged so the
    caller can decide how to proceed.
    """
    secret_value = os.environ.get(HF_TOKEN_SECRET_NAME)
    if secret_value:
        return secret_value

    # Missing/empty secret: warn in both the log and the UI, but do not raise.
    logger.warning(f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Model yuklash yoki Hubga yuklashda muammo bo'lishi mumkin.")
    st.warning(f"'{HF_TOKEN_SECRET_NAME}' nomli Secret topilmadi. Iltimos, Space sozlamalarida (Secrets) uni qo'shing.")
    return secret_value
|
30 |
|
|
|
|
|
|
|
31 |
|
32 |
+
@st.cache_resource(show_spinner="Model va Tokenizer yuklanmoqda...")
def load_tokenizer_model(model_id_to_load):
    """Load the tokenizer and a 4-bit quantized causal LM for the given model ID.

    The result is cached by Streamlit (``st.cache_resource``), so repeated
    calls with the same ``model_id_to_load`` reuse the loaded objects.

    Args:
        model_id_to_load: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple on success. On any loading error the
        failure is logged (with traceback), shown in the UI, and the script
        run is halted with ``st.stop()``.
    """
    hf_token = get_hf_token()
    try:
        # NF4 4-bit quantization with bfloat16 compute.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        tokenizer = AutoTokenizer.from_pretrained(
            model_id_to_load,
            token=hf_token
        )
        # Reuse EOS as the padding token and pad on the right.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        model = AutoModelForCausalLM.from_pretrained(
            model_id_to_load,
            token=hf_token,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=quantization_config
        )
        logger.info(f"'{model_id_to_load}' modeli va tokenizer muvaffaqiyatli yuklandi.")
        return tokenizer, model
    except OSError as e:
        # exc_info=True keeps the traceback in the log, matching fine_tune().
        logger.error(f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {e}", exc_info=True)
        st.error(f"Model yoki tokenizer yuklashda xatolik ({model_id_to_load}): {e}\n\n"
                 f"Mumkin sabablar:\n"
                 f"1. '{model_id_to_load}' noto'g'ri model ID.\n"
                 f"2. Internet aloqasi yo'q.\n"
                 f"3. Llama 3.1 gated model. HF Tokeningizda ushbu modelga kirish huquqi bormi?\n"
                 f"4. HF Tokeningiz Space Secrets'ga ('{HF_TOKEN_SECRET_NAME}' nomi bilan) to'g'ri kiritilganmi?")
        st.stop()
    except Exception as e:
        logger.error(f"Kutilmagan xatolik ({model_id_to_load}): {e}", exc_info=True)
        st.error(f"Model/tokenizer yuklashda kutilmagan xatolik: {e}")
        st.stop()
|
72 |
|
|
|
|
|
73 |
|
74 |
+
def format_instruction(sample):
    """Render one dataset record as a single prompt/response training string.

    Args:
        sample: Mapping with optional ``'instruction'``, ``'input'`` and
            ``'output'`` keys.

    Returns:
        The formatted text. The ``Input:`` line is included only when
        ``'input'`` contains non-whitespace text.
    """
    def _text(key):
        # A key that is present but null (JSON ``null``) must behave like a
        # missing key; otherwise the literal text "None" leaks into the prompt.
        value = sample.get(key)
        return "" if value is None else str(value)

    instruction = _text('instruction')
    input_text = _text('input')
    output = _text('output')

    # NOTE(review): this is the Llama-2 style "[INST] <<SYS>>" layout, while the
    # file's BASE_MODEL_ID is a Llama 3.1 model — confirm the template is intended.
    header = "<s>[INST] <<SYS>>\nYou are a helpful Uzbek assistant.\n<</SYS>>\n\n"

    user_turn = instruction
    if input_text.strip():
        user_turn = f"{instruction}\nInput: {input_text}"

    return f"{header}{user_turn} [/INST] {output} </s>"
|
92 |
|
|
|
|
|
93 |
|
94 |
+
def fine_tune(target_model_id, dataset_path):
    """Load the base model, run LoRA supervised fine-tuning, and push to the Hub.

    Args:
        target_model_id: Hub repository ID passed as ``hub_model_id``.
        dataset_path: Path to the JSON dataset given to ``load_dataset``.

    Returns:
        A human-readable status string describing success or the failure
        that occurred. Errors are reported in the UI and log rather than raised.
    """
    st.info("Fine-tuning jarayoni boshlanmoqda...")
    hf_token = get_hf_token()
    if not hf_token:
        st.error("Jarayonni davom ettirish uchun Hugging Face Token zarur.")
        return "Hugging Face Token topilmadi."

    try:
        logger.info(f"Asosiy model yuklanmoqda: {BASE_MODEL_ID}")
        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)

        logger.info(f"Dataset yuklanmoqda: {dataset_path}")
        try:
            dataset = load_dataset("json", data_files=dataset_path, split="train")
            # Collapse every record into a single 'text' column and drop the rest.
            formatted_dataset = dataset.map(
                lambda p: {'text': format_instruction(p)},
                remove_columns=list(dataset.features),
            )
            logger.info("Dataset formatlandi. Formatlangan ustun: 'text'.")
        except Exception as e:
            logger.error(f"Dataset yuklash yoki formatlashda xatolik: {e}")
            st.error(f"Dataset yuklash yoki formatlashda xatolik ({dataset_path}): {e}")
            return f"Datasetni qayta ishlashda xato: {e}"

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules="all-linear",
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        logger.info("LoRA konfiguratsiyasi tayyorlandi.")

        training_args = TrainingArguments(
            output_dir="./fine-tuning-results",
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=2e-5,
            logging_steps=10,
            num_train_epochs=1,
            save_strategy="epoch",
            save_total_limit=1,
            optim="adamw_torch",
            # The model is loaded with torch_dtype=bfloat16 and a bfloat16
            # quantization compute dtype, so train in bf16 rather than fp16
            # to keep the mixed-precision mode consistent with the weights.
            bf16=True,
            push_to_hub=True,
            hub_model_id=target_model_id,
            hub_token=hf_token,
            report_to="tensorboard",
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant': False}
        )
        logger.info("Training Arguments tayyorlandi.")

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=formatted_dataset,
            peft_config=lora_config,
            dataset_text_field="text",
            max_seq_length=1024,
            packing=False,
        )
        logger.info("SFTTrainer ishga tushirildi.")

        st.info("Trening boshlanmoqda... Bu biroz vaqt olishi mumkin.")
        logger.info("Trening boshlandi.")
        train_result = trainer.train()
        logger.info("Trening tugadi.")
        logger.info(f"Trening natijalari: {train_result}")

        success_message = f"Fine-tuning muvaffaqiyatli yakunlandi! Model '{target_model_id}' manziliga yuklandi."
        st.success(success_message)
        return success_message

    except Exception as e:
        logger.error(f"Fine-tuning jarayonida xatolik: {e}", exc_info=True)
        st.error(f"Fine-tuning jarayonida kutilmagan xatolik yuz berdi: {e}")
        return f"Fine-tuningda xato: {e}"
|
173 |
+
|
174 |
+
|
175 |
+
# Streamlit entry point: renders the page and starts fine-tuning on button click.
if __name__ == "__main__":
    st.set_page_config(page_title="Llama 3.1 Uzbek Fine-Tuning", layout="wide")
    st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")

    # Introductory text: what will be trained, where the result goes, and the prerequisites.
    st.markdown(f"""
Bu interfeys **{BASE_MODEL_ID}** modelini **{DATASET_PATH}** dataseti yordamida fine-tuning qilish uchun mo'ljallangan.
Natija **{NEW_MODEL_ID}** nomi bilan Hugging Face Hubga yuklanadi.

**Talablar:**
1. Space sozlamalarida (Secrets) `{HF_TOKEN_SECRET_NAME}` nomli Hugging Face **write** tokeni kiritilgan bo'lishi kerak.
2. Ushbu token bog'langan akkauntda `{BASE_MODEL_ID}` modeliga kirish huquqi (litsenziyani qabul qilish) bo'lishi kerak.
3. Hugging Face Hubda `{NEW_MODEL_ID}` nomli repository mavjud bo'lishi (yoki yaratishga ruxsat bo'lishi) kerak.
""")

    # Summary of the fixed configuration taken from the module-level constants.
    st.info(f"**Asosiy Model:** `{BASE_MODEL_ID}`\n\n"
            f"**Dataset Yo'li:** `{DATASET_PATH}`\n\n"
            f"**Natijaviy Model ID:** `{NEW_MODEL_ID}`")

    # Training runs synchronously inside the button handler; the returned
    # status string is displayed once fine_tune() finishes.
    if st.button("🚀 Fine-tuningni Boshlash"):
        with st.spinner("Jarayon boshlanmoqda... Model yuklanishi va trening vaqt oladi."):
            status = fine_tune(NEW_MODEL_ID, DATASET_PATH)
        st.info(f"Jarayon holati: {status}")

    st.markdown("---")
    st.markdown("Loglarni quyida yoki Space 'Logs' bo'limida kuzatishingiz mumkin.")
|
requirements.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
|
|
|
|
1 |
transformers
|
2 |
datasets
|
|
|
3 |
peft
|
4 |
accelerate
|
5 |
-
bitsandbytes
|
6 |
-
torch
|
7 |
-
streamlit
|
|
|
1 |
+
streamlit
|
2 |
+
torch
|
3 |
transformers
|
4 |
datasets
|
5 |
+
trl
|
6 |
peft
|
7 |
accelerate
|
8 |
+
bitsandbytes
|
|
|
|