futzone committed
Commit 68dd6ff · Parent: c7faa5e

update app.py

Files changed (2):
  1. app.py +181 -50
  2. requirements.txt +4 -3
app.py CHANGED
@@ -1,68 +1,199 @@
 import streamlit as st
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
-from peft import get_peft_model, LoraConfig, TaskType
 from datasets import load_dataset
 import torch
-
-st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")
-
-
-@st.cache_resource
-def load_tokenizer_model(model_id):
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        load_in_8bit=True,
-        device_map="auto"
-    )
-    return tokenizer, model
-
-
-def tokenize_function(example, tokenizer):
-    prompt = f"<s>[INST] {example['instruction']} [/INST] {example['output']} </s>"
-    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
-
-
-def fine_tune(model_id, dataset_path):
-    tokenizer, model = load_tokenizer_model(model_id)
-
-    # LoRA config
-    lora_config = LoraConfig(
-        task_type=TaskType.CAUSAL_LM,
-        r=8,
-        lora_alpha=32,
-        lora_dropout=0.1
-    )
-    model = get_peft_model(model, lora_config)
-
-    dataset = load_dataset("json", data_files=dataset_path)
-    tokenized = dataset["train"].map(lambda x: tokenize_function(x, tokenizer))
-
-    training_args = TrainingArguments(
-        per_device_train_batch_size=1,
-        num_train_epochs=2,
-        output_dir="./results",
-        logging_dir="./logs",
-        save_strategy="no",
-        learning_rate=2e-4
-    )
-
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized
-    )
-
-    trainer.train()
-    model.save_pretrained("finetuned_model")
-    tokenizer.save_pretrained("finetuned_model")
-    return "Fine-tuning finished!"
-
-
-model_id = st.text_input("Model ID", "behbudiy/Llama-3.1-8B-Instuct-Uz")
-dataset_path = st.text_input("Dataset path", "app/datasets/train.json")
-
-if st.button("🚀 Start Fine-Tuning"):
-    st.warning("Fine-tuning started! This will take a while...")
-    status = fine_tune(model_id, dataset_path)
-    st.success(status)
 import streamlit as st
+import os
+import logging
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TrainingArguments,
+    BitsAndBytesConfig
+)
 from datasets import load_dataset
 import torch
+from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
+from peft import LoraConfig
+
+BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+NEW_MODEL_ID = "behbudiy/Llama-3.1-8B-Instuct-Uz"
+DATASET_PATH = "app/datasets/train.json"
+HF_TOKEN_SECRET_NAME = "token"
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+
+def get_hf_token():
+    token = os.environ.get(HF_TOKEN_SECRET_NAME)
+    if not token:
+        logger.warning(f"Secret named '{HF_TOKEN_SECRET_NAME}' was not found. Loading the model or pushing to the Hub may fail.")
+        st.warning(f"Secret named '{HF_TOKEN_SECRET_NAME}' was not found. Please add it in the Space settings (Secrets).")
+    return token
+
+
+@st.cache_resource(show_spinner="Loading model and tokenizer...")
+def load_tokenizer_model(model_id_to_load):
+    """Load the tokenizer and model for the given ID."""
+    hf_token = get_hf_token()
+    try:
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id_to_load,
+            token=hf_token
+        )
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "right"
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id_to_load,
+            token=hf_token,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            quantization_config=quantization_config
+        )
+        logger.info(f"Model and tokenizer '{model_id_to_load}' loaded successfully.")
+        return tokenizer, model
+    except OSError as e:
+        logger.error(f"Error loading the model or tokenizer ({model_id_to_load}): {e}")
+        st.error(f"Error loading the model or tokenizer ({model_id_to_load}): {e}\n\n"
+                 f"Possible causes:\n"
+                 f"1. '{model_id_to_load}' is not a valid model ID.\n"
+                 f"2. No internet connection.\n"
+                 f"3. Llama 3.1 is a gated model. Does your HF token have access to it?\n"
+                 f"4. Is your HF token entered correctly in the Space Secrets (under the name '{HF_TOKEN_SECRET_NAME}')?")
+        st.stop()
+    except Exception as e:
+        logger.error(f"Unexpected error ({model_id_to_load}): {e}")
+        st.error(f"Unexpected error while loading the model/tokenizer: {e}")
+        st.stop()
+
+
+def format_instruction(sample):
+    instruction = sample.get('instruction', '')
+    input_text = sample.get('input', '')
+    output = sample.get('output', '')
+
+    if input_text and input_text.strip():
+        return f"""<s>[INST] <<SYS>>
+You are a helpful Uzbek assistant.
+<</SYS>>
+
+{instruction}
+Input: {input_text} [/INST] {output} </s>"""
+    else:
+        return f"""<s>[INST] <<SYS>>
+You are a helpful Uzbek assistant.
+<</SYS>>
+
+{instruction} [/INST] {output} </s>"""
+
+
+def fine_tune(target_model_id, dataset_path):
+    """Load the base model, fine-tune it, and push the result to the Hub."""
+    st.info("Starting the fine-tuning process...")
+    hf_token = get_hf_token()
+    if not hf_token:
+        st.error("A Hugging Face token is required to continue.")
+        return "Hugging Face token not found."
+
+    try:
+        logger.info(f"Loading base model: {BASE_MODEL_ID}")
+        tokenizer, model = load_tokenizer_model(BASE_MODEL_ID)
+
+        logger.info(f"Loading dataset: {dataset_path}")
+        try:
+            dataset = load_dataset("json", data_files=dataset_path, split="train")
+            formatted_dataset = dataset.map(lambda p: {'text': format_instruction(p)}, remove_columns=list(dataset.features))
+            logger.info("Dataset formatted into the 'text' column.")
+        except Exception as e:
+            logger.error(f"Error loading or formatting the dataset: {e}")
+            st.error(f"Error loading or formatting the dataset ({dataset_path}): {e}")
+            return f"Error processing the dataset: {e}"
+
+        lora_config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules="all-linear",
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM"
+        )
+        logger.info("LoRA configuration prepared.")
+
+        training_args = TrainingArguments(
+            output_dir="./fine-tuning-results",
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=4,
+            learning_rate=2e-5,
+            logging_steps=10,
+            num_train_epochs=1,
+            save_strategy="epoch",
+            save_total_limit=1,
+            optim="adamw_torch",
+            fp16=True,
+            push_to_hub=True,
+            hub_model_id=target_model_id,
+            hub_token=hf_token,
+            report_to="tensorboard",
+            gradient_checkpointing=True,
+            gradient_checkpointing_kwargs={'use_reentrant': False}
+        )
+        logger.info("Training arguments prepared.")
+
+        trainer = SFTTrainer(
+            model=model,
+            tokenizer=tokenizer,
+            args=training_args,
+            train_dataset=formatted_dataset,
+            peft_config=lora_config,
+            dataset_text_field="text",
+            max_seq_length=1024,
+            packing=False
+        )
+        logger.info("SFTTrainer initialized.")
+
+        st.info("Training is starting... This may take a while.")
+        logger.info("Training started.")
+        train_result = trainer.train()
+        logger.info("Training finished.")
+        logger.info(f"Training results: {train_result}")
+
+        st.success(f"Fine-tuning completed successfully! The model was pushed to '{target_model_id}'.")
+        return f"Fine-tuning completed successfully! The model was pushed to '{target_model_id}'."
+
+    except Exception as e:
+        logger.error(f"Error during fine-tuning: {e}", exc_info=True)
+        st.error(f"An unexpected error occurred during fine-tuning: {e}")
+        return f"Fine-tuning error: {e}"
+
+
+if __name__ == "__main__":
+    st.set_page_config(page_title="Llama 3.1 Uzbek Fine-Tuning", layout="wide")
+    st.title("🧠 Llama 3.1 8B Uzbek Fine-Tuning")
+
+    st.markdown(f"""
+    This interface fine-tunes the **{BASE_MODEL_ID}** model on the **{DATASET_PATH}** dataset.
+    The result is pushed to the Hugging Face Hub as **{NEW_MODEL_ID}**.
+
+    **Requirements:**
+    1. A Hugging Face **write** token must be stored in the Space settings (Secrets) under the name `{HF_TOKEN_SECRET_NAME}`.
+    2. The account linked to that token must have access to `{BASE_MODEL_ID}` (license accepted).
+    3. A repository named `{NEW_MODEL_ID}` must exist on the Hugging Face Hub (or the token must be allowed to create it).
+    """)
+
+    st.info(f"**Base model:** `{BASE_MODEL_ID}`\n\n"
+            f"**Dataset path:** `{DATASET_PATH}`\n\n"
+            f"**Target model ID:** `{NEW_MODEL_ID}`")
+
+    if st.button("🚀 Start Fine-Tuning"):
+        with st.spinner("Starting... Loading the model and training take time."):
+            status = fine_tune(NEW_MODEL_ID, DATASET_PATH)
+            st.info(f"Process status: {status}")
+
+    st.markdown("---")
+    st.markdown("You can follow the logs below or in the Space 'Logs' tab.")
requirements.txt CHANGED
@@ -1,7 +1,8 @@
+streamlit
+torch
 transformers
 datasets
+trl
 peft
 accelerate
-bitsandbytes
-torch
-streamlit
+bitsandbytes
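Note: since SFTTrainer is given a peft_config, the checkpoint pushed to the Hub should be a LoRA adapter rather than merged full weights. A minimal inference sketch under that assumption (repo IDs mirror app.py, the same [INST] prompt convention used in training is reused, and the token must still grant access to the gated base model):

# Sketch: load the gated base model and attach the pushed LoRA adapter.
# Assumes the Hub repo holds adapter weights only; adjust dtype/device to your hardware.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "meta-llama/Meta-Llama-3.1-8B-Instruct"
ADAPTER = "behbudiy/Llama-3.1-8B-Instuct-Uz"

tokenizer = AutoTokenizer.from_pretrained(BASE)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base_model, ADAPTER)

# Reuse the prompt convention the app trained with.
prompt = "<s>[INST] Salom! Qandaysan? [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))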