Update app.py
app.py
CHANGED
@@ -8,12 +8,17 @@ import matplotlib.pyplot as plt
 from datasets import load_dataset
 from train_tokenizer import train_tokenizer
 from tokenizers import Tokenizer
+from langdetect import detect, DetectorFactory
 
-#
+# For reproducible langdetect results
+DetectorFactory.seed = 0
+
+# Checkpointing and tokenizer storage settings
 CHECKPOINT_FILE = "checkpoint.txt"
 TOKENIZER_DIR = "tokenizer_model"
 TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "tokenizer.json")
 CHUNK_SIZE = 1000  # Batch size for checkpointing
+MAX_SAMPLES = 3000000  # Sample limit (adjust as needed)
 
 def fetch_splits(dataset_name):
     """Fetch the dataset's splits from Hugging Face."""
@@ -63,6 +68,37 @@ def load_checkpoint():
         return f.read().splitlines()
     return []
 
+def analyze_checkpoint(num_samples=1000):
+    """
+    Read the first num_samples samples from the checkpoint and return the language percentages.
+    """
+    if not os.path.exists(CHECKPOINT_FILE):
+        return "The checkpoint file does not exist."
+
+    with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
+        lines = f.read().splitlines()
+
+    sample_lines = lines[:num_samples] if len(lines) >= num_samples else lines
+
+    language_counts = {}
+    total = 0
+    for line in sample_lines:
+        try:
+            lang = detect(line)
+            language_counts[lang] = language_counts.get(lang, 0) + 1
+            total += 1
+        except Exception:  # skip lines langdetect cannot classify
+            continue
+
+    if total == 0:
+        return "No valid samples were found for analysis."
+
+    report = "Analysis results:\n"
+    for lang, count in language_counts.items():
+        report += f"Language {lang}: {count/total*100:.2f}%\n"
+
+    return report
+
 def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text):
     """Train the tokenizer and test it."""
     print("🚀 Starting the training process...")
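For context on the new analyze_checkpoint() above: langdetect's detect() returns an ISO 639-1 language code (e.g. 'el', 'en') and raises an exception on lines it cannot profile, such as empty or digits-only text, which is why the loop wraps it in try/except; setting DetectorFactory.seed = 0 makes the otherwise non-deterministic detector reproducible. A minimal standalone check of that behaviour, assuming only that langdetect is installed (the sample strings are illustrative):

# Sanity check of the langdetect behaviour analyze_checkpoint() relies on.
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # deterministic results for short or ambiguous text

samples = ["Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", "The quick brown fox.", "12345"]
for s in samples:
    try:
        print(s, "->", detect(s))  # e.g. 'el', 'en'
    except LangDetectException:
        print(s, "-> no detectable language features")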
@@ -73,8 +109,10 @@ def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text
 
     dataset_iterator = create_iterator(dataset_name, configs, split)
     new_texts = []
-
+
     for text in dataset_iterator:
+        if total_processed >= MAX_SAMPLES:
+            break  # Stop once the limit is reached
         new_texts.append(text)
         total_processed += 1
         if len(new_texts) >= CHUNK_SIZE:
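create_iterator() is defined elsewhere in app.py and is not touched by this commit. For reference, a rough sketch of what such an iterator could look like, assuming the configs arrive as a comma-separated string (as in the UI below) and that each streamed record exposes a "text" field, as the wikimedia/wikipedia configs do; this is an assumption, not the app's actual implementation:

# Hypothetical sketch of create_iterator(): stream each config and yield raw article text.
from datasets import load_dataset

def create_iterator(dataset_name, configs, split):
    for config in configs.split(","):
        ds = load_dataset(dataset_name, config.strip(), split=split, streaming=True)
        for record in ds:
            yield record["text"]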
@@ -86,6 +124,8 @@ def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text
         append_to_checkpoint(new_texts)
         print(f"✅ Final batch saved ({total_processed} samples).")
 
+    print("🚀 Data saving completed! Starting tokenizer training...")
+
     # Train the tokenizer
     all_texts = load_checkpoint()
     tokenizer = train_tokenizer(all_texts, vocab_size, min_freq, TOKENIZER_DIR)
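append_to_checkpoint() and load_checkpoint() are likewise outside this diff. From the calls above and the read().splitlines() visible in the earlier hunk, the checkpoint appears to be a plain text file holding one sample per line; a minimal sketch under that assumption:

# Assumed shape of the checkpoint helpers (not the app's actual code).
import os

CHECKPOINT_FILE = "checkpoint.txt"

def append_to_checkpoint(texts):
    with open(CHECKPOINT_FILE, "a", encoding="utf-8") as f:
        for text in texts:
            f.write(text.replace("\n", " ") + "\n")  # keep one sample per line

def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            return f.read().splitlines()
    return []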
@@ -113,23 +153,30 @@ def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text
 with gr.Blocks() as demo:
     gr.Markdown("## Wikipedia Tokenizer Trainer with Checkpointing")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with gr.Row():
+        with gr.Column():
+            dataset_name = gr.Textbox(value="wikimedia/wikipedia", label="Dataset Name")
+            configs = gr.Textbox(value="20231101.el,20231101.en", label="Configs")
+            split = gr.Dropdown(choices=["train"], value="train", label="Split")
+            vocab_size = gr.Slider(20000, 100000, value=50000, label="Vocabulary Size")
+            min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
+            test_text = gr.Textbox(value="Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", label="Test Text")
+            train_btn = gr.Button("Train")
+            analyze_btn = gr.Button("Analyze Samples")
+        with gr.Column():
+            progress = gr.Textbox(label="Progress", interactive=False, lines=10)
+            results_text = gr.Textbox(label="Test Decoded Text", interactive=False)
+            results_plot = gr.Image(label="Token Length Distribution")
+            # Check whether a tokenizer already exists for download
+            initial_file_value = TOKENIZER_FILE if os.path.exists(TOKENIZER_FILE) else None
+            download_button = gr.File(label="Download Tokenizer", value=initial_file_value)
 
-    train_btn.click(train_and_test,
+    train_btn.click(train_and_test,
+                    inputs=[dataset_name, configs, split, vocab_size, min_freq, test_text],
+                    outputs=[progress, results_text, results_plot])
+
+    analyze_btn.click(fn=lambda: analyze_checkpoint(1000),
+                      inputs=[],
+                      outputs=progress)
 
 demo.launch()
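Once training finishes, the tokenizer is written under TOKENIZER_DIR and exposed through the "Download Tokenizer" component. Assuming train_tokenizer() saves it at TOKENIZER_FILE (tokenizer_model/tokenizer.json), as the download component expects, the file can be reloaded outside the app with the tokenizers library:

# Reload the trained tokenizer and tokenize the demo's test sentence.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer_model/tokenizer.json")
encoding = tok.encode("Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.")
print(encoding.tokens)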