Create train_tokenizer.py
train_tokenizer.py
ADDED
@@ -0,0 +1,45 @@
from tokenizers import Regex, Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers

# Custom pre-tokenization pattern for code and multilingual text
code_regex = r"""(?x:
    //.*?$|/\*.*?\*/|                            # Comments
    "(?:\\.|[^\\"])*"|                           # Strings
    '(?:\\.|[^\\'])*'|                           # Chars
    \b(?:if|else|for|while|return|function)\b|   # Keywords
    [<>]=?|\+\+|--|&&|\|\||[-+*/%=&|^~!]=?|      # Operators
    \d+\.?\d*|\.\d+|                             # Numbers
    [{}[\](),.;:]|                               # Symbols
    \p{L}+|\p{N}+|                               # Unicode letters/numbers
    \s+|                                         # Whitespace
    \S                                           # Anything else
)"""

def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # Normalization: decompose with NFD first so StripAccents can remove
    # the combining marks, then recompose with NFC
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),
        normalizers.StripAccents(),  # optional: drops accents/diacritics
        normalizers.NFC()
    ])

    # Custom pre-tokenizer: isolate matches of the code-aware pattern,
    # then apply byte-level so all input maps onto the byte alphabet
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(Regex(code_regex), behavior="isolated"),
        pre_tokenizers.ByteLevel(add_prefix_space=False)
    ])

    # BPE trainer configuration
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
        continuing_subword_prefix="",
        show_progress=True
    )

    tokenizer.train_from_iterator(iterator, trainer=trainer)
    tokenizer.decoder = decoders.ByteLevel()  # mirrors the ByteLevel pre-tokenizer
    return tokenizer
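For context, a minimal driver sketch showing how train_tokenizer might be called with a streaming batch iterator; the dataset name (wikimedia/wikipedia, Greek dump) and the batch size are illustrative assumptions, not part of this commit.

# Example driver (illustrative assumptions: dataset name, config, batch size)
from datasets import load_dataset

from train_tokenizer import train_tokenizer

def batch_iterator(dataset, batch_size=1000):
    # Yield batches of raw text lines for Tokenizer.train_from_iterator
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

dataset = load_dataset("wikimedia/wikipedia", "20231101.el", split="train")
tokenizer = train_tokenizer(batch_iterator(dataset), vocab_size=32000)
tokenizer.save("tokenizer.json")

Feeding batches rather than individual strings keeps the Python-to-Rust call overhead low during training, which is why train_from_iterator is typically driven this way.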