tymbos committed · verified
Commit 1536a51 · Parent(s): 0430da2

Create train_tokenizer.py

Files changed (1)
  1. train_tokenizer.py +47 -0
train_tokenizer.py ADDED
@@ -0,0 +1,47 @@
+from tokenizers import Regex, Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
+
+# Custom pre-tokenization pattern for code and multilingual text
+code_regex = r"""(?x:
+//.*?$|/\*.*?\*/|                           # Comments
+"(?:\\.|[^\\"])*"|                          # Strings
+'(?:\\.|[^\\'])*'|                          # Chars
+\b(?:if|else|for|while|return|function)\b|  # Keywords
+[<>]=?|\+\+|--|&&|\|\||[-+*/%=&|^~!]=?|     # Operators
+\d+\.?\d*|\.\d+|                            # Numbers
+[{}[\](),.;:]|                              # Symbols
+\p{L}+|\p{N}+|                              # Unicode letters/numbers
+\s+|                                        # Whitespace
+\S                                          # Anything else
+)"""
+
+def train_tokenizer(iterator, vocab_size=32000, min_frequency=2):
+    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
+
+    # Normalization: decompose (NFD) so StripAccents can drop the combining
+    # marks, then recompose (NFC); accent stripping is optional for tone marks
+    tokenizer.normalizer = normalizers.Sequence([
+        normalizers.NFD(),
+        normalizers.StripAccents(),
+        normalizers.NFC()
+    ])
+
+    # Custom pre-tokenizer: split on the code-aware pattern, then apply the
+    # byte-level mapping (use_regex=False so ByteLevel does not split again)
+    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
+        pre_tokenizers.Split(Regex(code_regex), behavior="isolated"),
+        pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
+    ])
+
+    # BPE trainer
+    trainer = trainers.BpeTrainer(
+        vocab_size=vocab_size,
+        min_frequency=min_frequency,
+        special_tokens=["<|endoftext|>", "<pad>", "<unk>", "<mask>"],
+        continuing_subword_prefix="",
+        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),  # keep all 256 byte tokens in the vocab
+        show_progress=True
+    )
+
+    tokenizer.train_from_iterator(iterator, trainer=trainer)
+    tokenizer.decoder = decoders.ByteLevel()
+    return tokenizer
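
A minimal usage sketch, not part of the commit: it streams a Hugging Face dataset into train_tokenizer in batches so the corpus never sits fully in memory. The dataset name, config, and the "text" column below are placeholder assumptions, not something this repo specifies.

    from datasets import load_dataset

    from train_tokenizer import train_tokenizer

    def text_batches(batch_size=1000):
        # Stream the corpus; "wikitext" and the "text" column are
        # placeholders, swap in your own dataset and column name.
        ds = load_dataset("wikitext", "wikitext-103-raw-v1", split="train", streaming=True)
        batch = []
        for example in ds:
            batch.append(example["text"])
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    tokenizer = train_tokenizer(text_batches())
    tokenizer.save("tokenizer.json")

    # Quick round-trip check on a code-flavoured string
    enc = tokenizer.encode("if (x >= 10) { return x++; }")
    print(enc.tokens)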