Spaces:
Paused
Paused
Update train.py
Browse files
train.py
CHANGED
@@ -34,11 +34,14 @@ def load_data():
|
|
34 |
|
35 |
def create_tokenizer(training_corpus):
|
36 |
tokenizer = ByteLevelBPETokenizer()
|
|
|
|
|
|
|
37 |
tokenizer.train_from_iterator(
|
38 |
training_corpus,
|
39 |
vocab_size=VOCAB_SIZE,
|
40 |
min_frequency=2,
|
41 |
-
special_tokens=
|
42 |
)
|
43 |
|
44 |
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
|
|
|
34 |
|
35 |
def create_tokenizer(training_corpus):
|
36 |
tokenizer = ByteLevelBPETokenizer()
|
37 |
+
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
|
38 |
+
if INSTRUCT_FINETUNE_BOOL:
|
39 |
+
special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])
|
40 |
tokenizer.train_from_iterator(
|
41 |
training_corpus,
|
42 |
vocab_size=VOCAB_SIZE,
|
43 |
min_frequency=2,
|
44 |
+
special_tokens=special_tokens
|
45 |
)
|
46 |
|
47 |
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
|