nroggendorff committed on
Commit
062ca1d
·
verified ·
1 Parent(s): 8cbd82e

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +4 -1
train.py CHANGED
@@ -34,11 +34,14 @@ def load_data():
34
 
35
  def create_tokenizer(training_corpus):
36
  tokenizer = ByteLevelBPETokenizer()
 
 
 
37
  tokenizer.train_from_iterator(
38
  training_corpus,
39
  vocab_size=VOCAB_SIZE,
40
  min_frequency=2,
41
- special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
42
  )
43
 
44
  fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
 
34
 
35
  def create_tokenizer(training_corpus):
36
  tokenizer = ByteLevelBPETokenizer()
37
+ special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
38
+ if INSTRUCT_FINETUNE_BOOL:
39
+ special_tokens.append("<|user|>", "<|bot|>", "<|end|>")
40
  tokenizer.train_from_iterator(
41
  training_corpus,
42
  vocab_size=VOCAB_SIZE,
43
  min_frequency=2,
44
+ special_tokens=special_tokens
45
  )
46
 
47
  fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)