import codecs
import logging
import os
from typing import Any

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)

TEXT_SEPARATOR = "<TEXT_SEPARATOR>"
def get_texts(df, cfg, separator=None):
    """Extracts the prompt texts from a dataframe as an array of strings."""
    if isinstance(cfg.dataset.prompt_column, str):
        # single column dataset
        texts = df[cfg.dataset.prompt_column].astype(str)
        texts = texts.values
    else:
        # multi-column dataset - prepend (if necessary) and join
        columns = list(cfg.dataset.prompt_column)

        for column in columns:
            df[column] = df[column].astype(str)

        if separator is None:
            separator = getattr(cfg, "_tokenizer_sep_token", TEXT_SEPARATOR)

        join_str = f" {separator} "
        texts = df[columns].astype(str)
        texts = texts.apply(lambda x: join_str.join(x), axis=1).values

    return texts
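
# Illustrative usage sketch for get_texts. The dataframe columns and the bare
# SimpleNamespace config below are assumptions for demonstration only; the real
# application passes its own cfg object.
#
#   import pandas as pd
#   from types import SimpleNamespace
#
#   df = pd.DataFrame({"instruction": ["Summarize:"], "input": ["A long article."]})
#   cfg = SimpleNamespace(
#       dataset=SimpleNamespace(prompt_column=("instruction", "input"))
#   )
#   get_texts(df, cfg)
#   # -> array(['Summarize: <TEXT_SEPARATOR> A long article.'], dtype=object)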
def get_tokenizer(cfg: Any):
    """Loads the tokenizer for the configured backbone and fills in missing special tokens."""
    kwargs = dict(
        revision=cfg.environment.huggingface_branch,
        use_fast=cfg.tokenizer.use_fast,
        trust_remote_code=cfg.environment.trust_remote_code,
        token=os.getenv("HUGGINGFACE_TOKEN"),
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(cfg.llm_backbone, **kwargs)
    except TypeError as e:
        error_message = str(e)
        if "token" in error_message:
            # TypeError: RWForCausalLM.__init__() got
            # an unexpected keyword argument 'token'
            kwargs.pop("token")
            tokenizer = AutoTokenizer.from_pretrained(cfg.llm_backbone, **kwargs)
        elif "not a string" in error_message:
            # https://github.com/h2oai/h2o-llmstudio/issues/623
            # pop with a default so the retry does not raise a KeyError
            # when "add_prefix_space" was never set in kwargs
            kwargs.pop("add_prefix_space", None)
            tokenizer = AutoTokenizer.from_pretrained(cfg.llm_backbone, **kwargs)
        else:
            raise

    tokenizer.padding_side = getattr(
        cfg.tokenizer, "_padding_side", tokenizer.padding_side
    )

    # if the eos token is an empty string, we assign it to a token
    if tokenizer.eos_token == "":
        tokenizer.add_special_tokens({"eos_token": "</s>"})
        tokenizer.eos_token = "</s>"

    if tokenizer.pad_token is None:
        if tokenizer.unk_token is not None:
            tokenizer.pad_token = tokenizer.unk_token
        else:
            tokenizer.pad_token = tokenizer.eos_token
    if tokenizer.bos_token is None:
        tokenizer.bos_token = tokenizer.eos_token
    if tokenizer.cls_token is None:
        tokenizer.cls_token = tokenizer.eos_token
    if tokenizer.sep_token is None:
        tokenizer.sep_token = tokenizer.eos_token
    cfg._tokenizer_sep_token = tokenizer.sep_token

    if tokenizer.unk_token_id is not None:
        cfg._tokenizer_mask_token_id = tokenizer.unk_token_id
    elif tokenizer.mask_token_id is not None:
        cfg._tokenizer_mask_token_id = tokenizer.mask_token_id
    elif tokenizer.pad_token_id is not None:
        cfg._tokenizer_mask_token_id = tokenizer.pad_token_id
    else:
        # setting the mask token id to the last token in the vocabulary
        # this usually is a safe choice and mostly refers to eos token
        cfg._tokenizer_mask_token_id = len(tokenizer) - 1

    cfg._tokenizer_eos_token = tokenizer.eos_token

    if hasattr(cfg.prediction, "stop_tokens"):
        set_stop_token_ids(cfg, tokenizer)
    cfg.tokenizer._vocab_length = len(tokenizer)

    return tokenizer
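
# Hedged usage sketch for get_tokenizer. The backbone name and the flat
# SimpleNamespace config below are illustrative assumptions, not the actual
# LLM Studio config classes.
#
#   from types import SimpleNamespace
#
#   cfg = SimpleNamespace(
#       llm_backbone="h2oai/h2o-danube2-1.8b-base",  # hypothetical backbone choice
#       environment=SimpleNamespace(
#           huggingface_branch="main", trust_remote_code=True, _local_rank=0
#       ),
#       tokenizer=SimpleNamespace(use_fast=True),
#       prediction=SimpleNamespace(),  # no stop_tokens -> set_stop_token_ids is skipped
#   )
#   tokenizer = get_tokenizer(cfg)
#   # cfg is updated in place with derived values such as cfg._tokenizer_eos_token,
#   # cfg._tokenizer_mask_token_id and cfg.tokenizer._vocab_length.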
def set_stop_token_ids(cfg, tokenizer):
    """Collects stop words from the config and stores their token ids on the config."""
    cfg.tokenizer._stop_words = list(
        filter(None, cfg.prediction.stop_tokens.split(","))
    )

    for stop_word in [
        cfg.dataset.text_system_start,
        cfg.dataset.text_prompt_start,
        cfg.dataset.text_answer_separator,
    ]:
        stop_word = codecs.decode(stop_word, "unicode_escape").strip()
        if (
            stop_word != ""
            and cfg.tokenizer.add_prompt_answer_tokens
            and (stop_word not in tokenizer.get_vocab())
        ):
            tokenizer.add_tokens([stop_word])
        cfg.tokenizer._stop_words.append(stop_word)

    cfg.tokenizer._stop_words = [
        stop_word for stop_word in cfg.tokenizer._stop_words if stop_word != ""
    ]
    cfg.tokenizer._stop_words_ids = []
    for stop_word in set(cfg.tokenizer._stop_words):
        cfg.tokenizer._stop_words_ids.append(
            tokenizer(stop_word, return_tensors="pt", add_special_tokens=False)[
                "input_ids"
            ][0]
        )
    if cfg.environment._local_rank == 0:
        logger.info(f"Stop token ids: {cfg.tokenizer._stop_words_ids}")
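
# Illustrative sketch of the effect of set_stop_token_ids. The marker strings
# below are made-up examples rather than LLM Studio defaults.
#
#   cfg.prediction.stop_tokens = "</s>,<|endoftext|>"
#   cfg.dataset.text_system_start = "<|system|>"
#   cfg.dataset.text_prompt_start = "<|prompt|>"
#   cfg.dataset.text_answer_separator = "<|answer|>"
#   cfg.tokenizer.add_prompt_answer_tokens = True
#   set_stop_token_ids(cfg, tokenizer)
#   # cfg.tokenizer._stop_words now holds the non-empty stop strings and
#   # cfg.tokenizer._stop_words_ids one 1-D tensor of token ids per unique stop word.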