Gül Sena Altıntaş
committed on
Commit
·
cb0e70e
1
Parent(s):
aebf6ac
Fixed supertoken tokenizer loading
Browse files
app.py
CHANGED
@@ -73,6 +73,22 @@ def parse_dataset(text):
|
|
73 |
error_msg = '\n'.join(errors) if errors else ""
|
74 |
return questions, error_msg
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
|
78 |
"""Load model and tokenizer with caching"""
|
@@ -97,7 +113,7 @@ def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None)
|
|
97 |
progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
|
98 |
|
99 |
# Load tokenizer
|
100 |
-
tokenizer =
|
101 |
|
102 |
# Add pad token if missing
|
103 |
if tokenizer.pad_token is None:
|
|
|
73 |
error_msg = '\n'.join(errors) if errors else ""
|
74 |
return questions, error_msg
|
75 |
|
76 |
+
def setup_tokenizer(model_path):
    """Resolve and load the tokenizer for ``model_path``.

    For "supertoken" models, the repo's ``tokenizer_config.json`` is
    downloaded and, when it declares a HuggingFace tokenizer, its ``path``
    entry is used as the tokenizer name. Otherwise the model path itself
    is passed straight to ``AutoTokenizer``.

    Args:
        model_path: HuggingFace Hub repo id (or local path) of the model.
            Treated as a "supertoken" repo when the string contains
            ``"supertoken"``.

    Returns:
        The loaded ``AutoTokenizer`` instance.
    """
    tokenizer_name = model_path
    if "supertoken" in model_path:
        # Supertoken repos store their tokenizer reference inside a custom
        # tokenizer_config.json rather than as a standard HF tokenizer,
        # so we have to fetch and parse it ourselves.
        from huggingface_hub import hf_hub_download, list_repo_files
        import json

        files = list_repo_files(model_path)
        if "tokenizer_config.json" in files:
            tokenizer_path = hf_hub_download(
                repo_id=model_path, filename="tokenizer_config.json"
            )
            # Explicit encoding: JSON configs are UTF-8 by spec.
            with open(tokenizer_path, encoding="utf-8") as f:
                tok_config = json.load(f)["data"]["tokenizer"]
            if tok_config["name"] == "huggingface":
                tokenizer_name = tok_config["path"]
        # TODO: support tiktoken-based tokenizer configs as well.
    # legacy=True pins the pre-transformers-4.33 SentencePiece behavior;
    # trust_remote_code is required for repos shipping custom tokenizer code.
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=True, legacy=True
    )
    return tokenizer
|
91 |
+
|
92 |
|
93 |
def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
|
94 |
"""Load model and tokenizer with caching"""
|
|
|
113 |
progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
|
114 |
|
115 |
# Load tokenizer
|
116 |
+
tokenizer = setup_tokenizer(model_path)
|
117 |
|
118 |
# Add pad token if missing
|
119 |
if tokenizer.pad_token is None:
|