Gül Sena Altıntaş committed on
Commit
cb0e70e
·
1 Parent(s): aebf6ac

Fixed supertoken tokenizer loading

Browse files
Files changed (1) hide show
  1. app.py +17 -1
app.py CHANGED
@@ -73,6 +73,22 @@ def parse_dataset(text):
73
  error_msg = '\n'.join(errors) if errors else ""
74
  return questions, error_msg
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
78
  """Load model and tokenizer with caching"""
@@ -97,7 +113,7 @@ def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None)
97
  progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
98
 
99
  # Load tokenizer
100
- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, legacy=True)
101
 
102
  # Add pad token if missing
103
  if tokenizer.pad_token is None:
 
73
  error_msg = '\n'.join(errors) if errors else ""
74
  return questions, error_msg
75
 
76
def setup_tokenizer(model_path):
    """Resolve and load the tokenizer for *model_path*.

    Supertoken repos ship a custom ``tokenizer_config.json`` instead of a
    standard HF tokenizer; when that config declares a ``huggingface``
    tokenizer, load from the path it points at rather than the model repo
    itself. For all other repos, load directly from ``model_path``.

    Args:
        model_path: HF Hub repo id (or local path) of the model.

    Returns:
        The loaded ``AutoTokenizer`` instance.
    """
    tokenizer_name = model_path
    if "supertoken" in model_path:
        from huggingface_hub import list_repo_files, hf_hub_download
        import json

        files = list_repo_files(model_path)
        if "tokenizer_config.json" in files:
            tokenizer_path = hf_hub_download(repo_id=model_path, filename="tokenizer_config.json")
            # Explicit encoding: the config is UTF-8 JSON; the platform
            # default locale encoding is not guaranteed to decode it.
            with open(tokenizer_path, encoding="utf-8") as f:
                tok_config = json.load(f)["data"]["tokenizer"]
            if tok_config["name"] == "huggingface":
                tokenizer_name = tok_config["path"]
            # TODO: handle tiktoken-backed supertoken tokenizers
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, trust_remote_code=True, legacy=True)
    return tokenizer
91
+
92
 
93
  def load_model_and_tokenizer(model_path, use_cache=True, progress_callback=None):
94
  """Load model and tokenizer with caching"""
 
113
  progress_callback(0.2, f"📥 Loading tokenizer for {model_path}...")
114
 
115
  # Load tokenizer
116
+ tokenizer = setup_tokenizer(model_path)
117
 
118
  # Add pad token if missing
119
  if tokenizer.pad_token is None: