"""Unit tests for the tokenizer helpers in utils (loading and tokenization)."""

import unittest

from utils import load_hf_tokenizer, load_openai_tokenizer, load_tokenizers, tokenize, load_gsw_tokenizer
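
# The helpers imported from utils are assumed to follow roughly these signatures.
# This is a sketch inferred from how the tests below use them, not the actual
# utils API:
#   load_hf_tokenizer(model_name) -> (name, tokenizer)    # HuggingFace tokenizer
#   load_openai_tokenizer(model_name) -> (name, tokenizer) # OpenAI tokenizer
#   load_tokenizers() -> dict mapping model name -> tokenizer
#   load_gsw_tokenizer() -> ("swissbert-gsw", tokenizer)   # local Swiss German tokenizer
#   tokenize(text, tokenizer) -> list of string tokens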

class TestTokenizerLoading(unittest.TestCase):

    def test_load_hf_tokenizer(self):
        """Test loading a single HuggingFace tokenizer."""
        name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertIsNotNone(tokenizer)

    def test_load_openai_tokenizer(self):
        """Test loading a single OpenAI tokenizer."""
        name, tokenizer = load_openai_tokenizer("gpt-4o")
        self.assertEqual(name, "gpt-4o")
        self.assertIsNotNone(tokenizer)

    def test_load_tokenizers(self):
        """Test loading all tokenizers."""
        tokenizers = load_tokenizers()

        # Check that we have the expected number of tokenizers
        expected_count = 6  # 5 HF + 1 OpenAI
        self.assertEqual(len(tokenizers), expected_count)

        # Check that all expected tokenizers are present
        expected_names = {
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "ZurichNLP/swissbert",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "google/gemma-3-27b-it",
            "gpt-4o",
        }
        self.assertEqual(set(tokenizers.keys()), expected_names)

        # Check that all tokenizers are valid
        for name, tokenizer in tokenizers.items():
            self.assertIsNotNone(tokenizer)

    def test_load_gsw_tokenizer(self):
        """Test loading the Swiss German tokenizer from local files."""
        name, tokenizer = load_gsw_tokenizer()
        self.assertEqual(name, "swissbert-gsw")
        self.assertIsNotNone(tokenizer)

        # Test basic tokenization functionality
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)

class TestTokenizerFunctionality(unittest.TestCase):

    def setUp(self):
        """Set up tokenizers for testing."""
        self.tokenizers = load_tokenizers()
        self.test_text = "Dies ist ein Test."

    def test_tokenize_llama(self):
        """Test tokenization with the Llama tokenizer."""
        tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_deepseek(self):
        """Test tokenization with the DeepSeek tokenizer."""
        tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_swissbert(self):
        """Test tokenization with the SwissBERT tokenizer."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_gpt4(self):
        """Test tokenization with the GPT-4o tokenizer."""
        tokenizer = self.tokenizers["gpt-4o"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_swissbert_comparison(self):
        """Test that SwissBERT tokenization compares both tokenizers and returns the shorter result."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]

        # Test with a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)

        # Verify we get a valid tokenization
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)

        # Get both tokenizations directly to verify the comparison
        _, gsw_tokenizer = load_gsw_tokenizer()
        swissbert_tokens = tokenize(test_text, tokenizer)
        gsw_tokens = tokenize(test_text, gsw_tokenizer)

        # Verify that the returned tokenization is the shorter one
        expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        self.assertEqual(tokens, expected_tokens)
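
    # Sketch of the selection logic exercised above, i.e. what utils.tokenize is
    # assumed to do when given the SwissBERT tokenizer (inferred from this test,
    # not copied from the actual implementation):
    #
    #     candidates = [
    #         tokenize_with(swissbert_tokenizer, text),  # standard SwissBERT vocabulary
    #         tokenize_with(gsw_tokenizer, text),        # local Swiss German vocabulary
    #     ]
    #     return min(candidates, key=len)                # prefer the shorter tokenization
    #
    # `tokenize_with` is a hypothetical helper used only for illustration.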

    def test_tokenize_mistral(self):
        """Test tokenization with the Mistral NeMo tokenizer."""
        tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_gemma(self):
        """Test tokenization with the Gemma 3 tokenizer."""
        tokenizer = self.tokenizers["google/gemma-3-27b-it"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

if __name__ == '__main__':
    unittest.main()
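
# Suggested invocation (a sketch; adjust the path to wherever this file lives):
#
#     python -m unittest -v path/to/this_test_file.py
#
# Note: the HuggingFace tokenizers are downloaded on first use, and some of the
# models listed above (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct) are gated,
# so logging in via `huggingface-cli login` with an account that has accepted the
# model terms may be required. The gpt-4o tokenizer is assumed to be provided by
# tiktoken and needs no credentials.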