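"""Unit tests for the tokenizer helpers in utils.

Covers loading HuggingFace, OpenAI, and local Swiss German (swissbert-gsw)
tokenizers, and basic tokenization of German and Swiss German text.
"""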
import unittest
from utils import load_hf_tokenizer, load_openai_tokenizer, load_tokenizers, tokenize, load_gsw_tokenizer
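# The utils module is not shown here; the signatures below are inferred from
# how the helpers are used in these tests (an assumption, not a verified API):
#   load_hf_tokenizer(name)     -> (name, tokenizer)  single HuggingFace tokenizer
#   load_openai_tokenizer(name) -> (name, tokenizer)  single OpenAI tokenizer
#   load_gsw_tokenizer()        -> ("swissbert-gsw", tokenizer)  from local files
#   load_tokenizers()           -> dict mapping model name to tokenizer (5 HF + 1 OpenAI)
#   tokenize(text, tokenizer)   -> list of token strings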

class TestTokenizerLoading(unittest.TestCase):

    def test_load_hf_tokenizer(self):
        """Test loading a single HuggingFace tokenizer."""
        name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertIsNotNone(tokenizer)

    def test_load_openai_tokenizer(self):
        """Test loading a single OpenAI tokenizer."""
        name, tokenizer = load_openai_tokenizer("gpt-4o")
        self.assertEqual(name, "gpt-4o")
        self.assertIsNotNone(tokenizer)

    def test_load_tokenizers(self):
        """Test loading all tokenizers."""
        tokenizers = load_tokenizers()
        # Check that we have the expected number of tokenizers
        expected_count = 6  # 5 HF + 1 OpenAI
        self.assertEqual(len(tokenizers), expected_count)
        # Check that all expected tokenizers are present
        expected_names = {
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "ZurichNLP/swissbert",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "google/gemma-3-27b-it",
            "gpt-4o",
        }
        self.assertEqual(set(tokenizers.keys()), expected_names)
        # Check that all tokenizers are valid
        for tokenizer in tokenizers.values():
            self.assertIsNotNone(tokenizer)

    def test_load_gsw_tokenizer(self):
        """Test loading the Swiss German tokenizer from local files."""
        name, tokenizer = load_gsw_tokenizer()
        self.assertEqual(name, "swissbert-gsw")
        self.assertIsNotNone(tokenizer)
        # Test basic tokenization functionality on a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)

class TestTokenizerFunctionality(unittest.TestCase):

    def setUp(self):
        """Set up tokenizers for testing."""
        self.tokenizers = load_tokenizers()
        self.test_text = "Dies ist ein Test."  # German: "This is a test."

    def test_tokenize_llama(self):
        """Test tokenization with the Llama 4 tokenizer."""
        tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_deepseek(self):
        """Test tokenization with the DeepSeek V3 tokenizer."""
        tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_swissbert(self):
        """Test tokenization with the SwissBERT tokenizer."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_gpt4(self):
        """Test tokenization with the GPT-4o tokenizer."""
        tokenizer = self.tokenizers["gpt-4o"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_swissbert_comparison(self):
        """Test that SwissBERT tokenization compares both tokenizers and returns the shorter result."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        # Test with a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        # Verify we get a valid tokenization
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        # Get both tokenizations directly to verify the comparison
        _, gsw_tokenizer = load_gsw_tokenizer()
        swissbert_tokens = tokenize(test_text, tokenizer)
        gsw_tokens = tokenize(test_text, gsw_tokenizer)
        # Verify that the returned tokenization is the shorter one
        expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        self.assertEqual(tokens, expected_tokens)

    def test_tokenize_mistral(self):
        """Test tokenization with the Mistral NeMo tokenizer."""
        tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_gemma(self):
        """Test tokenization with the Gemma 3 tokenizer."""
        tokenizer = self.tokenizers["google/gemma-3-27b-it"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

if __name__ == '__main__':
    unittest.main()
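
# To run the suite verbosely (replace test_utils with this file's actual
# module name; the name used here is only an assumption):
#   python -m unittest -v test_utils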