# subword-tokenization/tests/test_utils.py
import unittest
from utils import load_hf_tokenizer, load_openai_tokenizer, load_tokenizers, tokenize, load_gsw_tokenizer
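
# The tests below exercise the following utils API, inferred from how it is
# used in this file (a sketch of the assumed signatures, not the definitive
# interface; see utils.py for the actual implementation):
#
#   load_hf_tokenizer(name)     -> (name, Hugging Face tokenizer)
#   load_openai_tokenizer(name) -> (name, OpenAI tokenizer)
#   load_tokenizers()           -> dict mapping model name -> tokenizer
#   load_gsw_tokenizer()        -> ("swissbert-gsw", locally loaded tokenizer)
#   tokenize(text, tokenizer)   -> list of token strings
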
class TestTokenizerLoading(unittest.TestCase):
def test_load_hf_tokenizer(self):
"""Test loading a single HuggingFace tokenizer."""
name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
self.assertIsNotNone(tokenizer)
def test_load_openai_tokenizer(self):
"""Test loading a single OpenAI tokenizer."""
name, tokenizer = load_openai_tokenizer("gpt-4o")
self.assertEqual(name, "gpt-4o")
self.assertIsNotNone(tokenizer)
def test_load_tokenizers(self):
"""Test loading all tokenizers."""
tokenizers = load_tokenizers()
# Check that we have the expected number of tokenizers
expected_count = 6 # 5 HF + 1 OpenAI
self.assertEqual(len(tokenizers), expected_count)
# Check that all expected tokenizers are present
expected_names = {
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"deepseek-ai/DeepSeek-V3-0324",
"ZurichNLP/swissbert",
"mistralai/Mistral-Nemo-Instruct-2407",
"google/gemma-3-27b-it",
"gpt-4o"
}
self.assertEqual(set(tokenizers.keys()), expected_names)
# Check that all tokenizers are valid
        for tokenizer in tokenizers.values():
            self.assertIsNotNone(tokenizer)
def test_load_gsw_tokenizer(self):
"""Test loading the Swiss German tokenizer from local files."""
name, tokenizer = load_gsw_tokenizer()
self.assertEqual(name, "swissbert-gsw")
self.assertIsNotNone(tokenizer)
# Test basic tokenization functionality
        test_text = "nöd"  # Swiss German for "not"
tokens = tokenize(test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
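
# A minimal sketch of how the local Swiss German tokenizer might be loaded
# (hypothetical: the local path and loader call are assumptions; the actual
# logic lives in load_gsw_tokenizer in utils.py):
#
#   from transformers import AutoTokenizer
#   def load_gsw_tokenizer():
#       tokenizer = AutoTokenizer.from_pretrained("tokenizers/swissbert-gsw")
#       return "swissbert-gsw", tokenizer
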
class TestTokenizerFunctionality(unittest.TestCase):
def setUp(self):
"""Set up tokenizers for testing."""
self.tokenizers = load_tokenizers()
        self.test_text = "Dies ist ein Test."  # German: "This is a test."
def test_tokenize_llama(self):
"""Test tokenization with Llama tokenizer."""
tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
tokens = tokenize(self.test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
print(tokens)
def test_tokenize_deepseek(self):
"""Test tokenization with DeepSeek tokenizer."""
tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
tokens = tokenize(self.test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
print(tokens)
def test_tokenize_swissbert(self):
"""Test tokenization with SwissBERT tokenizer."""
tokenizer = self.tokenizers["ZurichNLP/swissbert"]
tokens = tokenize(self.test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
print(tokens)
    def test_tokenize_gpt4o(self):
        """Test tokenization with the GPT-4o tokenizer."""
tokenizer = self.tokenizers["gpt-4o"]
tokens = tokenize(self.test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
print(tokens)
    def test_tokenize_swissbert_comparison(self):
        """Test that SwissBERT tokenization compares the standard and GSW
        tokenizers and returns the shorter result."""
tokenizer = self.tokenizers["ZurichNLP/swissbert"]
# Test with a Swiss German word
test_text = "nöd"
tokens = tokenize(test_text, tokenizer)
# Verify we get a valid tokenization
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
# Get both tokenizations directly to verify comparison
_, gsw_tokenizer = load_gsw_tokenizer()
swissbert_tokens = tokenize(test_text, tokenizer)
gsw_tokens = tokenize(test_text, gsw_tokenizer)
# Verify that the returned tokenization is the shorter one
expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
self.assertEqual(tokens, expected_tokens)
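
    # Sketch of the selection rule the test above assumes tokenize() applies
    # for SwissBERT (names are illustrative; the real logic lives in utils.py):
    #
    #   standard = raw_tokenize(text, swissbert_tokenizer)
    #   gsw = raw_tokenize(text, gsw_tokenizer)
    #   return standard if len(standard) <= len(gsw) else gsw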
def test_tokenize_mistral(self):
"""Test tokenization with Mistral NeMo tokenizer."""
tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
tokens = tokenize(self.test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
print(tokens)
def test_tokenize_gemma(self):
"""Test tokenization with Gemma 3 tokenizer."""
tokenizer = self.tokenizers["google/gemma-3-27b-it"]
tokens = tokenize(self.test_text, tokenizer)
self.assertIsInstance(tokens, list)
self.assertTrue(all(isinstance(t, str) for t in tokens))
self.assertTrue(len(tokens) > 0)
print(tokens)
if __name__ == '__main__':
unittest.main()
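
# To run this suite, make sure utils.py is importable (e.g. run from the
# directory that contains it, or set PYTHONPATH accordingly), then:
#   python -m unittest tests/test_utils.py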