"""Tests for the tokenizer loading and tokenization helpers in utils."""

import unittest

from utils import (
    load_gsw_tokenizer,
    load_hf_tokenizer,
    load_openai_tokenizer,
    load_tokenizers,
    tokenize,
)


class TestTokenizerLoading(unittest.TestCase):

    def test_load_hf_tokenizer(self):
        """Test loading a single HuggingFace tokenizer."""
        name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertIsNotNone(tokenizer)

    def test_load_openai_tokenizer(self):
        """Test loading a single OpenAI tokenizer."""
        name, tokenizer = load_openai_tokenizer("gpt-4o")
        self.assertEqual(name, "gpt-4o")
        self.assertIsNotNone(tokenizer)

    def test_load_tokenizers(self):
        """Test loading all tokenizers."""
        tokenizers = load_tokenizers()

        # Check that we have the expected number of tokenizers (5 HF + 1 OpenAI)
        expected_count = 6
        self.assertEqual(len(tokenizers), expected_count)

        # Check that all expected tokenizers are present
        expected_names = {
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "ZurichNLP/swissbert",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "google/gemma-3-27b-it",
            "gpt-4o",
        }
        self.assertEqual(set(tokenizers.keys()), expected_names)

        # Check that every tokenizer loaded successfully
        for name, tokenizer in tokenizers.items():
            self.assertIsNotNone(tokenizer, f"tokenizer {name!r} failed to load")

    def test_load_gsw_tokenizer(self):
        """Test loading the Swiss German tokenizer from local files."""
        name, tokenizer = load_gsw_tokenizer()
        self.assertEqual(name, "swissbert-gsw")
        self.assertIsNotNone(tokenizer)

        # Test basic tokenization functionality on a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)


class TestTokenizerFunctionality(unittest.TestCase):

    def setUp(self):
        """Set up tokenizers for testing."""
        self.tokenizers = load_tokenizers()
        self.test_text = "Dies ist ein Test."
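    # The per-model tests below all check the same contract: tokenize()
    # returns a non-empty list of string tokens for the shared German sample
    # sentence. The tokens are printed so differences between vocabularies
    # are visible in the test output.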
    def test_tokenize_llama(self):
        """Test tokenization with the Llama 4 tokenizer."""
        tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_deepseek(self):
        """Test tokenization with the DeepSeek tokenizer."""
        tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_swissbert(self):
        """Test tokenization with the SwissBERT tokenizer."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_gpt4(self):
        """Test tokenization with the GPT-4o tokenizer."""
        tokenizer = self.tokenizers["gpt-4o"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_swissbert_comparison(self):
        """Test that SwissBERT tokenization compares both tokenizers and returns the shorter result."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]

        # Test with a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)

        # Verify we get a valid tokenization
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)

        # Tokenize with both vocabularies and verify that the result above is
        # the shorter of the two (ties go to standard SwissBERT)
        _, gsw_tokenizer = load_gsw_tokenizer()
        swissbert_tokens = tokenize(test_text, tokenizer)
        gsw_tokens = tokenize(test_text, gsw_tokenizer)
        expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        self.assertEqual(tokens, expected_tokens)

    def test_tokenize_mistral(self):
        """Test tokenization with the Mistral NeMo tokenizer."""
        tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)

    def test_tokenize_gemma(self):
        """Test tokenization with the Gemma 3 tokenizer."""
        tokenizer = self.tokenizers["google/gemma-3-27b-it"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertGreater(len(tokens), 0)
        print(tokens)


if __name__ == '__main__':
    unittest.main()
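# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the comparison behaviour exercised by
# test_tokenize_swissbert_comparison above. This is an assumption about what
# utils.tokenize does for the SwissBERT tokenizer, not its actual
# implementation, and the function name below is hypothetical:
#
#     def tokenize_with_gsw_fallback(text, swissbert_tokenizer, gsw_tokenizer):
#         standard_tokens = swissbert_tokenizer.tokenize(text)
#         gsw_tokens = gsw_tokenizer.tokenize(text)
#         # Prefer the shorter tokenization; ties go to standard SwissBERT,
#         # mirroring the `<=` comparison asserted in the test.
#         return standard_tokens if len(standard_tokens) <= len(gsw_tokens) else gsw_tokens
# ---------------------------------------------------------------------------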