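"""Unit tests for the tokenizer helpers in utils.py.

Covers loading individual HuggingFace and OpenAI tokenizers, loading the full
tokenizer set, loading the local Swiss German (GSW) tokenizer, and basic
tokenization behaviour for each model.
"""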
import unittest
from utils import load_hf_tokenizer, load_openai_tokenizer, load_tokenizers, tokenize, load_gsw_tokenizer
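# Based on how the tests below use them, utils is expected to provide:
#   load_hf_tokenizer(model_name) -> (name, tokenizer)
#   load_openai_tokenizer(model_name) -> (name, tokenizer)
#   load_gsw_tokenizer() -> (name, tokenizer)
#   load_tokenizers() -> dict mapping name -> tokenizer
#   tokenize(text, tokenizer) -> list of token strings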

class TestTokenizerLoading(unittest.TestCase):
    def test_load_hf_tokenizer(self):
        """Test loading a single HuggingFace tokenizer."""
        name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertIsNotNone(tokenizer)

    def test_load_openai_tokenizer(self):
        """Test loading a single OpenAI tokenizer."""
        name, tokenizer = load_openai_tokenizer("gpt-4o")
        self.assertEqual(name, "gpt-4o")
        self.assertIsNotNone(tokenizer)

    def test_load_tokenizers(self):
        """Test loading all tokenizers."""
        tokenizers = load_tokenizers()
        
        # Check that we have the expected number of tokenizers
        expected_count = 6  # 5 HF + 1 OpenAI
        self.assertEqual(len(tokenizers), expected_count)
        
        # Check that all expected tokenizers are present
        expected_names = {
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "ZurichNLP/swissbert",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "google/gemma-3-27b-it",
            "gpt-4o"
        }
        self.assertEqual(set(tokenizers.keys()), expected_names)
        
        # Check that all tokenizers are valid
        for tokenizer in tokenizers.values():
            self.assertIsNotNone(tokenizer)

    def test_load_gsw_tokenizer(self):
        """Test loading the Swiss German tokenizer from local files."""
        name, tokenizer = load_gsw_tokenizer()
        self.assertEqual(name, "swissbert-gsw")
        self.assertIsNotNone(tokenizer)
        
        # Test basic tokenization functionality
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)

class TestTokenizerFunctionality(unittest.TestCase):
    def setUp(self):
        """Set up tokenizers for testing."""
        self.tokenizers = load_tokenizers()
        self.test_text = "Dies ist ein Test."

    def test_tokenize_llama(self):
        """Test tokenization with Llama tokenizer."""
        tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_deepseek(self):
        """Test tokenization with DeepSeek tokenizer."""
        tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_swissbert(self):
        """Test tokenization with SwissBERT tokenizer."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_gpt4(self):
        """Test tokenization with GPT-4 tokenizer."""
        tokenizer = self.tokenizers["gpt-4o"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_swissbert_comparison(self):
        """Test that SwissBERT tokenization compares both tokenizers and returns shorter result."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        
        # Test with a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        
        # Verify we get a valid tokenization
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        
        # Get both tokenizations directly to verify comparison
        _, gsw_tokenizer = load_gsw_tokenizer()
        swissbert_tokens = tokenize(test_text, tokenizer)
        gsw_tokens = tokenize(test_text, gsw_tokenizer)
        
        # Verify that the returned tokenization is the shorter one
        expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        self.assertEqual(tokens, expected_tokens)

    def test_tokenize_mistral(self):
        """Test tokenization with Mistral NeMo tokenizer."""
        tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_gemma(self):
        """Test tokenization with Gemma 3 tokenizer."""
        tokenizer = self.tokenizers["google/gemma-3-27b-it"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

if __name__ == '__main__':
    unittest.main()