Adapters
import torch

class SimpleTokenizer:
    def __init__(self, vocab_path):
        # Load the saved char -> index mapping (a dict saved with torch.save)
        self.char_to_idx = torch.load(vocab_path)

        # Ensure an <unk> entry exists for out-of-vocabulary characters
        if '<unk>' not in self.char_to_idx:
            self.char_to_idx['<unk>'] = max(self.char_to_idx.values()) + 1

        # Build the reverse mapping (index -> char) for decoding
        self.idx_to_char = {i: c for c, i in self.char_to_idx.items()}

    def encode(self, text):
        # Map each character to its index, falling back to <unk> (or 0) for OOV chars
        return [self.char_to_idx.get(c, self.char_to_idx.get('<unk>', 0)) for c in text]

    def decode(self, indices):
        # Map indices back to characters, dropping any index not in the vocab
        return ''.join([self.idx_to_char.get(i, '') for i in indices])
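
# The loader above assumes the vocab file is a char -> index dict saved with
# torch.save. A minimal sketch of how such a file might be built from a corpus
# (build_vocab is a hypothetical helper, not part of the original file):
def build_vocab(corpus, vocab_path):
    # Assign each distinct character a stable index and persist the mapping
    char_to_idx = {c: i for i, c in enumerate(sorted(set(corpus)))}
    torch.save(char_to_idx, vocab_path)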

# Example usage
vocab_path = 'vocab.pth'  # Replace with the actual path to your vocab file
tokenizer = SimpleTokenizer(vocab_path)

text = "Hello, world!"
tokens = tokenizer.encode(text)
print(tokens)  # list of integer indices, one per character

decoded_text = tokenizer.decode(tokens)
print(decoded_text)  # round-trips exactly when every character is in the vocab