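"""Tokenization utilities for Tibetan text.

Wraps botok's WordTokenizer and caches tokenization results in memory,
keyed by an MD5 hash of the input text and tokenization mode.
"""
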
from typing import List, Dict
import hashlib
import logging

# Configure logging
logger = logging.getLogger(__name__)

# Initialize a cache for tokenization results
# Using a simple in-memory dictionary with text hash as key
_tokenization_cache: Dict[str, List[str]] = {}

# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000

try:
    from botok import WordTokenizer

    # Initialize the tokenizer once at the module level
    BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
    # Handle the case where botok might not be installed,
    # though it's a core dependency for this app.
    BOTOK_TOKENIZER = None
    logger.error("botok library not found. Tokenization will fail.")
    # Optionally, raise an error here if botok is absolutely critical for the app to even start
    # raise ImportError("botok is required for tokenization. Please install it.")


def _get_text_hash(text: str) -> str:
    """
    Generate a hash for the input text to use as a cache key.

    MD5 is used here as a fast, stable content fingerprint, not for security.

    Args:
        text: The input text to hash

    Returns:
        A string representation of the MD5 hash of the input text
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()
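
# Note: hashing is deterministic, so repeated inputs map to the same cache
# key. For example, using the well-known MD5 test vector:
#   _get_text_hash("abc") == "900150983cd24fb0d6963f7d28e17f72"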


def tokenize_texts(texts: List[str], mode: str = "syllable") -> List[List[str]]:
    """
    Tokenizes a list of raw Tibetan texts using botok, with caching for performance.
    
    This function maintains an in-memory cache of previously tokenized texts to
    avoid redundant processing of the same content. Cache keys are MD5 hashes
    of the input text combined with the tokenization mode.

    Args:
        texts: List of raw text strings to tokenize.
        mode: Tokenization granularity, either "word" or "syllable" (default).
            Invalid values fall back to "syllable" with a warning.
        
    Returns:
        List of tokenized texts (each as a list of tokens).
        
    Raises:
        RuntimeError: If the botok tokenizer failed to initialize.
    """
    if BOTOK_TOKENIZER is None:
        # This case should ideally be handled more gracefully,
        # perhaps by preventing analysis if the tokenizer failed to load.
        raise RuntimeError(
            "Botok tokenizer failed to initialize. Cannot tokenize texts."
        )

    tokenized_texts_list = []

    if mode not in ["word", "syllable"]:
        logger.warning(f"Invalid tokenization mode: '{mode}'. Defaulting to 'syllable'.")
        mode = "syllable"
    
    # Process each text
    for text_content in texts:
        # Skip empty texts
        if not text_content.strip():
            tokenized_texts_list.append([])
            continue
            
        # Generate hash for cache lookup; include the mode so that word- and
        # syllable-level tokenizations of the same text are cached separately
        cache_key_string = f"{text_content}_{mode}"
        text_hash = _get_text_hash(cache_key_string)
        
        # Check if we have this text in cache
        if text_hash in _tokenization_cache:
            # Cache hit - use cached tokens
            tokens = _tokenization_cache[text_hash]
            logger.debug(f"Cache hit for text hash {text_hash[:8]}... (mode: {mode})")
        else:
            # Cache miss - tokenize and store in cache
            try:
                current_tokens = []
                # The guard at the top of this function guarantees that
                # BOTOK_TOKENIZER is not None by the time we get here.
                raw_botok_items = list(BOTOK_TOKENIZER.tokenize(text_content))

                if mode == "word":
                    for item_idx, w in enumerate(raw_botok_items):
                        if hasattr(w, 'text') and isinstance(w.text, str):
                            token_text = w.text.strip()
                            if token_text:  # Ensure token is not empty or just whitespace
                                current_tokens.append(token_text)
                    logger.debug(
                        f"WORD TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                        f"{current_tokens[:30]}"
                    )
                elif mode == "syllable":
                    # Extract syllables from each botok token, falling back to
                    # the token's full text when no syllable data is present.
                    for item_idx, w in enumerate(raw_botok_items):
                        if hasattr(w, 'syls') and w.syls:
                            for syl_idx, syl_item in enumerate(w.syls):
                                syllable_to_process = None
                                if isinstance(syl_item, str):
                                    syllable_to_process = syl_item
                                elif isinstance(syl_item, list):
                                    try:
                                        syllable_to_process = "".join(syl_item)
                                    except TypeError:
                                        logger.warning(
                                            f"Syllable item in w.syls was a list, but could not be joined (non-string elements?): {syl_item} "
                                            f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                            f"for hash {text_hash[:8]}. Skipping this syllable."
                                        )
                                        continue

                                if syllable_to_process is not None:
                                    stripped_syl = syllable_to_process.strip()
                                    if stripped_syl:
                                        current_tokens.append(stripped_syl)
                                elif syl_item is not None:
                                    logger.warning(
                                        f"Unexpected type for syllable item (neither str nor list): {type(syl_item)} ('{str(syl_item)[:100]}') "
                                        f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                        f"for hash {text_hash[:8]}. Skipping this syllable."
                                    )
                        elif hasattr(w, 'text') and w.text:  # Fallback if no 'syls' but in syllable mode
                            if isinstance(w.text, str):
                                token_text = w.text.strip()
                                if token_text:
                                    current_tokens.append(token_text)  # Treat as a single syllable/token
                            elif w.text is not None:
                                logger.warning(
                                    f"Unexpected type for w.text (in syllable mode fallback): {type(w.text)} ('{str(w.text)[:100]}') "
                                    f"for item {item_idx} (POS: {getattr(w, 'pos', 'N/A')}) "
                                    f"for hash {text_hash[:8]}. Skipping this token."
                                )
                    logger.debug(
                        f"SYLLABLE TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                        f"{current_tokens[:30]}"
                    )
                tokens = current_tokens
                
                # Store in cache if not empty
                if tokens:
                    # If the cache is full, evict the first-inserted (oldest)
                    # entry: dicts preserve insertion order in Python 3.7+,
                    # so this is a simple FIFO strategy.
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
                    
                    _tokenization_cache[text_hash] = tokens
                    logger.debug(f"Added tokens to cache with hash {text_hash[:8]}... (mode: {mode})")
            except Exception as e:
                logger.error(f"Error tokenizing text (mode: {mode}): {e}")
                tokens = []
                
        tokenized_texts_list.append(tokens)
        
    return tokenized_texts_list
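

# --- Usage sketch (illustrative only) ---
# A minimal example of calling tokenize_texts, assuming botok is installed
# and its language data is available. The sample string is a common Tibetan
# greeting; the repeated word-mode call demonstrates a cache hit.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    sample_texts = [
        "བཀྲ་ཤིས་བདེ་ལེགས།",  # "Tashi delek" (a common greeting)
        "",  # Empty input yields an empty token list
    ]

    # Syllable-level tokens (the default mode)
    print(tokenize_texts(sample_texts))

    # Word-level tokens; the second call hits the in-memory cache
    print(tokenize_texts(sample_texts, mode="word"))
    print(tokenize_texts(sample_texts, mode="word"))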