from typing import Dict, List
import hashlib
import logging

# Configure logging
logger = logging.getLogger(__name__)

# Initialize a cache for tokenization results.
# Uses a simple in-memory dictionary with a hash of the text as key.
_tokenization_cache: Dict[str, List[str]] = {}

# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000

try:
    from botok import WordTokenizer

    # Initialize the tokenizer once at the module level
    BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
    # Handle the case where botok is not installed,
    # even though it is a core dependency for this app.
    BOTOK_TOKENIZER = None
    logger.error("botok library not found. Tokenization will fail.")
    # Optionally, raise an error here if botok is absolutely critical
    # for the app to even start:
    # raise ImportError("botok is required for tokenization. Please install it.")


def _get_text_hash(text: str) -> str:
    """
    Generate a hash for the input text to use as a cache key.

    Args:
        text: The input text to hash.

    Returns:
        A string representation of the MD5 hash of the input text.
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def tokenize_texts(texts: List[str], mode: str = "syllable") -> List[List[str]]:
    """
    Tokenize a list of raw Tibetan texts using botok, with caching for performance.

    This function maintains an in-memory cache of previously tokenized texts to
    avoid redundant processing of the same content. The cache uses MD5 hashes of
    the input texts (combined with the tokenization mode) as keys.

    Args:
        texts: List of raw text strings to tokenize.
        mode: Tokenization granularity, either "word" or "syllable".
            Invalid values fall back to "syllable".

    Returns:
        List of tokenized texts (each as a list of tokens).

    Raises:
        RuntimeError: If the botok tokenizer failed to initialize.
    """
    if BOTOK_TOKENIZER is None:
        # This case should ideally be handled more gracefully,
        # perhaps by preventing analysis if the tokenizer failed to load.
        raise RuntimeError(
            "Botok tokenizer failed to initialize. Cannot tokenize texts."
        )

    tokenized_texts_list = []

    if mode not in ["word", "syllable"]:
        logger.warning(f"Invalid tokenization mode: '{mode}'. Defaulting to 'syllable'.")
        mode = "syllable"

    # Process each text
    for text_content in texts:
        # Skip empty texts
        if not text_content.strip():
            tokenized_texts_list.append([])
            continue

        # Generate hash for cache lookup; include the mode so word- and
        # syllable-level results for the same text are cached separately.
        cache_key_string = text_content + f"_{mode}"
        text_hash = _get_text_hash(cache_key_string)

        # Check if we have this text in cache
        if text_hash in _tokenization_cache:
            # Cache hit - use cached tokens
            tokens = _tokenization_cache[text_hash]
            logger.debug(f"Cache hit for text hash {text_hash[:8]}... (mode: {mode})")
        else:
            # Cache miss - tokenize and store in cache
            try:
                current_tokens = []
                if BOTOK_TOKENIZER:
                    raw_botok_items = list(BOTOK_TOKENIZER.tokenize(text_content))

                    if mode == "word":
                        for item_idx, w in enumerate(raw_botok_items):
                            if hasattr(w, 'text') and isinstance(w.text, str):
                                token_text = w.text.strip()
                                if token_text:  # Ensure token is not empty or just whitespace
                                    current_tokens.append(token_text)
                            # Optionally log if w.text is not a string or missing, for debugging:
                            # elif w.text is not None:
                            #     logger.debug(
                            #         f"Token item {item_idx} has non-string text {type(w.text)} "
                            #         f"for hash {text_hash[:8]}. Skipping word."
                            #     )
                            # else:
                            #     logger.debug(
                            #         f"Token item {item_idx} missing text attribute "
                            #         f"for hash {text_hash[:8]}. Skipping word."
                            #     )
                        logger.debug(
                            f"WORD TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                            f"{current_tokens[:30]}"
                        )

                    elif mode == "syllable":
                        # This is the original syllable extraction logic
                        for item_idx, w in enumerate(raw_botok_items):
                            if hasattr(w, 'syls') and w.syls:
                                for syl_idx, syl_item in enumerate(w.syls):
                                    syllable_to_process = None
                                    if isinstance(syl_item, str):
                                        syllable_to_process = syl_item
                                    elif isinstance(syl_item, list):
                                        try:
                                            syllable_to_process = "".join(syl_item)
                                        except TypeError:
                                            logger.warning(
                                                f"Syllable item in w.syls was a list, but could not be joined "
                                                f"(non-string elements?): {syl_item} "
                                                f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), "
                                                f"syl_idx {syl_idx} for hash {text_hash[:8]}. Skipping this syllable."
                                            )
                                            continue

                                    if syllable_to_process is not None:
                                        stripped_syl = syllable_to_process.strip()
                                        if stripped_syl:
                                            current_tokens.append(stripped_syl)
                                    elif syl_item is not None:
                                        logger.warning(
                                            f"Unexpected type for syllable item (neither str nor list): "
                                            f"{type(syl_item)} ('{str(syl_item)[:100]}') "
                                            f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), "
                                            f"syl_idx {syl_idx} for hash {text_hash[:8]}. Skipping this syllable."
                                        )
                            elif hasattr(w, 'text') and w.text:
                                # Fallback if no 'syls' but in syllable mode
                                if isinstance(w.text, str):
                                    token_text = w.text.strip()
                                    if token_text:
                                        # Treat as a single syllable/token
                                        current_tokens.append(token_text)
                                elif w.text is not None:
                                    logger.warning(
                                        f"Unexpected type for w.text (in syllable mode fallback): "
                                        f"{type(w.text)} ('{str(w.text)[:100]}') "
                                        f"for item {item_idx} (POS: {getattr(w, 'pos', 'N/A')}) "
                                        f"for hash {text_hash[:8]}. Skipping this token."
                                    )
                        logger.debug(
                            f"SYLLABLE TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                            f"{current_tokens[:30]}"
                        )

                    tokens = current_tokens
                else:
                    logger.error(
                        f"BOTOK_TOKENIZER is None for text hash {text_hash[:8]}, "
                        f"cannot tokenize (mode: {mode})."
                    )
                    tokens = []

                # Store in cache if not empty
                if tokens:
                    # If cache is full, evict the first key (oldest insertion,
                    # since dicts preserve insertion order in Python 3.7+).
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
                    _tokenization_cache[text_hash] = tokens
                    logger.debug(
                        f"Added tokens to cache with hash {text_hash[:8]}... (mode: {mode})"
                    )
            except Exception as e:
                logger.error(f"Error tokenizing text (mode: {mode}): {e}")
                tokens = []

        tokenized_texts_list.append(tokens)

    return tokenized_texts_list