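"""Tokenization utilities for Tibetan text.

Wraps botok's WordTokenizer and caches tokenization results in memory,
keyed by an MD5 hash of the input text and tokenization mode.
"""
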
from typing import List, Dict
import hashlib
import logging

# Configure logging
logger = logging.getLogger(__name__)

# Initialize a cache for tokenization results
# Using a simple in-memory dictionary with text hash as key
_tokenization_cache: Dict[str, List[str]] = {}

# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000

try:
    from botok import WordTokenizer

    # Initialize the tokenizer once at the module level
    BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
    # Handle the case where botok might not be installed,
    # though it's a core dependency for this app.
    BOTOK_TOKENIZER = None
    logger.error("botok library not found. Tokenization will fail.")
    # Optionally, raise an error here if botok is absolutely critical for the app to even start
    # raise ImportError("botok is required for tokenization. Please install it.")


def _get_text_hash(text: str) -> str:
    """
    Generate a hash for the input text to use as a cache key.

    MD5 is used here as a fast, stable content fingerprint, not for security.

    Args:
        text: The input text to hash

    Returns:
        A string representation of the MD5 hash of the input text
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()
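
# Note: hashing is deterministic, so repeated inputs map to the same cache
# key. For example, using the well-known MD5 test vector:
#   _get_text_hash("abc") == "900150983cd24fb0d6963f7d28e17f72"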


def tokenize_texts(texts: List[str], mode: str = "syllable") -> List[List[str]]:
    """
    Tokenizes a list of raw Tibetan texts using botok, with caching for performance.
    
    This function maintains an in-memory cache of previously tokenized texts to
    avoid redundant processing of the same content. Cache keys are MD5 hashes
    of the input text combined with the tokenization mode.

    Args:
        texts: List of raw text strings to tokenize.
        mode: Tokenization granularity, either "word" or "syllable" (default).
            Invalid values fall back to "syllable" with a warning.
        
    Returns:
        List of tokenized texts (each as a list of tokens).
        
    Raises:
        RuntimeError: If the botok tokenizer failed to initialize.
    """
    if BOTOK_TOKENIZER is None:
        # This case should ideally be handled more gracefully,
        # perhaps by preventing analysis if the tokenizer failed to load.
        raise RuntimeError(
            "Botok tokenizer failed to initialize. Cannot tokenize texts."
        )

    tokenized_texts_list = []

    if mode not in ["word", "syllable"]:
        logger.warning(f"Invalid tokenization mode: '{mode}'. Defaulting to 'syllable'.")
        mode = "syllable"
    
    # Process each text
    for text_content in texts:
        # Skip empty texts
        if not text_content.strip():
            tokenized_texts_list.append([])
            continue
            
        # Generate hash for cache lookup; include the mode so that word- and
        # syllable-level tokenizations of the same text are cached separately
        cache_key_string = f"{text_content}_{mode}"
        text_hash = _get_text_hash(cache_key_string)
        
        # Check if we have this text in cache
        if text_hash in _tokenization_cache:
            # Cache hit - use cached tokens
            tokens = _tokenization_cache[text_hash]
            logger.debug(f"Cache hit for text hash {text_hash[:8]}... (mode: {mode})")
        else:
            # Cache miss - tokenize and store in cache
            try:
                current_tokens = []
                # The guard at the top of this function guarantees that
                # BOTOK_TOKENIZER is not None by the time we get here.
                raw_botok_items = list(BOTOK_TOKENIZER.tokenize(text_content))

                if mode == "word":
                    for item_idx, w in enumerate(raw_botok_items):
                        if hasattr(w, 'text') and isinstance(w.text, str):
                            token_text = w.text.strip()
                            if token_text:  # Ensure token is not empty or just whitespace
                                current_tokens.append(token_text)
                    logger.debug(
                        f"WORD TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                        f"{current_tokens[:30]}"
                    )
                elif mode == "syllable":
                    # Extract syllables from each botok token, falling back to
                    # the token's full text when no syllable data is present.
                    for item_idx, w in enumerate(raw_botok_items):
                        if hasattr(w, 'syls') and w.syls:
                            for syl_idx, syl_item in enumerate(w.syls):
                                syllable_to_process = None
                                if isinstance(syl_item, str):
                                    syllable_to_process = syl_item
                                elif isinstance(syl_item, list):
                                    try:
                                        syllable_to_process = "".join(syl_item)
                                    except TypeError:
                                        logger.warning(
                                            f"Syllable item in w.syls was a list, but could not be joined (non-string elements?): {syl_item} "
                                            f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                            f"for hash {text_hash[:8]}. Skipping this syllable."
                                        )
                                        continue

                                if syllable_to_process is not None:
                                    stripped_syl = syllable_to_process.strip()
                                    if stripped_syl:
                                        current_tokens.append(stripped_syl)
                                elif syl_item is not None:
                                    logger.warning(
                                        f"Unexpected type for syllable item (neither str nor list): {type(syl_item)} ('{str(syl_item)[:100]}') "
                                        f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                        f"for hash {text_hash[:8]}. Skipping this syllable."
                                    )
                        elif hasattr(w, 'text') and w.text:  # Fallback if no 'syls' but in syllable mode
                            if isinstance(w.text, str):
                                token_text = w.text.strip()
                                if token_text:
                                    current_tokens.append(token_text)  # Treat as a single syllable/token
                            elif w.text is not None:
                                logger.warning(
                                    f"Unexpected type for w.text (in syllable mode fallback): {type(w.text)} ('{str(w.text)[:100]}') "
                                    f"for item {item_idx} (POS: {getattr(w, 'pos', 'N/A')}) "
                                    f"for hash {text_hash[:8]}. Skipping this token."
                                )
                    logger.debug(
                        f"SYLLABLE TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                        f"{current_tokens[:30]}"
                    )
                tokens = current_tokens
                
                # Store in cache if not empty
                if tokens:
                    # If the cache is full, evict the first-inserted (oldest)
                    # entry: dicts preserve insertion order in Python 3.7+,
                    # so this is a simple FIFO strategy.
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
                    
                    _tokenization_cache[text_hash] = tokens
                    logger.debug(f"Added tokens to cache with hash {text_hash[:8]}... (mode: {mode})")
            except Exception as e:
                logger.error(f"Error tokenizing text (mode: {mode}): {e}")
                tokens = []
                
        tokenized_texts_list.append(tokens)
        
    return tokenized_texts_list
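

# --- Usage sketch (illustrative only) ---
# A minimal example of calling tokenize_texts, assuming botok is installed
# and its language data is available. The sample string is a common Tibetan
# greeting; the repeated word-mode call demonstrates a cache hit.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    sample_texts = [
        "བཀྲ་ཤིས་བདེ་ལེགས།",  # "Tashi delek" (a common greeting)
        "",  # Empty input yields an empty token list
    ]

    # Syllable-level tokens (the default mode)
    print(tokenize_texts(sample_texts))

    # Word-level tokens; the second call hits the in-memory cache
    print(tokenize_texts(sample_texts, mode="word"))
    print(tokenize_texts(sample_texts, mode="word"))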