# ttm-webapp-hf: pipeline/tokenize.py
from typing import List, Dict
import hashlib
import logging
# Configure logging
logger = logging.getLogger(__name__)
# Initialize a cache for tokenization results
# Using a simple in-memory dictionary with text hash as key
_tokenization_cache: Dict[str, List[str]] = {}
# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000
try:
from botok import WordTokenizer
# Initialize the tokenizer once at the module level
BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
# Handle the case where botok might not be installed,
# though it's a core dependency for this app.
BOTOK_TOKENIZER = None
logger.error("botok library not found. Tokenization will fail.")
# Optionally, raise an error here if botok is absolutely critical for the app to even start
# raise ImportError("botok is required for tokenization. Please install it.")
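# Note: botok is distributed on PyPI, so a missing dependency can usually be
# resolved with `pip install botok` (version pinning belongs in the project's
# requirements, not here).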
def _get_text_hash(text: str) -> str:
"""
Generate a hash for the input text to use as a cache key.
Args:
text: The input text to hash
Returns:
A string representation of the MD5 hash of the input text
"""
return hashlib.md5(text.encode('utf-8')).hexdigest()
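# Illustrative check (not used by the pipeline): MD5 digests are deterministic
# across runs, e.g. _get_text_hash("abc") returns
# "900150983cd24fb0d6963f7d28e17f72".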
def tokenize_texts(texts: List[str], mode: str = "syllable") -> List[List[str]]:
"""
Tokenizes a list of raw Tibetan texts using botok, with caching for performance.
    This function maintains an in-memory cache of previously tokenized texts to avoid
    redundant processing of the same content. The cache key is the MD5 hash of the
    input text combined with the tokenization mode.
    Args:
        texts: List of raw text strings to tokenize.
        mode: Tokenization granularity, either "word" or "syllable". Any other
            value falls back to "syllable" with a warning.
    Returns:
        List of tokenized texts (each as a list of tokens).
Raises:
RuntimeError: If the botok tokenizer failed to initialize.
"""
if BOTOK_TOKENIZER is None:
# This case should ideally be handled more gracefully,
# perhaps by preventing analysis if the tokenizer failed to load.
raise RuntimeError(
"Botok tokenizer failed to initialize. Cannot tokenize texts."
)
tokenized_texts_list = []
if mode not in ["word", "syllable"]:
logger.warning(f"Invalid tokenization mode: '{mode}'. Defaulting to 'syllable'.")
mode = "syllable"
# Process each text
for text_content in texts:
# Skip empty texts
if not text_content.strip():
tokenized_texts_list.append([])
continue
# Generate hash for cache lookup
cache_key_string = text_content + f"_{mode}" # Include mode in string for hashing
text_hash = _get_text_hash(cache_key_string)
# Check if we have this text in cache
if text_hash in _tokenization_cache:
# Cache hit - use cached tokens
tokens = _tokenization_cache[text_hash]
logger.debug(f"Cache hit for text hash {text_hash[:8]}... (mode: {mode})")
else:
# Cache miss - tokenize and store in cache
try:
current_tokens = []
if BOTOK_TOKENIZER:
raw_botok_items = list(BOTOK_TOKENIZER.tokenize(text_content))
if mode == "word":
for item_idx, w in enumerate(raw_botok_items):
if hasattr(w, 'text') and isinstance(w.text, str):
token_text = w.text.strip()
if token_text: # Ensure token is not empty or just whitespace
current_tokens.append(token_text)
# Optionally log if w.text is not a string or missing, for debugging
# elif w.text is not None:
# logger.debug(f"Token item {item_idx} has non-string text {type(w.text)} for hash {text_hash[:8]}. Skipping word.")
# else:
# logger.debug(f"Token item {item_idx} missing text attribute for hash {text_hash[:8]}. Skipping word.")
logger.debug(
f"WORD TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
f"{current_tokens[:30]}"
)
elif mode == "syllable":
# This is the original syllable extraction logic
for item_idx, w in enumerate(raw_botok_items):
if hasattr(w, 'syls') and w.syls:
for syl_idx, syl_item in enumerate(w.syls):
syllable_to_process = None
if isinstance(syl_item, str):
syllable_to_process = syl_item
elif isinstance(syl_item, list):
try:
syllable_to_process = "".join(syl_item)
except TypeError:
logger.warning(
f"Syllable item in w.syls was a list, but could not be joined (non-string elements?): {syl_item} "
f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
f"for hash {text_hash[:8]}. Skipping this syllable."
)
continue
if syllable_to_process is not None:
stripped_syl = syllable_to_process.strip()
if stripped_syl:
current_tokens.append(stripped_syl)
elif syl_item is not None:
logger.warning(
f"Unexpected type for syllable item (neither str nor list): {type(syl_item)} ('{str(syl_item)[:100]}') "
f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
f"for hash {text_hash[:8]}. Skipping this syllable."
)
elif hasattr(w, 'text') and w.text: # Fallback if no 'syls' but in syllable mode
if isinstance(w.text, str):
token_text = w.text.strip()
if token_text:
current_tokens.append(token_text) # Treat as a single syllable/token
elif w.text is not None:
logger.warning(
f"Unexpected type for w.text (in syllable mode fallback): {type(w.text)} ('{str(w.text)[:100]}') "
f"for item {item_idx} (POS: {getattr(w, 'pos', 'N/A')}) "
f"for hash {text_hash[:8]}. Skipping this token."
)
logger.debug(
f"SYLLABLE TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
f"{current_tokens[:30]}"
)
tokens = current_tokens
else:
logger.error(f"BOTOK_TOKENIZER is None for text hash {text_hash[:8]}, cannot tokenize (mode: {mode}).")
tokens = []
# Store in cache if not empty
if tokens:
                    # If the cache is full, evict the first-inserted entry. Since
                    # Python 3.7+ dicts preserve insertion order, this is a simple
                    # FIFO eviction rather than a truly random removal.
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
_tokenization_cache[text_hash] = tokens
logger.debug(f"Added tokens to cache with hash {text_hash[:8]}... (mode: {mode})")
except Exception as e:
logger.error(f"Error tokenizing text (mode: {mode}): {e}")
tokens = []
tokenized_texts_list.append(tokens)
return tokenized_texts_list
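# Minimal usage sketch, assuming botok is installed. The sample string and the
# logging setup below are illustrative additions, not part of the pipeline, and
# the exact token boundaries are decided by botok's segmentation.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    sample = "བཀྲ་ཤིས་བདེ་ལེགས།"  # short Tibetan greeting used as demo input
    for demo_mode in ("syllable", "word"):
        result = tokenize_texts([sample], mode=demo_mode)
        print(f"{demo_mode}: {len(result[0])} tokens -> {result[0]}")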