# pipeline/semantic_embedding.py
import logging
from typing import Any, Callable, List, Optional, Tuple
import numpy as np  # Used for the Optional[np.ndarray] return annotation
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Define the model ID for the Facebook FastText pretrained model
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
# FASTTEXT_MODEL_ID = "fasttext-tibetan" # Removed: Custom model loading to be handled in process.py directly
def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME) -> Tuple[Optional[Any], Optional[str]]:
"""
Loads the Facebook official pre-trained FastText model for Tibetan.
Args:
model_id (str): The model ID. Must be 'facebook-fasttext-pretrained' (DEFAULT_MODEL_NAME).
Returns:
Tuple[Optional[Any], Optional[str]]:
A tuple containing the loaded FastText model and its type ("fasttext"),
or (None, None) if loading fails or model_id is unsupported.
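
    Example (illustrative; assumes the pretrained Tibetan model can be
    downloaded or found locally by load_facebook_official_tibetan_model):
        model, model_type = get_model_and_device()
        if model is not None:
            print(model_type)             # "fasttext"
            print(model.get_dimension())  # embedding dimensionality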
"""
logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)
if model_id == DEFAULT_MODEL_NAME: # DEFAULT_MODEL_NAME is "facebook-fasttext-pretrained"
try:
            # Imported lazily to avoid a circular import if fasttext_embedding also imports from this module
from .fasttext_embedding import load_facebook_official_tibetan_model
model = load_facebook_official_tibetan_model()
if model:
logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
try:
logger.info(f"Model dimensions: {model.get_dimension()}")
# Basic check for model validity via an expected attribute/method
if hasattr(model, 'get_word_vector'):
logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
except Exception as diag_e:
logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
return model, "fasttext"
else:
# This case implies load_facebook_official_tibetan_model returned None without raising an error.
logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
return None, None
except Exception as e:
logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
return None, None
else:
logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
return None, None


def generate_embeddings(
    texts: List[str],
    model: Any,
    tokenize_fn: Optional[Callable] = None,
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    corpus_token_freq=None,
    doc_freq_map=None,
    total_docs_in_corpus: int = 0,
) -> Optional[np.ndarray]:
"""
Generates FastText embeddings for a list of texts.
Args:
        texts (List[str]): A list of texts to embed.
        model: The loaded FastText model.
        tokenize_fn: Optional tokenization function (if different from the default botok-based tokenizer).
use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
corpus_token_freq: Corpus-wide term frequencies for TF-IDF weighted FastText.
doc_freq_map: Document frequency map for TF-IDF weighted FastText.
total_docs_in_corpus: Total documents in corpus for TF-IDF weighted FastText.
Returns:
Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
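
    Example (illustrative; assumes a model obtained from get_model_and_device):
        model, _ = get_model_and_device()
        embeddings = generate_embeddings(["བཀྲ་ཤིས་བདེ་ལེགས།"], model)
        # On success, embeddings.shape == (1, model.get_dimension())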
"""
    if not texts:
        logger.warning("No texts provided to generate_embeddings. Returning None.")
        return None
logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
try:
from .fasttext_embedding import get_batch_embeddings
stopwords_set = None
if use_stopwords:
if use_lite_stopwords:
from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
stopwords_set = TIBETAN_STOPWORDS_LITE_SET
else:
from .stopwords_bo import TIBETAN_STOPWORDS_SET
stopwords_set = TIBETAN_STOPWORDS_SET
embeddings = get_batch_embeddings(
texts,
model,
tokenize_fn=tokenize_fn,
use_stopwords=use_stopwords,
stopwords_set=stopwords_set,
corpus_token_freq=corpus_token_freq,
doc_freq_map=doc_freq_map,
total_docs_in_corpus=total_docs_in_corpus
)
if embeddings is None:
logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
return None
logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
return embeddings
except ImportError:
logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
return None
except Exception as e:
logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
return None


def train_fasttext_model(corpus_texts: List[str], **kwargs) -> Any:
"""
Train a FastText model on the provided corpus texts.
Args:
corpus_texts: List of texts to use for training
**kwargs: Additional parameters for training (dim, epoch, etc.)
    Returns:
        The trained FastText model object.
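
    Example (illustrative; keyword arguments such as dim and epoch are
    forwarded to the underlying trainer):
        model = train_fasttext_model(corpus_texts, dim=100, epoch=5)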
""" # Docstring updated for return type
try:
from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft
corpus_path = prepare_corpus_file(corpus_texts)
model = train_ft(corpus_path=corpus_path, **kwargs)
        return model
except ImportError:
logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
raise # Re-raising to signal critical failure if training components are missing
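

if __name__ == "__main__":
    # Minimal smoke test (illustrative): load the pretrained model and embed two
    # short Tibetan strings. Assumes the Facebook model can be downloaded or
    # found locally by load_facebook_official_tibetan_model().
    demo_model, demo_model_type = get_model_and_device()
    if demo_model is not None:
        demo_embeddings = generate_embeddings(
            ["བཀྲ་ཤིས་བདེ་ལེགས།", "ཐུགས་རྗེ་ཆེ།"], demo_model
        )
        if demo_embeddings is not None:
            logger.info("Smoke test embedding shape: %s", str(demo_embeddings.shape))
    else:
        logger.warning("Smoke test skipped: FastText model could not be loaded.")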