# pipeline/semantic_embedding.py
import logging
from typing import Any, Callable, List, Optional, Tuple
import numpy as np  # Used for the Optional[np.ndarray] return annotation
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Define the model ID for the Facebook FastText pretrained model
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
# FASTTEXT_MODEL_ID = "fasttext-tibetan" # Removed: Custom model loading to be handled in process.py directly
def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME) -> Tuple[Optional[Any], Optional[str]]:
"""
Loads the Facebook official pre-trained FastText model for Tibetan.
Args:
model_id (str): The model ID. Must be 'facebook-fasttext-pretrained' (DEFAULT_MODEL_NAME).
Returns:
Tuple[Optional[Any], Optional[str]]:
A tuple containing the loaded FastText model and its type ("fasttext"),
or (None, None) if loading fails or model_id is unsupported.
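
    Example (illustrative; assumes the pretrained Tibetan model can be
    downloaded or found locally by load_facebook_official_tibetan_model):
        model, model_type = get_model_and_device()
        if model is not None:
            print(model_type)             # "fasttext"
            print(model.get_dimension())  # embedding dimensionality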
"""
logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)
if model_id == DEFAULT_MODEL_NAME: # DEFAULT_MODEL_NAME is "facebook-fasttext-pretrained"
try:
            # Imported lazily to avoid a circular import if fasttext_embedding also imports from this module
from .fasttext_embedding import load_facebook_official_tibetan_model
model = load_facebook_official_tibetan_model()
if model:
logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
try:
logger.info(f"Model dimensions: {model.get_dimension()}")
# Basic check for model validity via an expected attribute/method
if hasattr(model, 'get_word_vector'):
logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
except Exception as diag_e:
logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
return model, "fasttext"
else:
# This case implies load_facebook_official_tibetan_model returned None without raising an error.
logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
return None, None
except Exception as e:
logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
return None, None
else:
logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
return None, None


def generate_embeddings(
    texts: List[str],
    model: Any,
    tokenize_fn: Optional[Callable] = None,
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    corpus_token_freq=None,
    doc_freq_map=None,
    total_docs_in_corpus: int = 0,
) -> Optional[np.ndarray]:
"""
Generates FastText embeddings for a list of texts.
Args:
        texts (List[str]): A list of texts to embed.
        model: The loaded FastText model.
        tokenize_fn: Optional tokenization function (if different from the default botok-based tokenizer).
use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
corpus_token_freq: Corpus-wide term frequencies for TF-IDF weighted FastText.
doc_freq_map: Document frequency map for TF-IDF weighted FastText.
total_docs_in_corpus: Total documents in corpus for TF-IDF weighted FastText.
Returns:
Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
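
    Example (illustrative; assumes a model obtained from get_model_and_device):
        model, _ = get_model_and_device()
        embeddings = generate_embeddings(["བཀྲ་ཤིས་བདེ་ལེགས།"], model)
        # On success, embeddings.shape == (1, model.get_dimension())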
"""
    if not texts:
        logger.warning("No texts provided to generate_embeddings. Returning None.")
        return None
logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
try:
from .fasttext_embedding import get_batch_embeddings
stopwords_set = None
if use_stopwords:
if use_lite_stopwords:
from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
stopwords_set = TIBETAN_STOPWORDS_LITE_SET
else:
from .stopwords_bo import TIBETAN_STOPWORDS_SET
stopwords_set = TIBETAN_STOPWORDS_SET
embeddings = get_batch_embeddings(
texts,
model,
tokenize_fn=tokenize_fn,
use_stopwords=use_stopwords,
stopwords_set=stopwords_set,
corpus_token_freq=corpus_token_freq,
doc_freq_map=doc_freq_map,
total_docs_in_corpus=total_docs_in_corpus
)
if embeddings is None:
logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
return None
logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
return embeddings
except ImportError:
logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
return None
except Exception as e:
logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
return None


def train_fasttext_model(corpus_texts: List[str], **kwargs) -> Any:
"""
Train a FastText model on the provided corpus texts.
Args:
corpus_texts: List of texts to use for training
**kwargs: Additional parameters for training (dim, epoch, etc.)
    Returns:
        The trained FastText model object.
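
    Example (illustrative; keyword arguments such as dim and epoch are
    forwarded to the underlying trainer):
        model = train_fasttext_model(corpus_texts, dim=100, epoch=5)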
""" # Docstring updated for return type
try:
from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft
corpus_path = prepare_corpus_file(corpus_texts)
model = train_ft(corpus_path=corpus_path, **kwargs)
        return model
except ImportError:
logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
raise # Re-raising to signal critical failure if training components are missing
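

if __name__ == "__main__":
    # Minimal smoke test (illustrative): load the pretrained model and embed two
    # short Tibetan strings. Assumes the Facebook model can be downloaded or
    # found locally by load_facebook_official_tibetan_model().
    demo_model, demo_model_type = get_model_and_device()
    if demo_model is not None:
        demo_embeddings = generate_embeddings(
            ["བཀྲ་ཤིས་བདེ་ལེགས།", "ཐུགས་རྗེ་ཆེ།"], demo_model
        )
        if demo_embeddings is not None:
            logger.info("Smoke test embedding shape: %s", str(demo_embeddings.shape))
    else:
        logger.warning("Smoke test skipped: FastText model could not be loaded.")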