import logging
from typing import List, Any, Optional

import numpy as np  # Added for type hinting Optional[np.ndarray]

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Define the model ID for the Facebook FastText pretrained model
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
# FASTTEXT_MODEL_ID = "fasttext-tibetan"  # Removed: custom model loading is handled in process.py directly


def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
    """
    Loads the Facebook official pre-trained FastText model for Tibetan.

    Args:
        model_id (str): The model ID. Must be 'facebook-fasttext-pretrained' (DEFAULT_MODEL_NAME).

    Returns:
        Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded FastText model
        and its type ("fasttext"), or (None, None) if loading fails or model_id is unsupported.
    """
    logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)

    if model_id == DEFAULT_MODEL_NAME:  # DEFAULT_MODEL_NAME is "facebook-fasttext-pretrained"
        try:
            # Importing here to minimize issues if fasttext_embedding also imports from semantic_embedding
            from .fasttext_embedding import load_facebook_official_tibetan_model

            model = load_facebook_official_tibetan_model()
            if model:
                logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
                try:
                    logger.info(f"Model dimensions: {model.get_dimension()}")
                    # Basic check for model validity via an expected attribute/method
                    if hasattr(model, 'get_word_vector'):
                        logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
                except Exception as diag_e:
                    logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
                return model, "fasttext"
            else:
                # This case implies load_facebook_official_tibetan_model returned None without raising an error.
                logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
                return None, None
        except Exception as e:
            logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
            return None, None
    else:
        logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
        return None, None
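
# Usage sketch (illustrative, not part of the module API): how calling code such as
# process.py is expected to consume this loader. The token below is an example only;
# Facebook's published pretrained Tibetan fastText vectors are 300-dimensional.
#
#     model, model_type = get_model_and_device()
#     if model is not None and model_type == "fasttext":
#         dim = model.get_dimension()         # e.g. 300 for the official pretrained model
#         vec = model.get_word_vector("བོད")  # dense vector for a single token
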
def generate_embeddings(texts: List[str], model: Any, tokenize_fn=None, use_stopwords: bool = True, use_lite_stopwords: bool = False, corpus_token_freq=None, doc_freq_map=None, total_docs_in_corpus=0) -> Optional[np.ndarray]:
    """
    Generates FastText embeddings for a list of texts.

    Args:
        texts (list[str]): A list of texts to embed.
        model: The loaded FastText model.
        tokenize_fn: Optional tokenization function for FastText (if different from the default botok-based one).
        use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
        use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
        corpus_token_freq: Corpus-wide term frequencies for TF-IDF-weighted FastText.
        doc_freq_map: Document frequency map for TF-IDF-weighted FastText.
        total_docs_in_corpus: Total number of documents in the corpus for TF-IDF-weighted FastText.

    Returns:
        Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
    """
    if not texts:
        logger.warning("No texts provided to generate_embeddings. Returning None.")
        return None

    logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
    try:
        from .fasttext_embedding import get_batch_embeddings

        stopwords_set = None
        if use_stopwords:
            if use_lite_stopwords:
                from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
                stopwords_set = TIBETAN_STOPWORDS_LITE_SET
            else:
                from .stopwords_bo import TIBETAN_STOPWORDS_SET
                stopwords_set = TIBETAN_STOPWORDS_SET

        embeddings = get_batch_embeddings(
            texts,
            model,
            tokenize_fn=tokenize_fn,
            use_stopwords=use_stopwords,
            stopwords_set=stopwords_set,
            corpus_token_freq=corpus_token_freq,
            doc_freq_map=doc_freq_map,
            total_docs_in_corpus=total_docs_in_corpus,
        )

        if embeddings is None:
            logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
            return None

        logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
        return embeddings
    except ImportError:
        logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
        return None


def train_fasttext_model(corpus_texts: List[str], **kwargs):
    """
    Train a FastText model on the provided corpus texts.

    Args:
        corpus_texts: List of texts to use for training.
        **kwargs: Additional training parameters (dim, epoch, etc.).

    Returns:
        The trained FastText model object (the path to the model file is not returned).
    """
    try:
        from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft

        corpus_path = prepare_corpus_file(corpus_texts)
        model = train_ft(corpus_path=corpus_path, **kwargs)
        return model  # Returns the model object only, not a path to the model file.
    except ImportError:
        logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
        raise  # Re-raise to signal a critical failure if training components are missing.
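

# The block below is a minimal smoke-test sketch, not part of the library API. It assumes
# this file lives inside a package (run it with `python -m <package>.semantic_embedding`
# so the relative imports resolve) and that the official Facebook pretrained Tibetan model
# is available locally via load_facebook_official_tibetan_model(). The sample strings are
# illustrative only.
if __name__ == "__main__":
    sample_texts = ["བཀྲ་ཤིས་བདེ་ལེགས།", "ཆོས་ཀྱི་རྒྱ་མཚོ།"]
    ft_model, ft_type = get_model_and_device()
    if ft_model is not None and ft_type == "fasttext":
        sample_embeddings = generate_embeddings(sample_texts, ft_model)
        if sample_embeddings is not None:
            logger.info("Smoke test embedding shape: %s", str(sample_embeddings.shape))
    else:
        logger.warning("Smoke test skipped: FastText model could not be loaded.")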