Spaces:

daniel-wojahn
/

ttm-webapp-hf

Running

File size: 6,580 Bytes

import logging
from typing import List, Any, Optional
import numpy as np # Added for type hinting Optional[np.ndarray]

# Root logging setup for this module: INFO and above, rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Identifier of the Facebook FastText pretrained model; it is the default
# (and only accepted) model_id for get_model_and_device below.
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"

# NOTE: a separate FASTTEXT_MODEL_ID ("fasttext-tibetan") constant was removed
# from this module; custom model loading is meant to be handled in process.py
# directly.


def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
    """
    Load the Facebook official pre-trained FastText model for Tibetan.

    The function never raises: every failure path is logged and reported to
    the caller as ``(None, None)``.

    Args:
        model_id (str): Identifier of the model to load. Only
            DEFAULT_MODEL_NAME ('facebook-fasttext-pretrained') is accepted.

    Returns:
        Tuple[Optional[Any], Optional[str]]:
            ``(model, "fasttext")`` on success, or ``(None, None)`` when the
            model_id is unsupported, the loader returns a falsy value, or the
            loader raises.
    """
    logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)

    # Guard clause: reject anything other than the single supported model ID.
    if model_id != DEFAULT_MODEL_NAME:
        logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
        return None, None

    try:
        # Deferred import: keeps this module importable even if
        # fasttext_embedding itself imports from semantic_embedding.
        from .fasttext_embedding import load_facebook_official_tibetan_model

        model = load_facebook_official_tibetan_model()

        if not model:
            # The loader came back empty-handed without raising.
            logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
            return None, None

        logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")

        # Best-effort diagnostics only: a failure here is logged, but the
        # model is still handed back to the caller.
        try:
            logger.info(f"Model dimensions: {model.get_dimension()}")
            if hasattr(model, "get_word_vector"):
                logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
        except Exception as diag_e:
            logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)

        return model, "fasttext"
    except Exception as e:
        # Covers import failures as well as errors raised by the loader.
        logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
        return None, None


def generate_embeddings(
    texts: List[str],
    model: Any,
    tokenize_fn=None,
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    corpus_token_freq=None,
    doc_freq_map=None,
    total_docs_in_corpus=0,
) -> Optional[np.ndarray]:
    """
    Generates FastText embeddings for a list of texts.

    Thin wrapper around ``fasttext_embedding.get_batch_embeddings``: it selects
    the stopword set, forwards all arguments, and converts every failure into
    a logged error plus a ``None`` return instead of raising.

    Args:
        texts (list[str]): A list of texts to embed. An empty list yields None.
        model: The loaded FastText model, passed through to get_batch_embeddings.
        tokenize_fn: Optional tokenization function for FastText (if different from default botok based).
        use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
        use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
            Only consulted when use_stopwords is True.
        corpus_token_freq: Corpus-wide term frequencies for TF-IDF weighted FastText.
        doc_freq_map: Document frequency map for TF-IDF weighted FastText.
        total_docs_in_corpus: Total documents in corpus for TF-IDF weighted FastText.

    Returns:
        Optional[np.ndarray]: A numpy array containing the embeddings. Returns
        None if no texts were given, if get_batch_embeddings returns None, or
        if any import or runtime error occurs.
    """
    if not texts:
        logger.warning(
            "No texts provided to generate_embeddings. Returning None."
        )
        return None

    logger.info(f"Generating FastText embeddings for {len(texts)} texts...")

    try:
        # Project-local imports are deferred so that a missing optional module
        # degrades to a logged error rather than breaking module import.
        from .fasttext_embedding import get_batch_embeddings

        # Pick the stopword set; it stays None when filtering is disabled.
        stopwords_set = None
        if use_stopwords:
            if use_lite_stopwords:
                from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
                stopwords_set = TIBETAN_STOPWORDS_LITE_SET
            else:
                from .stopwords_bo import TIBETAN_STOPWORDS_SET
                stopwords_set = TIBETAN_STOPWORDS_SET

        embeddings = get_batch_embeddings(
            texts,
            model,
            tokenize_fn=tokenize_fn,
            use_stopwords=use_stopwords,
            stopwords_set=stopwords_set,
            corpus_token_freq=corpus_token_freq,
            doc_freq_map=doc_freq_map,
            total_docs_in_corpus=total_docs_in_corpus,
        )
        if embeddings is None:
            logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
            return None

        logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
        return embeddings
    except ImportError:
        # The failing import may be fasttext_embedding or one of the stopword
        # modules, so attach the traceback: the fixed message alone cannot
        # tell the reader which module was actually missing.
        logger.error(
            "Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.",
            exc_info=True,
        )
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
        return None


def train_fasttext_model(corpus_texts: List[str], **kwargs):
    """
    Train a FastText model on the provided corpus texts.

    The texts are first handed to ``fasttext_embedding.prepare_corpus_file``;
    the value it returns is passed as ``corpus_path`` to
    ``fasttext_embedding.train_fasttext_model``.

    Args:
        corpus_texts: List of texts to use for training.
        **kwargs: Additional training parameters (dim, epoch, etc.), forwarded
            unchanged to the underlying trainer.

    Returns:
        Whatever the underlying trainer returns (the trained model object).
        No model file path is returned.

    Raises:
        ImportError: Logged and then re-raised, so callers can tell that the
            training components are missing.
    """
    try:
        # Deferred imports: the training helpers are only needed on this path.
        from .fasttext_embedding import prepare_corpus_file
        from .fasttext_embedding import train_fasttext_model as train_ft

        prepared_corpus = prepare_corpus_file(corpus_texts)
        return train_ft(corpus_path=prepared_corpus, **kwargs)
    except ImportError:
        logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
        # Propagate: a missing training dependency is a critical failure.
        raise