File size: 6,580 Bytes
4bf5701
3011301
 
4bf5701
 
 
 
 
 
 
3011301
 
4bf5701
3011301
4bf5701
b4c92f5
3011301
4bf5701
3011301
4bf5701
 
3011301
4bf5701
 
3011301
 
 
4bf5701
3011301
4bf5701
3011301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bf5701
3011301
4bf5701
 
 
3011301
 
 
 
 
 
 
4bf5701
 
3011301
4bf5701
 
 
3011301
4bf5701
3011301
4bf5701
3011301
4bf5701
3011301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4c92f5
4bf5701
b4c92f5
 
 
 
 
 
 
 
 
3011301
 
b4c92f5
 
 
 
 
 
3011301
b4c92f5
 
3011301
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import logging
from typing import List, Any, Optional
import numpy as np # Added for type hinting Optional[np.ndarray]

# Configure logging at import time: INFO level, with timestamp and level name
# prefixed to every message.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers, so an application that configures logging before importing
# this module keeps its own setup.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

# Model ID for the Facebook FastText pretrained model; it is the default (and
# only accepted) value of get_model_and_device's model_id argument.
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"

# NOTE: The custom model ID (formerly FASTTEXT_MODEL_ID = "fasttext-tibetan") was removed from this module; custom model loading is to be handled directly in process.py.


def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
    """
    Load the Facebook official pre-trained FastText model for Tibetan.

    Only DEFAULT_MODEL_NAME ('facebook-fasttext-pretrained') is accepted; any
    other model_id is rejected with an error log. The function does not raise
    for loading problems: exceptions from the loader are caught, logged with a
    traceback, and reported through the return value.

    Args:
        model_id (str): The model ID. Must equal DEFAULT_MODEL_NAME.

    Returns:
        Tuple[Optional[Any], Optional[str]]:
            (model, "fasttext") when the loader returns a truthy model object,
            otherwise (None, None) -- for an unsupported model_id, a falsy
            loader result, or any exception raised while loading.
    """
    logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)

    # Guard clause: reject every model ID except the single supported one.
    if model_id != DEFAULT_MODEL_NAME:
        logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
        return None, None

    try:
        # Imported lazily to minimize issues if fasttext_embedding also imports from semantic_embedding.
        from .fasttext_embedding import load_facebook_official_tibetan_model

        model = load_facebook_official_tibetan_model()

        if not model:
            # The loader handed back a falsy value (e.g. None) without raising.
            logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
            return None, None

        logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")

        # Best-effort diagnostics: a failure here is logged but does not
        # prevent the model from being returned to the caller.
        try:
            logger.info(f"Model dimensions: {model.get_dimension()}")
            if hasattr(model, "get_word_vector"):
                logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
        except Exception as diag_e:
            logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)

        return model, "fasttext"
    except Exception as e:
        logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
        return None, None


def generate_embeddings(
    texts: List[str],
    model: Any,
    tokenize_fn=None,
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    corpus_token_freq=None,
    doc_freq_map=None,
    total_docs_in_corpus=0,
) -> Optional[np.ndarray]:
    """
    Generates FastText embeddings for a list of texts.

    Selects the stopword set requested by the flags and delegates the actual
    work to ``fasttext_embedding.get_batch_embeddings``. Exceptions raised
    while importing the helper modules or computing the embeddings are caught,
    logged with a traceback, and turned into a ``None`` return.

    Args:
        texts (list[str]): A list of texts to embed. An empty list yields None.
        model: The loaded FastText model; forwarded unchanged.
        tokenize_fn: Optional tokenization function; forwarded unchanged.
        use_stopwords (bool): Whether a stopword set is selected and passed on.
        use_lite_stopwords (bool): Use TIBETAN_STOPWORDS_LITE_SET instead of
            TIBETAN_STOPWORDS_SET. Only has an effect when use_stopwords is True.
        corpus_token_freq: Forwarded unchanged (documented upstream as
            corpus-wide term frequencies for TF-IDF weighted FastText).
        doc_freq_map: Forwarded unchanged (documented upstream as the document
            frequency map for TF-IDF weighted FastText).
        total_docs_in_corpus: Forwarded unchanged (documented upstream as the
            total number of documents in the corpus).

    Returns:
        Optional[np.ndarray]: The embeddings returned by get_batch_embeddings,
        or None if no texts were given, the helper returned None, or any
        error occurred.
    """
    if not texts:
        logger.warning(
            "No texts provided to generate_embeddings. Returning None."
        )
        return None

    logger.info("Generating FastText embeddings for %s texts...", len(texts))

    try:
        # Local imports keep the module importable when FastText pieces are absent.
        from .fasttext_embedding import get_batch_embeddings

        stopwords_set = None
        if use_stopwords:
            if use_lite_stopwords:
                from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
                stopwords_set = TIBETAN_STOPWORDS_LITE_SET
            else:
                from .stopwords_bo import TIBETAN_STOPWORDS_SET
                stopwords_set = TIBETAN_STOPWORDS_SET

        embeddings = get_batch_embeddings(
            texts,
            model,
            tokenize_fn=tokenize_fn,
            use_stopwords=use_stopwords,
            stopwords_set=stopwords_set,
            corpus_token_freq=corpus_token_freq,
            doc_freq_map=doc_freq_map,
            total_docs_in_corpus=total_docs_in_corpus,
        )
        if embeddings is None:
            logger.error(
                "get_batch_embeddings returned None for %s texts. First few: %s",
                len(texts),
                texts[:2],
            )
            return None

        logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
        return embeddings
    except ImportError:
        # This handler also fires for the stopwords modules and for imports
        # made inside get_batch_embeddings, so include the traceback (as the
        # other handlers in this file do) to show which import actually failed.
        logger.error(
            "Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.",
            exc_info=True,
        )
        return None
    except Exception as e:
        logger.error(
            "An unexpected error occurred during FastText embedding generation: %s",
            e,
            exc_info=True,
        )
        return None


def train_fasttext_model(corpus_texts: List[str], **kwargs):
    """
    Train a FastText model on the provided corpus texts.

    The texts are handed to ``prepare_corpus_file`` and its result is passed
    as ``corpus_path`` to the trainer in ``fasttext_embedding``.

    Args:
        corpus_texts: List of texts to use for training.
        **kwargs: Additional training parameters (dim, epoch, etc.), forwarded
            unchanged to the underlying trainer.

    Returns:
        The value returned by the underlying trainer, i.e. the trained model
        object only. No model file path is returned.

    Raises:
        ImportError: Logged and then re-raised, so missing training
            components surface to the caller as a critical failure.
    """
    try:
        from .fasttext_embedding import prepare_corpus_file
        from .fasttext_embedding import train_fasttext_model as train_ft

        # Prepare the corpus file first, then train on it in a single step.
        return train_ft(corpus_path=prepare_corpus_file(corpus_texts), **kwargs)
    except ImportError:
        logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
        raise  # Propagate: training cannot proceed without these components.