daniel-wojahn committed
Commit: b2ce320 · Parent: d03158d

feat: Enhance embeddings and LLM interpretation
app.py CHANGED
@@ -65,10 +65,14 @@ def main_interface():
     )

     model_dropdown = gr.Dropdown(
-        choices=["Facebook FastText (Pre-trained)"],
+        choices=[
+            "sentence-transformers/LaBSE",
+            "intfloat/e5-base-v2",
+            "Facebook FastText (Pre-trained)"
+        ],
         label="Select Embedding Model",
-        value="Facebook FastText (Pre-trained)",
-        info="Using Facebook's pre-trained FastText model for semantic similarity. Other model options have been removed."
+        value="sentence-transformers/LaBSE",
+        info="Select the embedding model to use for semantic similarity analysis."
     )

     stopwords_dropdown = gr.Dropdown(
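Note that the FastText entry's display label ("Facebook FastText (Pre-trained)") differs from the internal ID ("facebook-fasttext-pretrained") that pipeline/process.py dispatches on, so app.py presumably maps the label before calling process_texts(). A minimal sketch of such a mapping (the helper name is hypothetical, not part of this commit):

# Hypothetical helper; app.py's actual wiring is outside the shown hunk.
# Hugging Face IDs pass through unchanged, while the FastText label is
# translated to the internal ID expected by the loaders.
def dropdown_value_to_model_name(value: str) -> str:
    if value == "Facebook FastText (Pre-trained)":
        return "facebook-fasttext-pretrained"
    return value  # e.g. "sentence-transformers/LaBSE", "intfloat/e5-base-v2"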
pipeline/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (187 Bytes)
 
pipeline/__pycache__/metrics.cpython-310.pyc DELETED
Binary file (7.23 kB)
 
pipeline/__pycache__/process.cpython-310.pyc DELETED
Binary file (3.74 kB)
 
pipeline/__pycache__/semantic_embedding.cpython-310.pyc DELETED
Binary file (4.02 kB)
 
pipeline/__pycache__/tokenize.cpython-310.pyc DELETED
Binary file (1.14 kB)
 
pipeline/__pycache__/upload.cpython-310.pyc DELETED
Binary file (983 Bytes)
 
pipeline/__pycache__/visualize.cpython-310.pyc DELETED
Binary file (4.37 kB)
 
pipeline/fasttext_embedding.py CHANGED
@@ -136,7 +136,7 @@ def train_fasttext_model(
     return model


-def load_facebook_official_tibetan_model() -> Optional[fasttext.FastText._FastText]:
+def _load_facebook_official_tibetan_model() -> Optional[fasttext.FastText._FastText]:
     """
     Downloads (if necessary) and loads the official Facebook FastText Tibetan model.

@@ -424,6 +424,38 @@ def get_batch_embeddings(
     return np.array(embeddings)


+def get_model(model_id: str):
+    """
+    Loads a FastText model based on the provided model ID.
+
+    Args:
+        model_id (str): The identifier for the model to load.
+
+    Returns:
+        Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded model and its type ('fasttext'),
+        or (None, None) if loading fails.
+    """
+    logger.info(f"Attempting to load FastText model: {model_id}")
+
+    if model_id == "facebook-fasttext-pretrained":
+        try:
+            model = _load_facebook_official_tibetan_model()
+            if model:
+                logger.info(f"FastText model '{model_id}' loaded successfully.")
+                return model, "fasttext"
+            else:
+                logger.error(f"Model loading for '{model_id}' returned None.")
+                return None, None
+        except Exception as e:
+            logger.error(f"Failed to load FastText model '{model_id}': {e}", exc_info=True)
+            return None, None
+    # Add logic for other custom models here if needed
+    # elif model_id == "custom-model-name":
+    #     ...
+    else:
+        logger.error(f"Unsupported model_id for get_model: '{model_id}'.")
+        return None, None
+
 def generate_embeddings(
     texts: List[str],
     model: fasttext.FastText._FastText,
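The new get_model() gives the FastText backend the same (model, model_type) loader contract as the sentence-transformer module added below. A usage sketch, assuming the package is importable as pipeline:

from pipeline.fasttext_embedding import get_model

# "facebook-fasttext-pretrained" is the only ID this loader currently accepts.
model, model_type = get_model("facebook-fasttext-pretrained")
if model is None:
    print("FastText model failed to load; semantic similarity is unavailable.")
else:
    print(f"Loaded {model_type} model with {model.get_dimension()} dimensions")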
pipeline/hf_embedding.py ADDED
@@ -0,0 +1,58 @@
+import logging
+from typing import List, Any, Optional, Tuple
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+logger = logging.getLogger(__name__)
+
+# Cache for loaded models
+_model_cache = {}
+
+def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[str]]:
+    """
+    Loads a SentenceTransformer model from the Hugging Face Hub.
+
+    Args:
+        model_id (str): The identifier for the model to load (e.g., 'sentence-transformers/LaBSE').
+
+    Returns:
+        Tuple[Optional[SentenceTransformer], Optional[str]]: A tuple containing the loaded model and its type ('sentence-transformer'),
+        or (None, None) if loading fails.
+    """
+    if model_id in _model_cache:
+        logger.info(f"Returning cached model: {model_id}")
+        return _model_cache[model_id], "sentence-transformer"
+
+    logger.info(f"Loading SentenceTransformer model: {model_id}")
+    try:
+        model = SentenceTransformer(model_id)
+        _model_cache[model_id] = model
+        logger.info(f"Model '{model_id}' loaded successfully.")
+        return model, "sentence-transformer"
+    except Exception as e:
+        logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
+        return None, None
+
+def generate_embeddings(texts: List[str], model: SentenceTransformer) -> Optional[np.ndarray]:
+    """
+    Generates embeddings for a list of texts using a SentenceTransformer model.
+
+    Args:
+        texts (list[str]): A list of texts to embed.
+        model (SentenceTransformer): The loaded SentenceTransformer model.
+
+    Returns:
+        Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
+    """
+    if not texts or not isinstance(model, SentenceTransformer):
+        logger.warning("Invalid input for generating embeddings. Texts list is empty or model is not a SentenceTransformer.")
+        return None
+
+    logger.info(f"Generating embeddings for {len(texts)} texts with {type(model).__name__}...")
+    try:
+        embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+        logger.info(f"Embeddings generated with shape: {embeddings.shape}")
+        return embeddings
+    except Exception as e:
+        logger.error(f"An unexpected error occurred during embedding generation: {e}", exc_info=True)
+        return None
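An end-to-end sketch of the new module (downloading LaBSE requires network access; the Tibetan sample strings are illustrative placeholders):

import numpy as np
from pipeline.hf_embedding import get_model, generate_embeddings

model, model_type = get_model("sentence-transformers/LaBSE")
if model is not None:
    embeddings = generate_embeddings(["བཀྲ་ཤིས་བདེ་ལེགས།", "ཞེས་བྱ་བ་བཞུགས་སོ།"], model)
    if embeddings is not None:
        # Cosine similarity between the two segment embeddings
        sim = np.dot(embeddings[0], embeddings[1]) / (
            np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
        )
        print(f"{model_type} similarity: {sim:.3f}")

Repeated get_model() calls for the same ID are served from _model_cache, so each model is downloaded and initialized at most once per process.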
pipeline/llm_service.py CHANGED
@@ -26,7 +26,7 @@ except ImportError:

 # Constants
 DEFAULT_MAX_TOKENS = 4000
-DEFAULT_MODEL = "mistralai/mistral-7b-instruct"
+DEFAULT_MODEL = "moonshotai/kimi-k2:free"
 DEFAULT_TEMPERATURE = 0.3
 DEFAULT_TOP_P = 0.9

@@ -373,58 +373,17 @@ class LLMService:
         csv_data = df.to_csv(index=False)

         # Create the prompt using the user's template
-        prompt = """You are a specialized text analysis interpreter with expertise in Tibetan textual studies. Your task is to analyze text similarity data from a CSV file and create a clear, narrative explanation for scholars who may not have technical expertise.
-
-<CONTEXT>
-This data comes from a text similarity analysis tool designed for various genres of Tibetan sources including historical, religious, literary, and philosophical texts. The tool compares texts using multiple linguistic metrics:
-- Jaccard Similarity (%): Measures word overlap between texts (higher % = more similar)
-- Normalized LCS: Longest Common Subsequence, measuring sequential text patterns
-- Semantic Similarity: Deep meaning comparison using sentence transformers or fasttext
-- TF-IDF Cosine Similarity: Term frequency-inverse document frequency comparison
-The "Chapter" column indicates which chapter/section of the texts is being compared.
-</CONTEXT>
-
-<INSTRUCTIONS>
-1. Begin by identifying the specific texts being compared in the data (e.g., "Japan13.txt vs Dolanji.txt").
-
-2. Create a dual-layer narrative analysis (800-1000 words) that includes:
-   a) A high-level overview of text similarity patterns accessible to non-technical readers
-   b) A more detailed analysis for scholars interested in specific textual relationships
-
-3. In your analysis:
-   - Summarize overall similarity patterns between the texts across all chapters
-   - Identify which chapters show strongest similarities and differences
-   - Explain whether similarities appear to be more lexical (Jaccard, LCS) or conceptual (Semantic)
-   - Interpret what these patterns might suggest about textual relationships, transmission, or variant histories
-   - Note any interesting anomalies (e.g., chapters with high semantic but low lexical similarity)
-
-4. Structure your analysis with:
-   - An introduction explaining the texts compared and general observations
-   - A section on overall patterns across all chapters with visualized trends
-   - A detailed examination of 2-3 notable chapters (highest/lowest similarity)
-   - A discussion of what different metrics reveal about textual relationships
-   - A conclusion suggesting what these patterns might mean for Tibetan textual scholarship
-   - 2-3 specific questions these findings raise for further investigation
-
-5. Connect your analysis to common interests in Tibetan textual studies such as:
-   - Textual transmission and lineages
-   - Regional variants and dialectical differences
-   - Potential historical relationships between texts
-   - Original vs. commentary material identification
-
-6. Consider using a "family tree" analogy to make the textual relationships more intuitive. For example:
-   - Texts with very high similarity (>80%) might be described as "siblings" from the same direct source
-   - Texts with moderate similarity (50-80%) could be "cousins" sharing a common ancestor but with separate development
-   - Texts with low similarity (<50%) might be "distant relatives" with only fundamental connections
-   Use this metaphor if it helps clarify the relationships, but don't force it if another explanation would be clearer.
-
-7. **Important note on perfect or zero similarity matches:**
-   If you notice that all metrics indicate perfect or near-perfect similarity (for example, scores of 1.0/100 across all metrics for a chapter) or 0 for a complete mismatch, this may not indicate true textual identity or lack thereof. Instead, it likely means both corresponding text cells were empty or contained no content. In these cases, be sure to clarify in your narrative that such results are *artifacts of missing data, not genuine textual matches*, and should be interpreted with caution.
-
-8. Balance scholarly precision with accessibility, explaining technical concepts when necessary while keeping the overall narrative engaging for non-technical readers.
-</INSTRUCTIONS>
-
-Here is the CSV data to analyze:
+        prompt = """Please analyze the following CSV data, which contains text similarity metrics for Tibetan texts.
+
+Your analysis should be a clear, narrative explanation (800-1000 words) suitable for scholars of Tibetan studies. The analysis should cover:
+1. An introduction to the texts being compared and a high-level overview of the similarity patterns.
+2. A detailed examination of the most and least similar chapters, explaining whether the similarities are more lexical (Jaccard, LCS) or conceptual (Semantic).
+3. An interpretation of what these patterns suggest about the texts' relationships, such as their transmission history or potential shared sources.
+4. A conclusion that summarizes your findings and raises 2-3 questions for further scholarly investigation.
+
+**Important:** If you find chapters with perfect (1.0 or 100%) or zero similarity, clarify that this may be due to empty or missing text segments, rather than a true textual match or mismatch.
+
+Here is the CSV data:
 [CSV_DATA]
 """

@@ -435,33 +394,7 @@ Here is the CSV data to analyze:

     def _get_system_prompt(self) -> str:
         """Get the system prompt for the LLM."""
-        return """
-        You are a senior scholar of Tibetan Buddhist texts with expertise in textual criticism and
-        comparative analysis. Your task is to analyze the provided similarity metrics and provide
-        expert-level insights into the relationships between these Tibetan texts.
-
-        CRITICAL INSTRUCTIONS:
-        1. Your analysis MUST be grounded in the specific metrics provided
-        2. Always reference actual text names and metric values when making claims
-        3. Focus on what the data shows, not what it might show
-        4. Be precise and avoid vague or generic statements
-
-        ANALYSIS APPROACH:
-        1. Begin with a brief executive summary of the most significant findings
-        2. Group similar text pairs and explain their relationships
-        3. Highlight any patterns that suggest textual transmission or common sources
-        4. Note any anomalies or unexpected results that merit further investigation
-        5. Provide specific examples from the data to support your analysis
-
-        TIBETAN TEXT-SPECIFIC GUIDANCE:
-        - Consider the implications of shared vocabulary in the context of Tibetan Buddhist literature
-        - Be aware that high LCS scores might indicate shared liturgical or formulaic language
-        - Note that texts with similar Jaccard but different LCS scores might share content but differ in structure
-        - Consider the possibility of text reuse, commentary traditions, or shared sources
-
-        Your analysis should be scholarly but accessible, providing clear insights that would be
-        valuable to researchers studying these texts.
-        """
+        return """You are a senior scholar of Tibetan Buddhist texts, specializing in textual criticism. Your task is to analyze the provided similarity metrics and provide expert insights into the relationships between these texts. Ground your analysis in the data, be precise, and focus on what the metrics reveal about the texts' transmission and history."""

     def _call_openrouter_api(
         self,
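The rewritten prompt keeps the [CSV_DATA] placeholder. The substitution itself is outside the shown hunks, so the following is an assumed sketch of how the template is presumably filled before the OpenRouter call:

import pandas as pd

# Illustrative data frame; the real one comes from the similarity pipeline.
df = pd.DataFrame({"Text Pair": ["Japan13.txt vs Dolanji.txt"],
                   "Jaccard Similarity (%)": [42.0]})
prompt_template = "Please analyze the following CSV data...\n[CSV_DATA]\n"
full_prompt = prompt_template.replace("[CSV_DATA]", df.to_csv(index=False))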
pipeline/metrics.py CHANGED
@@ -3,7 +3,9 @@ import pandas as pd
 from typing import List, Dict, Union
 from itertools import combinations
 from sklearn.metrics.pairwise import cosine_similarity
-from .semantic_embedding import generate_embeddings
+from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
+from .hf_embedding import generate_embeddings as generate_hf_embeddings
+
 from .tokenize import tokenize_texts
 import logging
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -99,20 +101,19 @@ def compute_semantic_similarity(
         )
         return None

-    embedding = generate_embeddings(
-        texts=[raw_text_segment],
-        model=model_obj,
-        tokenize_fn=tokenize_fn_param,
-        use_stopwords=use_stopwords_param,
-        use_lite_stopwords=use_lite_stopwords_param,
-        corpus_token_freq=term_freq_corpus_param,
-        doc_freq_map=doc_freq_map_param,
-        total_docs_in_corpus=total_docs_in_corpus_param
-    )
+    if model_type == "fasttext":
+        embedding = generate_fasttext_embeddings(
+            texts=[raw_text_segment],
+            model=model_obj,
+            use_stopwords=use_stopwords_param,
+            use_lite_stopwords=use_lite_stopwords_param
+        )
+    elif model_type == "sentence-transformer":
+        embedding = generate_hf_embeddings(texts=[raw_text_segment], model=model_obj)

     if embedding is None or embedding.size == 0:
         logger.error(
-            f"Failed to generate FastText embedding for text: {raw_text_segment[:100]}..."
+            f"Failed to generate embedding for text: {raw_text_segment[:100]}..."
         )
         return None
     return embedding
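Both backends return a numpy array, so everything downstream of this branch is backend-agnostic. One caveat: embedding is assigned only inside the two branches, so an unrecognized model_type would raise a NameError at the subsequent None check. A condensed sketch of the dispatch with a defensive default (the wrapper function is illustrative, not the committed code):

from pipeline.fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
from pipeline.hf_embedding import generate_embeddings as generate_hf_embeddings

def embed_segment(raw_text_segment: str, model_obj, model_type: str):
    # The None default guards against unknown model_type values, which the
    # committed branches do not handle explicitly.
    embedding = None
    if model_type == "fasttext":
        embedding = generate_fasttext_embeddings(
            texts=[raw_text_segment],
            model=model_obj,
            use_stopwords=True,
            use_lite_stopwords=False,
        )
    elif model_type == "sentence-transformer":
        embedding = generate_hf_embeddings(texts=[raw_text_segment], model=model_obj)
    return embedding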
pipeline/process.py CHANGED
@@ -1,8 +1,9 @@
 import pandas as pd
 from typing import Dict, List, Tuple
 from .metrics import compute_all_metrics
-from .semantic_embedding import get_model_and_device
-from .fasttext_embedding import load_fasttext_model  # Added for custom fasttext
+from .fasttext_embedding import get_model as get_fasttext_model
+from .hf_embedding import get_model as get_hf_model
+from .fasttext_embedding import load_fasttext_model
 from .tokenize import tokenize_texts
 import logging
 from itertools import combinations
@@ -51,7 +52,7 @@ def process_texts(
     text_data: Dict[str, str],
     filenames: List[str],
     enable_semantic: bool = True,
-    model_name: str = "buddhist-nlp/buddhist-sentence-similarity",
+    model_name: str = "facebook-fasttext-pretrained",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
     progress_callback = None
@@ -65,7 +66,7 @@ def process_texts(
         enable_semantic (bool, optional): Whether to compute semantic similarity metrics.
             Requires loading a sentence transformer model, which can be time-consuming. Defaults to True.
         model_name (str, optional): The name of the sentence transformer model to use for semantic similarity.
-            Must be a valid model identifier on Hugging Face. Defaults to "buddhist-nlp/buddhist-sentence-similarity".
+            Must be a valid model identifier on Hugging Face. Defaults to "facebook-fasttext-pretrained".
         use_stopwords (bool, optional): Whether to use stopwords in the metrics calculation. Defaults to True.
         use_lite_stopwords (bool, optional): Whether to use the lite stopwords list (common particles only)
             instead of the comprehensive list. Only applies if use_stopwords is True. Defaults to False.
@@ -101,57 +102,20 @@ def process_texts(
     if enable_semantic:
         logger.info("Semantic similarity enabled. Loading embedding model...")
         try:
-            logger.info("Using model: %s", model_name)
-
-            if model_name == FASTTEXT_MODEL_ID:  # FASTTEXT_MODEL_ID is 'fasttext-tibetan'
-                logger.info(f"Attempting to load custom FastText model: {model_name}")
-                if progress_callback is not None:
-                    try:
-                        progress_callback(0.25, desc=f"Loading custom FastText model: {model_name}...")
-                    except Exception as e:
-                        logger.warning(f"Progress callback error (non-critical): {e}")
-
-                loaded_custom_model = load_fasttext_model(model_id=model_name)  # model_id is expected to be path or key by this func
-                if loaded_custom_model:
-                    model = loaded_custom_model
-                    model_type = "fasttext"
-                    logger.info(f"Custom FastText model '{model_name}' loaded successfully.")
-                    if progress_callback is not None:
-                        try:
-                            progress_callback(0.3, desc=f"Custom FastText model '{model_name}' loaded.")
-                        except Exception as e:
-                            logger.warning(f"Progress callback error (non-critical): {e}")
-                else:
-                    model_warning = f"Custom FastText model ('{model_name}') failed to load. Semantic similarity will be disabled."
-                    logger.warning(model_warning)
-                    enable_semantic = False
-
-            elif model_name == "facebook-fasttext-pretrained":
-                logger.info(f"Attempting to load Facebook FastText model: {model_name}")
-                if progress_callback is not None:
-                    try:
-                        progress_callback(0.25, desc=f"Loading Facebook FastText model: {model_name}...")
-                    except Exception as e:
-                        logger.warning(f"Progress callback error (non-critical): {e}")
-
-                fb_model, fb_model_type = get_model_and_device(model_id=model_name)  # from semantic_embedding
-                if fb_model:
-                    model = fb_model
-                    model_type = fb_model_type  # Should be "fasttext"
-                    logger.info(f"Facebook FastText model '{model_name}' (type: {model_type}) loaded successfully.")
-                    if progress_callback is not None:
-                        try:
-                            progress_callback(0.3, desc=f"Facebook FastText model '{model_name}' loaded.")
-                        except Exception as e:
-                            logger.warning(f"Progress callback error (non-critical): {e}")
-                else:
-                    model_warning = f"Facebook FastText model ('{model_name}') failed to load. Semantic similarity will be disabled."
-                    logger.warning(model_warning)
-                    enable_semantic = False
-
-            else:  # Any other model_name is unsupported
-                model_warning = f"Unsupported model_name: '{model_name}'. Semantic similarity will be disabled. Supported models are '{FASTTEXT_MODEL_ID}' and 'facebook-fasttext-pretrained'."
+            logger.info(f"Using model: {model_name}")
+            if 'e5-base' in model_name or 'LaBSE' in model_name:
+                model, model_type = get_hf_model(model_id=model_name)
+            else:
+                model, model_type = get_fasttext_model(model_id=model_name)
+
+            if model:
+                logger.info(f"Model '{model_name}' (type: {model_type}) loaded successfully.")
+                if progress_callback is not None:
+                    progress_callback(0.3, desc=f"Model '{model_name}' loaded.")
+            else:
+                model_warning = f"Model ('{model_name}') failed to load. Semantic similarity will be disabled."
                 logger.warning(model_warning)
+                warning += f" {model_warning}"
                 enable_semantic = False
                 if progress_callback is not None:
                     try:
@@ -320,8 +284,7 @@ def process_texts(
         enable_semantic=enable_semantic,
         model_type=model_type,
         use_stopwords=use_stopwords,
-        use_lite_stopwords=use_lite_stopwords,
-        fasttext_tokenize_fn=tokenizer_for_fasttext
+        use_lite_stopwords=use_lite_stopwords
     )

     # Rename 'Text Pair' to show file stems and chapter number
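A usage sketch of the updated entry point (the return value and full signature are not shown in these hunks, so they are left unannotated; the Tibetan strings are placeholders):

from pipeline.process import process_texts

text_data = {
    "Japan13.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ...",
    "Dolanji.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ...",
}
results = process_texts(
    text_data=text_data,
    filenames=list(text_data),
    enable_semantic=True,
    model_name="sentence-transformers/LaBSE",  # routed to get_hf_model by the 'LaBSE' check
)

Note that routing is substring-based: any model_name containing neither 'e5-base' nor 'LaBSE' falls through to the FastText loader, which in turn accepts only "facebook-fasttext-pretrained".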
pipeline/semantic_embedding.py DELETED
@@ -1,141 +0,0 @@
-import logging
-from typing import List, Any, Optional
-import numpy as np  # Added for type hinting Optional[np.ndarray]
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-# Define the model ID for the Facebook FastText pretrained model
-DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
-
-# FASTTEXT_MODEL_ID = "fasttext-tibetan"  # Removed: Custom model loading to be handled in process.py directly
-
-
-def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
-    """
-    Loads the Facebook official pre-trained FastText model for Tibetan.
-
-    Args:
-        model_id (str): The model ID. Must be 'facebook-fasttext-pretrained' (DEFAULT_MODEL_NAME).
-
-    Returns:
-        Tuple[Optional[Any], Optional[str]]:
-            A tuple containing the loaded FastText model and its type ("fasttext"),
-            or (None, None) if loading fails or model_id is unsupported.
-    """
-    logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)
-
-    if model_id == DEFAULT_MODEL_NAME:  # DEFAULT_MODEL_NAME is "facebook-fasttext-pretrained"
-        try:
-            # Importing here to minimize issues if fasttext_embedding also imports from semantic_embedding
-            from .fasttext_embedding import load_facebook_official_tibetan_model
-
-            model = load_facebook_official_tibetan_model()
-
-            if model:
-                logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
-                try:
-                    logger.info(f"Model dimensions: {model.get_dimension()}")
-                    # Basic check for model validity via an expected attribute/method
-                    if hasattr(model, 'get_word_vector'):
-                        logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
-                except Exception as diag_e:
-                    logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
-                return model, "fasttext"
-            else:
-                # This case implies load_facebook_official_tibetan_model returned None without raising an error.
-                logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
-                return None, None
-        except Exception as e:
-            logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
-            return None, None
-    else:
-        logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
-        return None, None
-
-
-def generate_embeddings(texts: List[str], model: Any, tokenize_fn=None, use_stopwords: bool = True, use_lite_stopwords: bool = False, corpus_token_freq=None, doc_freq_map=None, total_docs_in_corpus=0) -> Optional[np.ndarray]:
-    """
-    Generates FastText embeddings for a list of texts.
-
-    Args:
-        texts (list[str]): A list of texts to embed.
-        model: The loaded FastText model.
-        tokenize_fn: Optional tokenization function for FastText (if different from default botok based).
-        use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
-        use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
-        corpus_token_freq: Corpus-wide term frequencies for TF-IDF weighted FastText.
-        doc_freq_map: Document frequency map for TF-IDF weighted FastText.
-        total_docs_in_corpus: Total documents in corpus for TF-IDF weighted FastText.
-
-    Returns:
-        Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
-    """
-    if not texts:
-        logger.warning(
-            "No texts provided to generate_embeddings. Returning None."
-        )
-        return None
-
-    logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
-
-    try:
-        from .fasttext_embedding import get_batch_embeddings
-
-        stopwords_set = None
-        if use_stopwords:
-            if use_lite_stopwords:
-                from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
-                stopwords_set = TIBETAN_STOPWORDS_LITE_SET
-            else:
-                from .stopwords_bo import TIBETAN_STOPWORDS_SET
-                stopwords_set = TIBETAN_STOPWORDS_SET
-
-        embeddings = get_batch_embeddings(
-            texts,
-            model,
-            tokenize_fn=tokenize_fn,
-            use_stopwords=use_stopwords,
-            stopwords_set=stopwords_set,
-            corpus_token_freq=corpus_token_freq,
-            doc_freq_map=doc_freq_map,
-            total_docs_in_corpus=total_docs_in_corpus
-        )
-        if embeddings is None:
-            logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
-            return None
-
-        logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
-        return embeddings
-    except ImportError:
-        logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
-        return None
-    except Exception as e:
-        logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
-        return None
-
-
-def train_fasttext_model(corpus_texts: List[str], **kwargs):
-    """
-    Train a FastText model on the provided corpus texts.
-
-    Args:
-        corpus_texts: List of texts to use for training
-        **kwargs: Additional parameters for training (dim, epoch, etc.)
-
-    Returns:
-        Trained model and path to the model file (Note: current implementation returns only model object)
-    """  # Docstring updated for return type
-    try:
-        from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft
-
-        corpus_path = prepare_corpus_file(corpus_texts)
-        model = train_ft(corpus_path=corpus_path, **kwargs)
-
-        return model  # Returns model object, not path as previously suggested by older docstring
-    except ImportError:
-        logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
-        raise  # Re-raising to signal critical failure if training components are missing
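For former callers of this module, a migration sketch (the equivalences are assumed from the hunks above, not stated in the commit):

# before (deleted):
#   from .semantic_embedding import get_model_and_device, generate_embeddings
# after (per this commit):
from pipeline.fasttext_embedding import get_model as get_fasttext_model
from pipeline.hf_embedding import get_model as get_hf_model

model, model_type = get_fasttext_model("facebook-fasttext-pretrained")

The TF-IDF weighting parameters of the deleted generate_embeddings (corpus_token_freq, doc_freq_map, total_docs_in_corpus) are no longer passed by metrics.py; whether fasttext_embedding.generate_embeddings still accepts them is not visible in these hunks.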
requirements.txt CHANGED
@@ -12,6 +12,7 @@ numpy==1.26.4
 scikit-learn==1.6.1
 numba==0.61.2
 fasttext==0.9.2
+sentence-transformers==3.0.1

 # Tibetan language processing
 botok