Spaces:
Sleeping
Sleeping
Commit
·
b2ce320
1
Parent(s):
d03158d
feat: Enhance embeddings and LLM interpretation
Browse files- app.py +7 -3
- pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
- pipeline/__pycache__/metrics.cpython-310.pyc +0 -0
- pipeline/__pycache__/process.cpython-310.pyc +0 -0
- pipeline/__pycache__/semantic_embedding.cpython-310.pyc +0 -0
- pipeline/__pycache__/tokenize.cpython-310.pyc +0 -0
- pipeline/__pycache__/upload.cpython-310.pyc +0 -0
- pipeline/__pycache__/visualize.cpython-310.pyc +0 -0
- pipeline/fasttext_embedding.py +33 -1
- pipeline/hf_embedding.py +58 -0
- pipeline/llm_service.py +10 -77
- pipeline/metrics.py +13 -12
- pipeline/process.py +17 -54
- pipeline/semantic_embedding.py +0 -141
- requirements.txt +1 -0
app.py
CHANGED
@@ -65,10 +65,14 @@ def main_interface():
|
|
65 |
)
|
66 |
|
67 |
model_dropdown = gr.Dropdown(
|
68 |
-
choices=[
|
|
|
|
|
|
|
|
|
69 |
label="Select Embedding Model",
|
70 |
-
value="
|
71 |
-
info="
|
72 |
)
|
73 |
|
74 |
stopwords_dropdown = gr.Dropdown(
|
|
|
65 |
)
|
66 |
|
67 |
model_dropdown = gr.Dropdown(
|
68 |
+
choices=[
|
69 |
+
"sentence-transformers/LaBSE",
|
70 |
+
"intfloat/e5-base-v2",
|
71 |
+
"Facebook FastText (Pre-trained)"
|
72 |
+
],
|
73 |
label="Select Embedding Model",
|
74 |
+
value="sentence-transformers/LaBSE",
|
75 |
+
info="Select the embedding model to use for semantic similarity analysis."
|
76 |
)
|
77 |
|
78 |
stopwords_dropdown = gr.Dropdown(
|
pipeline/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (187 Bytes)
|
|
pipeline/__pycache__/metrics.cpython-310.pyc
DELETED
Binary file (7.23 kB)
|
|
pipeline/__pycache__/process.cpython-310.pyc
DELETED
Binary file (3.74 kB)
|
|
pipeline/__pycache__/semantic_embedding.cpython-310.pyc
DELETED
Binary file (4.02 kB)
|
|
pipeline/__pycache__/tokenize.cpython-310.pyc
DELETED
Binary file (1.14 kB)
|
|
pipeline/__pycache__/upload.cpython-310.pyc
DELETED
Binary file (983 Bytes)
|
|
pipeline/__pycache__/visualize.cpython-310.pyc
DELETED
Binary file (4.37 kB)
|
|
pipeline/fasttext_embedding.py
CHANGED
@@ -136,7 +136,7 @@ def train_fasttext_model(
|
|
136 |
return model
|
137 |
|
138 |
|
139 |
-
def
|
140 |
"""
|
141 |
Downloads (if necessary) and loads the official Facebook FastText Tibetan model.
|
142 |
|
@@ -424,6 +424,38 @@ def get_batch_embeddings(
|
|
424 |
return np.array(embeddings)
|
425 |
|
426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
def generate_embeddings(
|
428 |
texts: List[str],
|
429 |
model: fasttext.FastText._FastText,
|
|
|
136 |
return model
|
137 |
|
138 |
|
139 |
+
def _load_facebook_official_tibetan_model() -> Optional[fasttext.FastText._FastText]:
|
140 |
"""
|
141 |
Downloads (if necessary) and loads the official Facebook FastText Tibetan model.
|
142 |
|
|
|
424 |
return np.array(embeddings)
|
425 |
|
426 |
|
427 |
+
def get_model(model_id: str):
|
428 |
+
"""
|
429 |
+
Loads a FastText model based on the provided model ID.
|
430 |
+
|
431 |
+
Args:
|
432 |
+
model_id (str): The identifier for the model to load.
|
433 |
+
|
434 |
+
Returns:
|
435 |
+
Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded model and its type ('fasttext'),
|
436 |
+
or (None, None) if loading fails.
|
437 |
+
"""
|
438 |
+
logger.info(f"Attempting to load FastText model: {model_id}")
|
439 |
+
|
440 |
+
if model_id == "facebook-fasttext-pretrained":
|
441 |
+
try:
|
442 |
+
model = _load_facebook_official_tibetan_model()
|
443 |
+
if model:
|
444 |
+
logger.info(f"FastText model '{model_id}' loaded successfully.")
|
445 |
+
return model, "fasttext"
|
446 |
+
else:
|
447 |
+
logger.error(f"Model loading for '{model_id}' returned None.")
|
448 |
+
return None, None
|
449 |
+
except Exception as e:
|
450 |
+
logger.error(f"Failed to load FastText model '{model_id}': {e}", exc_info=True)
|
451 |
+
return None, None
|
452 |
+
# Add logic for other custom models here if needed
|
453 |
+
# elif model_id == "custom-model-name":
|
454 |
+
# ...
|
455 |
+
else:
|
456 |
+
logger.error(f"Unsupported model_id for get_model: '{model_id}'.")
|
457 |
+
return None, None
|
458 |
+
|
459 |
def generate_embeddings(
|
460 |
texts: List[str],
|
461 |
model: fasttext.FastText._FastText,
|
pipeline/hf_embedding.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from typing import List, Any, Optional, Tuple
|
3 |
+
import numpy as np
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
|
6 |
+
logger = logging.getLogger(__name__)
|
7 |
+
|
8 |
+
# Cache for loaded models
|
9 |
+
_model_cache = {}
|
10 |
+
|
11 |
+
def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[str]]:
|
12 |
+
"""
|
13 |
+
Loads a SentenceTransformer model from the Hugging Face Hub.
|
14 |
+
|
15 |
+
Args:
|
16 |
+
model_id (str): The identifier for the model to load (e.g., 'sentence-transformers/LaBSE').
|
17 |
+
|
18 |
+
Returns:
|
19 |
+
Tuple[Optional[SentenceTransformer], Optional[str]]: A tuple containing the loaded model and its type ('sentence-transformer'),
|
20 |
+
or (None, None) if loading fails.
|
21 |
+
"""
|
22 |
+
if model_id in _model_cache:
|
23 |
+
logger.info(f"Returning cached model: {model_id}")
|
24 |
+
return _model_cache[model_id], "sentence-transformer"
|
25 |
+
|
26 |
+
logger.info(f"Loading SentenceTransformer model: {model_id}")
|
27 |
+
try:
|
28 |
+
model = SentenceTransformer(model_id)
|
29 |
+
_model_cache[model_id] = model
|
30 |
+
logger.info(f"Model '{model_id}' loaded successfully.")
|
31 |
+
return model, "sentence-transformer"
|
32 |
+
except Exception as e:
|
33 |
+
logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
|
34 |
+
return None, None
|
35 |
+
|
36 |
+
def generate_embeddings(texts: List[str], model: SentenceTransformer) -> Optional[np.ndarray]:
|
37 |
+
"""
|
38 |
+
Generates embeddings for a list of texts using a SentenceTransformer model.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
texts (list[str]): A list of texts to embed.
|
42 |
+
model (SentenceTransformer): The loaded SentenceTransformer model.
|
43 |
+
|
44 |
+
Returns:
|
45 |
+
Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
|
46 |
+
"""
|
47 |
+
if not texts or not isinstance(model, SentenceTransformer):
|
48 |
+
logger.warning("Invalid input for generating embeddings. Texts list is empty or model is not a SentenceTransformer.")
|
49 |
+
return None
|
50 |
+
|
51 |
+
logger.info(f"Generating embeddings for {len(texts)} texts with {type(model).__name__}...")
|
52 |
+
try:
|
53 |
+
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
|
54 |
+
logger.info(f"Embeddings generated with shape: {embeddings.shape}")
|
55 |
+
return embeddings
|
56 |
+
except Exception as e:
|
57 |
+
logger.error(f"An unexpected error occurred during embedding generation: {e}", exc_info=True)
|
58 |
+
return None
|
pipeline/llm_service.py
CHANGED
@@ -26,7 +26,7 @@ except ImportError:
|
|
26 |
|
27 |
# Constants
|
28 |
DEFAULT_MAX_TOKENS = 4000
|
29 |
-
DEFAULT_MODEL = "
|
30 |
DEFAULT_TEMPERATURE = 0.3
|
31 |
DEFAULT_TOP_P = 0.9
|
32 |
|
@@ -373,58 +373,17 @@ class LLMService:
|
|
373 |
csv_data = df.to_csv(index=False)
|
374 |
|
375 |
# Create the prompt using the user's template
|
376 |
-
prompt = """
|
377 |
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
- TF-IDF Cosine Similarity: Term frequency-inverse document frequency comparison
|
384 |
-
The "Chapter" column indicates which chapter/section of the texts is being compared.
|
385 |
-
</CONTEXT>
|
386 |
|
387 |
-
|
388 |
-
1. Begin by identifying the specific texts being compared in the data (e.g., "Japan13.txt vs Dolanji.txt").
|
389 |
|
390 |
-
|
391 |
-
a) A high-level overview of text similarity patterns accessible to non-technical readers
|
392 |
-
b) A more detailed analysis for scholars interested in specific textual relationships
|
393 |
-
|
394 |
-
3. In your analysis:
|
395 |
-
- Summarize overall similarity patterns between the texts across all chapters
|
396 |
-
- Identify which chapters show strongest similarities and differences
|
397 |
-
- Explain whether similarities appear to be more lexical (Jaccard, LCS) or conceptual (Semantic)
|
398 |
-
- Interpret what these patterns might suggest about textual relationships, transmission, or variant histories
|
399 |
-
- Note any interesting anomalies (e.g., chapters with high semantic but low lexical similarity)
|
400 |
-
|
401 |
-
4. Structure your analysis with:
|
402 |
-
- An introduction explaining the texts compared and general observations
|
403 |
-
- A section on overall patterns across all chapters with visualized trends
|
404 |
-
- A detailed examination of 2-3 notable chapters (highest/lowest similarity)
|
405 |
-
- A discussion of what different metrics reveal about textual relationships
|
406 |
-
- A conclusion suggesting what these patterns might mean for Tibetan textual scholarship
|
407 |
-
- 2-3 specific questions these findings raise for further investigation
|
408 |
-
|
409 |
-
5. Connect your analysis to common interests in Tibetan textual studies such as:
|
410 |
-
- Textual transmission and lineages
|
411 |
-
- Regional variants and dialectical differences
|
412 |
-
- Potential historical relationships between texts
|
413 |
-
- Original vs. commentary material identification
|
414 |
-
|
415 |
-
6. Consider using a "family tree" analogy to make the textual relationships more intuitive. For example:
|
416 |
-
- Texts with very high similarity (>80%) might be described as "siblings" from the same direct source
|
417 |
-
- Texts with moderate similarity (50-80%) could be "cousins" sharing a common ancestor but with separate development
|
418 |
-
- Texts with low similarity (<50%) might be "distant relatives" with only fundamental connections
|
419 |
-
Use this metaphor if it helps clarify the relationships, but don't force it if another explanation would be clearer.
|
420 |
-
|
421 |
-
7. **Important note on perfect or zero similarity matches:**
|
422 |
-
If you notice that all metrics indicate perfect or near-perfect similarity (for example, scores of 1.0/100 across all metrics for a chapter) or 0 for a complete mismatch, this may not indicate true textual identity or lack thereof. Instead, it likely means both corresponding text cells were empty or contained no content. In these cases, be sure to clarify in your narrative that such results are *artifacts of missing data, not genuine textual matches*, and should be interpreted with caution.
|
423 |
-
|
424 |
-
8. Balance scholarly precision with accessibility, explaining technical concepts when necessary while keeping the overall narrative engaging for non-technical readers.
|
425 |
-
</INSTRUCTIONS>
|
426 |
-
|
427 |
-
Here is the CSV data to analyze:
|
428 |
[CSV_DATA]
|
429 |
"""
|
430 |
|
@@ -435,33 +394,7 @@ Here is the CSV data to analyze:
|
|
435 |
|
436 |
def _get_system_prompt(self) -> str:
|
437 |
"""Get the system prompt for the LLM."""
|
438 |
-
return """
|
439 |
-
You are a senior scholar of Tibetan Buddhist texts with expertise in textual criticism and
|
440 |
-
comparative analysis. Your task is to analyze the provided similarity metrics and provide
|
441 |
-
expert-level insights into the relationships between these Tibetan texts.
|
442 |
-
|
443 |
-
CRITICAL INSTRUCTIONS:
|
444 |
-
1. Your analysis MUST be grounded in the specific metrics provided
|
445 |
-
2. Always reference actual text names and metric values when making claims
|
446 |
-
3. Focus on what the data shows, not what it might show
|
447 |
-
4. Be precise and avoid vague or generic statements
|
448 |
-
|
449 |
-
ANALYSIS APPROACH:
|
450 |
-
1. Begin with a brief executive summary of the most significant findings
|
451 |
-
2. Group similar text pairs and explain their relationships
|
452 |
-
3. Highlight any patterns that suggest textual transmission or common sources
|
453 |
-
4. Note any anomalies or unexpected results that merit further investigation
|
454 |
-
5. Provide specific examples from the data to support your analysis
|
455 |
-
|
456 |
-
TIBETAN TEXT-SPECIFIC GUIDANCE:
|
457 |
-
- Consider the implications of shared vocabulary in the context of Tibetan Buddhist literature
|
458 |
-
- Be aware that high LCS scores might indicate shared liturgical or formulaic language
|
459 |
-
- Note that texts with similar Jaccard but different LCS scores might share content but differ in structure
|
460 |
-
- Consider the possibility of text reuse, commentary traditions, or shared sources
|
461 |
-
|
462 |
-
Your analysis should be scholarly but accessible, providing clear insights that would be
|
463 |
-
valuable to researchers studying these texts.
|
464 |
-
"""
|
465 |
|
466 |
def _call_openrouter_api(
|
467 |
self,
|
|
|
26 |
|
27 |
# Constants
|
28 |
DEFAULT_MAX_TOKENS = 4000
|
29 |
+
DEFAULT_MODEL = "moonshotai/kimi-k2:free"
|
30 |
DEFAULT_TEMPERATURE = 0.3
|
31 |
DEFAULT_TOP_P = 0.9
|
32 |
|
|
|
373 |
csv_data = df.to_csv(index=False)
|
374 |
|
375 |
# Create the prompt using the user's template
|
376 |
+
prompt = """Please analyze the following CSV data, which contains text similarity metrics for Tibetan texts.
|
377 |
|
378 |
+
Your analysis should be a clear, narrative explanation (800-1000 words) suitable for scholars of Tibetan studies. The analysis should cover:
|
379 |
+
1. An introduction to the texts being compared and a high-level overview of the similarity patterns.
|
380 |
+
2. A detailed examination of the most and least similar chapters, explaining whether the similarities are more lexical (Jaccard, LCS) or conceptual (Semantic).
|
381 |
+
3. An interpretation of what these patterns suggest about the texts' relationships, such as their transmission history or potential shared sources.
|
382 |
+
4. A conclusion that summarizes your findings and raises 2-3 questions for further scholarly investigation.
|
|
|
|
|
|
|
383 |
|
384 |
+
**Important:** If you find chapters with perfect (1.0 or 100%) or zero similarity, clarify that this may be due to empty or missing text segments, rather than a true textual match or mismatch.
|
|
|
385 |
|
386 |
+
Here is the CSV data:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
[CSV_DATA]
|
388 |
"""
|
389 |
|
|
|
394 |
|
395 |
def _get_system_prompt(self) -> str:
|
396 |
"""Get the system prompt for the LLM."""
|
397 |
+
return """You are a senior scholar of Tibetan Buddhist texts, specializing in textual criticism. Your task is to analyze the provided similarity metrics and provide expert insights into the relationships between these texts. Ground your analysis in the data, be precise, and focus on what the metrics reveal about the texts' transmission and history."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
def _call_openrouter_api(
|
400 |
self,
|
pipeline/metrics.py
CHANGED
@@ -3,7 +3,9 @@ import pandas as pd
|
|
3 |
from typing import List, Dict, Union
|
4 |
from itertools import combinations
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
-
from .
|
|
|
|
|
7 |
from .tokenize import tokenize_texts
|
8 |
import logging
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
@@ -99,20 +101,19 @@ def compute_semantic_similarity(
|
|
99 |
)
|
100 |
return None
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
)
|
112 |
|
113 |
if embedding is None or embedding.size == 0:
|
114 |
logger.error(
|
115 |
-
f"Failed to generate
|
116 |
)
|
117 |
return None
|
118 |
return embedding
|
|
|
3 |
from typing import List, Dict, Union
|
4 |
from itertools import combinations
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
from .fasttext_embedding import generate_embeddings as generate_fasttext_embeddings
|
7 |
+
from .hf_embedding import generate_embeddings as generate_hf_embeddings
|
8 |
+
|
9 |
from .tokenize import tokenize_texts
|
10 |
import logging
|
11 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
101 |
)
|
102 |
return None
|
103 |
|
104 |
+
if model_type == "fasttext":
|
105 |
+
embedding = generate_fasttext_embeddings(
|
106 |
+
texts=[raw_text_segment],
|
107 |
+
model=model_obj,
|
108 |
+
use_stopwords=use_stopwords_param,
|
109 |
+
use_lite_stopwords=use_lite_stopwords_param
|
110 |
+
)
|
111 |
+
elif model_type == "sentence-transformer":
|
112 |
+
embedding = generate_hf_embeddings(texts=[raw_text_segment], model=model_obj)
|
|
|
113 |
|
114 |
if embedding is None or embedding.size == 0:
|
115 |
logger.error(
|
116 |
+
f"Failed to generate embedding for text: {raw_text_segment[:100]}..."
|
117 |
)
|
118 |
return None
|
119 |
return embedding
|
pipeline/process.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
import pandas as pd
|
2 |
from typing import Dict, List, Tuple
|
3 |
from .metrics import compute_all_metrics
|
4 |
-
from .
|
5 |
-
from .
|
|
|
6 |
from .tokenize import tokenize_texts
|
7 |
import logging
|
8 |
from itertools import combinations
|
@@ -51,7 +52,7 @@ def process_texts(
|
|
51 |
text_data: Dict[str, str],
|
52 |
filenames: List[str],
|
53 |
enable_semantic: bool = True,
|
54 |
-
model_name: str = "
|
55 |
use_stopwords: bool = True,
|
56 |
use_lite_stopwords: bool = False,
|
57 |
progress_callback = None
|
@@ -65,7 +66,7 @@ def process_texts(
|
|
65 |
enable_semantic (bool, optional): Whether to compute semantic similarity metrics.
|
66 |
Requires loading a sentence transformer model, which can be time-consuming. Defaults to True.
|
67 |
model_name (str, optional): The name of the sentence transformer model to use for semantic similarity.
|
68 |
-
Must be a valid model identifier on Hugging Face. Defaults to "
|
69 |
use_stopwords (bool, optional): Whether to use stopwords in the metrics calculation. Defaults to True.
|
70 |
use_lite_stopwords (bool, optional): Whether to use the lite stopwords list (common particles only)
|
71 |
instead of the comprehensive list. Only applies if use_stopwords is True. Defaults to False.
|
@@ -101,57 +102,20 @@ def process_texts(
|
|
101 |
if enable_semantic:
|
102 |
logger.info("Semantic similarity enabled. Loading embedding model...")
|
103 |
try:
|
104 |
-
logger.info("Using model:
|
|
|
|
|
|
|
|
|
105 |
|
106 |
-
if
|
107 |
-
logger.info(f"
|
108 |
-
if progress_callback is not None:
|
109 |
-
try:
|
110 |
-
progress_callback(0.25, desc=f"Loading custom FastText model: {model_name}...")
|
111 |
-
except Exception as e:
|
112 |
-
logger.warning(f"Progress callback error (non-critical): {e}")
|
113 |
-
|
114 |
-
loaded_custom_model = load_fasttext_model(model_id=model_name) # model_id is expected to be path or key by this func
|
115 |
-
if loaded_custom_model:
|
116 |
-
model = loaded_custom_model
|
117 |
-
model_type = "fasttext"
|
118 |
-
logger.info(f"Custom FastText model '{model_name}' loaded successfully.")
|
119 |
-
if progress_callback is not None:
|
120 |
-
try:
|
121 |
-
progress_callback(0.3, desc=f"Custom FastText model '{model_name}' loaded.")
|
122 |
-
except Exception as e:
|
123 |
-
logger.warning(f"Progress callback error (non-critical): {e}")
|
124 |
-
else:
|
125 |
-
model_warning = f"Custom FastText model ('{model_name}') failed to load. Semantic similarity will be disabled."
|
126 |
-
logger.warning(model_warning)
|
127 |
-
enable_semantic = False
|
128 |
-
|
129 |
-
elif model_name == "facebook-fasttext-pretrained":
|
130 |
-
logger.info(f"Attempting to load Facebook FastText model: {model_name}")
|
131 |
if progress_callback is not None:
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
logger.warning(f"Progress callback error (non-critical): {e}")
|
136 |
-
|
137 |
-
fb_model, fb_model_type = get_model_and_device(model_id=model_name) # from semantic_embedding
|
138 |
-
if fb_model:
|
139 |
-
model = fb_model
|
140 |
-
model_type = fb_model_type # Should be "fasttext"
|
141 |
-
logger.info(f"Facebook FastText model '{model_name}' (type: {model_type}) loaded successfully.")
|
142 |
-
if progress_callback is not None:
|
143 |
-
try:
|
144 |
-
progress_callback(0.3, desc=f"Facebook FastText model '{model_name}' loaded.")
|
145 |
-
except Exception as e:
|
146 |
-
logger.warning(f"Progress callback error (non-critical): {e}")
|
147 |
-
else:
|
148 |
-
model_warning = f"Facebook FastText model ('{model_name}') failed to load. Semantic similarity will be disabled."
|
149 |
-
logger.warning(model_warning)
|
150 |
-
enable_semantic = False
|
151 |
-
|
152 |
-
else: # Any other model_name is unsupported
|
153 |
-
model_warning = f"Unsupported model_name: '{model_name}'. Semantic similarity will be disabled. Supported models are '{FASTTEXT_MODEL_ID}' and 'facebook-fasttext-pretrained'."
|
154 |
logger.warning(model_warning)
|
|
|
155 |
enable_semantic = False
|
156 |
if progress_callback is not None:
|
157 |
try:
|
@@ -320,8 +284,7 @@ def process_texts(
|
|
320 |
enable_semantic=enable_semantic,
|
321 |
model_type=model_type,
|
322 |
use_stopwords=use_stopwords,
|
323 |
-
use_lite_stopwords=use_lite_stopwords
|
324 |
-
fasttext_tokenize_fn=tokenizer_for_fasttext
|
325 |
)
|
326 |
|
327 |
# Rename 'Text Pair' to show file stems and chapter number
|
|
|
1 |
import pandas as pd
|
2 |
from typing import Dict, List, Tuple
|
3 |
from .metrics import compute_all_metrics
|
4 |
+
from .fasttext_embedding import get_model as get_fasttext_model
|
5 |
+
from .hf_embedding import get_model as get_hf_model
|
6 |
+
from .fasttext_embedding import load_fasttext_model
|
7 |
from .tokenize import tokenize_texts
|
8 |
import logging
|
9 |
from itertools import combinations
|
|
|
52 |
text_data: Dict[str, str],
|
53 |
filenames: List[str],
|
54 |
enable_semantic: bool = True,
|
55 |
+
model_name: str = "facebook-fasttext-pretrained",
|
56 |
use_stopwords: bool = True,
|
57 |
use_lite_stopwords: bool = False,
|
58 |
progress_callback = None
|
|
|
66 |
enable_semantic (bool, optional): Whether to compute semantic similarity metrics.
|
67 |
Requires loading a sentence transformer model, which can be time-consuming. Defaults to True.
|
68 |
model_name (str, optional): The name of the sentence transformer model to use for semantic similarity.
|
69 |
+
Must be a valid model identifier on Hugging Face. Defaults to "facebook-fasttext-pretrained".
|
70 |
use_stopwords (bool, optional): Whether to use stopwords in the metrics calculation. Defaults to True.
|
71 |
use_lite_stopwords (bool, optional): Whether to use the lite stopwords list (common particles only)
|
72 |
instead of the comprehensive list. Only applies if use_stopwords is True. Defaults to False.
|
|
|
102 |
if enable_semantic:
|
103 |
logger.info("Semantic similarity enabled. Loading embedding model...")
|
104 |
try:
|
105 |
+
logger.info(f"Using model: {model_name}")
|
106 |
+
if 'e5-base' in model_name or 'LaBSE' in model_name:
|
107 |
+
model, model_type = get_hf_model(model_id=model_name)
|
108 |
+
else:
|
109 |
+
model, model_type = get_fasttext_model(model_id=model_name)
|
110 |
|
111 |
+
if model:
|
112 |
+
logger.info(f"Model '{model_name}' (type: {model_type}) loaded successfully.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
if progress_callback is not None:
|
114 |
+
progress_callback(0.3, desc=f"Model '{model_name}' loaded.")
|
115 |
+
else:
|
116 |
+
model_warning = f"Model ('{model_name}') failed to load. Semantic similarity will be disabled."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
logger.warning(model_warning)
|
118 |
+
warning += f" {model_warning}"
|
119 |
enable_semantic = False
|
120 |
if progress_callback is not None:
|
121 |
try:
|
|
|
284 |
enable_semantic=enable_semantic,
|
285 |
model_type=model_type,
|
286 |
use_stopwords=use_stopwords,
|
287 |
+
use_lite_stopwords=use_lite_stopwords
|
|
|
288 |
)
|
289 |
|
290 |
# Rename 'Text Pair' to show file stems and chapter number
|
pipeline/semantic_embedding.py
DELETED
@@ -1,141 +0,0 @@
|
|
1 |
-
import logging
|
2 |
-
from typing import List, Any, Optional
|
3 |
-
import numpy as np # Added for type hinting Optional[np.ndarray]
|
4 |
-
|
5 |
-
# Configure logging
|
6 |
-
logging.basicConfig(
|
7 |
-
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
8 |
-
)
|
9 |
-
logger = logging.getLogger(__name__)
|
10 |
-
|
11 |
-
# Define the model ID for the Facebook FastText pretrained model
|
12 |
-
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
|
13 |
-
|
14 |
-
# FASTTEXT_MODEL_ID = "fasttext-tibetan" # Removed: Custom model loading to be handled in process.py directly
|
15 |
-
|
16 |
-
|
17 |
-
def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
|
18 |
-
"""
|
19 |
-
Loads the Facebook official pre-trained FastText model for Tibetan.
|
20 |
-
|
21 |
-
Args:
|
22 |
-
model_id (str): The model ID. Must be 'facebook-fasttext-pretrained' (DEFAULT_MODEL_NAME).
|
23 |
-
|
24 |
-
Returns:
|
25 |
-
Tuple[Optional[Any], Optional[str]]:
|
26 |
-
A tuple containing the loaded FastText model and its type ("fasttext"),
|
27 |
-
or (None, None) if loading fails or model_id is unsupported.
|
28 |
-
"""
|
29 |
-
logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)
|
30 |
-
|
31 |
-
if model_id == DEFAULT_MODEL_NAME: # DEFAULT_MODEL_NAME is "facebook-fasttext-pretrained"
|
32 |
-
try:
|
33 |
-
# Importing here to minimize issues if fasttext_embedding also imports from semantic_embedding
|
34 |
-
from .fasttext_embedding import load_facebook_official_tibetan_model
|
35 |
-
|
36 |
-
model = load_facebook_official_tibetan_model()
|
37 |
-
|
38 |
-
if model:
|
39 |
-
logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
|
40 |
-
try:
|
41 |
-
logger.info(f"Model dimensions: {model.get_dimension()}")
|
42 |
-
# Basic check for model validity via an expected attribute/method
|
43 |
-
if hasattr(model, 'get_word_vector'):
|
44 |
-
logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
|
45 |
-
except Exception as diag_e:
|
46 |
-
logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
|
47 |
-
return model, "fasttext"
|
48 |
-
else:
|
49 |
-
# This case implies load_facebook_official_tibetan_model returned None without raising an error.
|
50 |
-
logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
|
51 |
-
return None, None
|
52 |
-
except Exception as e:
|
53 |
-
logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
|
54 |
-
return None, None
|
55 |
-
else:
|
56 |
-
logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
|
57 |
-
return None, None
|
58 |
-
|
59 |
-
|
60 |
-
def generate_embeddings(texts: List[str], model: Any, tokenize_fn=None, use_stopwords: bool = True, use_lite_stopwords: bool = False, corpus_token_freq=None, doc_freq_map=None, total_docs_in_corpus=0) -> Optional[np.ndarray]:
|
61 |
-
"""
|
62 |
-
Generates FastText embeddings for a list of texts.
|
63 |
-
|
64 |
-
Args:
|
65 |
-
texts (list[str]): A list of texts to embed.
|
66 |
-
model: The loaded FastText model.
|
67 |
-
tokenize_fn: Optional tokenization function for FastText (if different from default botok based).
|
68 |
-
use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
|
69 |
-
use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
|
70 |
-
corpus_token_freq: Corpus-wide term frequencies for TF-IDF weighted FastText.
|
71 |
-
doc_freq_map: Document frequency map for TF-IDF weighted FastText.
|
72 |
-
total_docs_in_corpus: Total documents in corpus for TF-IDF weighted FastText.
|
73 |
-
|
74 |
-
Returns:
|
75 |
-
Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
|
76 |
-
"""
|
77 |
-
if not texts:
|
78 |
-
logger.warning(
|
79 |
-
"No texts provided to generate_embeddings. Returning None."
|
80 |
-
)
|
81 |
-
return None
|
82 |
-
|
83 |
-
logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
|
84 |
-
|
85 |
-
try:
|
86 |
-
from .fasttext_embedding import get_batch_embeddings
|
87 |
-
|
88 |
-
stopwords_set = None
|
89 |
-
if use_stopwords:
|
90 |
-
if use_lite_stopwords:
|
91 |
-
from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
|
92 |
-
stopwords_set = TIBETAN_STOPWORDS_LITE_SET
|
93 |
-
else:
|
94 |
-
from .stopwords_bo import TIBETAN_STOPWORDS_SET
|
95 |
-
stopwords_set = TIBETAN_STOPWORDS_SET
|
96 |
-
|
97 |
-
embeddings = get_batch_embeddings(
|
98 |
-
texts,
|
99 |
-
model,
|
100 |
-
tokenize_fn=tokenize_fn,
|
101 |
-
use_stopwords=use_stopwords,
|
102 |
-
stopwords_set=stopwords_set,
|
103 |
-
corpus_token_freq=corpus_token_freq,
|
104 |
-
doc_freq_map=doc_freq_map,
|
105 |
-
total_docs_in_corpus=total_docs_in_corpus
|
106 |
-
)
|
107 |
-
if embeddings is None:
|
108 |
-
logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
|
109 |
-
return None
|
110 |
-
|
111 |
-
logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
|
112 |
-
return embeddings
|
113 |
-
except ImportError:
|
114 |
-
logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
|
115 |
-
return None
|
116 |
-
except Exception as e:
|
117 |
-
logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
|
118 |
-
return None
|
119 |
-
|
120 |
-
|
121 |
-
def train_fasttext_model(corpus_texts: List[str], **kwargs):
|
122 |
-
"""
|
123 |
-
Train a FastText model on the provided corpus texts.
|
124 |
-
|
125 |
-
Args:
|
126 |
-
corpus_texts: List of texts to use for training
|
127 |
-
**kwargs: Additional parameters for training (dim, epoch, etc.)
|
128 |
-
|
129 |
-
Returns:
|
130 |
-
Trained model and path to the model file (Note: current implementation returns only model object)
|
131 |
-
""" # Docstring updated for return type
|
132 |
-
try:
|
133 |
-
from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft
|
134 |
-
|
135 |
-
corpus_path = prepare_corpus_file(corpus_texts)
|
136 |
-
model = train_ft(corpus_path=corpus_path, **kwargs)
|
137 |
-
|
138 |
-
return model # Returns model object, not path as previously suggested by older docstring
|
139 |
-
except ImportError:
|
140 |
-
logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
|
141 |
-
raise # Re-raising to signal critical failure if training components are missing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -12,6 +12,7 @@ numpy==1.26.4
|
|
12 |
scikit-learn==1.6.1
|
13 |
numba==0.61.2
|
14 |
fasttext==0.9.2
|
|
|
15 |
|
16 |
# Tibetan language processing
|
17 |
botok
|
|
|
12 |
scikit-learn==1.6.1
|
13 |
numba==0.61.2
|
14 |
fasttext==0.9.2
|
15 |
+
sentence-transformers==3.0.1
|
16 |
|
17 |
# Tibetan language processing
|
18 |
botok
|