Commit b44d470
Parent(s): b2ce320

bugs fixed

Files changed:
- app.py +27 -10
- pipeline/fast_lcs.pyx +29 -7
- pipeline/fasttext_embedding.py +54 -85
- pipeline/hf_embedding.py +21 -6
- pipeline/metrics.py +36 -20
- pipeline/process.py +10 -3
app.py CHANGED

@@ -5,6 +5,7 @@ from pipeline.visualize import generate_visualizations, generate_word_count_char
 from pipeline.llm_service import get_interpretation
 import logging
 import pandas as pd
+from datetime import datetime
 
 from dotenv import load_dotenv
 
@@ -14,8 +15,6 @@ load_dotenv()
 from theme import tibetan_theme
 
 logger = logging.getLogger(__name__)
-
-# Main interface logic
 def main_interface():
     with gr.Blocks(
         theme=tibetan_theme,
@@ -24,8 +23,9 @@ def main_interface():
     ) as demo:
         gr.Markdown(
             """# Tibetan Text Metrics Web App
-<span style='font-size:18px;'>A user-friendly web application for analyzing textual similarities and variations in Tibetan manuscripts, providing a graphical interface to the core functionalities of the [Tibetan Text Metrics (TTM)](https://github.com/daniel-wojahn/tibetan-text-metrics) project. Powered by
+<span style='font-size:18px;'>A user-friendly web application for analyzing textual similarities and variations in Tibetan manuscripts, providing a graphical interface to the core functionalities of the [Tibetan Text Metrics (TTM)](https://github.com/daniel-wojahn/tibetan-text-metrics) project. Powered by advanced language models via OpenRouter for in-depth text analysis.</span>
             """,
+
             elem_classes="gr-markdown",
         )
 
@@ -75,6 +75,21 @@ def main_interface():
                 info="Select the embedding model to use for semantic similarity analysis."
             )
 
+            with gr.Accordion("Advanced Options", open=False):
+                batch_size_slider = gr.Slider(
+                    minimum=1,
+                    maximum=64,
+                    value=8,
+                    step=1,
+                    label="Batch Size (for Hugging Face models)",
+                    info="Adjust based on your hardware (VRAM). Lower this if you encounter memory issues."
+                )
+                progress_bar_checkbox = gr.Checkbox(
+                    label="Show Embedding Progress Bar",
+                    value=False,
+                    info="Display a progress bar during embedding generation. Useful for large datasets."
+                )
+
             stopwords_dropdown = gr.Dropdown(
                 label="Stopword Filtering",
                 choices=[
@@ -258,7 +273,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
         warning_box = gr.Markdown(visible=False)
 
-        def run_pipeline(files, enable_semantic, model_name, stopwords_option
+        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, progress=gr.Progress()):
            """Run the text analysis pipeline on the uploaded files.
 
            Args:
@@ -389,12 +404,15 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                internal_model_id = "facebook-fasttext-pretrained"
 
            df_results, word_counts_df_data, warning_raw = process_texts(
-                text_data,
-
-
+                text_data,
+                filenames,
+                enable_semantic=enable_semantic_bool,
+                model_name=internal_model_id,
                use_stopwords=use_stopwords,
                use_lite_stopwords=use_lite_stopwords,
-                progress_callback=progress_tracker
+                progress_callback=progress_tracker,
+                batch_size=batch_size,
+                show_progress_bar=show_progress
            )
 
            if df_results.empty:
@@ -493,7 +511,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
            progress(1.0, desc="Analysis complete!")
 
            # Add a timestamp to the interpretation
-            from datetime import datetime
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
            interpretation = f"{interpretation}\n\n<small>Analysis generated on {timestamp}</small>"
            return interpretation
@@ -503,7 +520,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
        process_btn.click(
            fn=run_pipeline,
-            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown],
+            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown, batch_size_slider, progress_bar_checkbox],
            outputs=[
                csv_output,
                metrics_preview,
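The UI change above is a standard Gradio pattern: components declared inside an Accordion are handed to the click handler via inputs=[...]. The following is a minimal, self-contained sketch of that pattern only; the handler body is a placeholder, not the app's real run_pipeline:

# Sketch of the Gradio wiring pattern added in app.py above (placeholder handler).
import gradio as gr

def run_pipeline(enable_semantic, batch_size, show_progress, progress=gr.Progress()):
    # The real handler calls process_texts(); here we just echo the inputs.
    progress(1.0, desc="Done")
    return f"semantic={enable_semantic}, batch_size={int(batch_size)}, progress_bar={show_progress}"

with gr.Blocks() as demo:
    semantic_toggle = gr.Checkbox(label="Enable semantic similarity", value=True)
    with gr.Accordion("Advanced Options", open=False):
        batch_size_slider = gr.Slider(
            minimum=1, maximum=64, value=8, step=1,
            label="Batch Size (for Hugging Face models)",
        )
        progress_bar_checkbox = gr.Checkbox(label="Show Embedding Progress Bar", value=False)
    run_btn = gr.Button("Run")
    result_box = gr.Textbox(label="Result")
    run_btn.click(
        fn=run_pipeline,
        inputs=[semantic_toggle, batch_size_slider, progress_bar_checkbox],
        outputs=[result_box],
    )

if __name__ == "__main__":
    demo.launch()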
pipeline/fast_lcs.pyx CHANGED

@@ -1,4 +1,3 @@
-# fast_lcs.pyx
 import numpy as np
 
 cimport cython
@@ -8,16 +7,39 @@ cimport numpy as np
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def compute_lcs_fast(list words1, list words2):
+    """
+    Computes the Longest Common Subsequence (LCS) of two lists of words.
+
+    This implementation is memory-optimized and uses O(min(m, n)) space, where
+    m and n are the lengths of the word lists.
+
+    Args:
+        words1 (list): The first list of words.
+        words2 (list): The second list of words.
+
+    Returns:
+        int: The length of the Longest Common Subsequence.
+    """
     cdef int m = len(words1)
     cdef int n = len(words2)
-
+
+    # Ensure words2 is the shorter sequence to optimize memory usage
+    if m < n:
+        return compute_lcs_fast(words2, words1)
+
+    # We only need two rows for the DP table
+    cdef np.ndarray[np.int32_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
+    cdef np.ndarray[np.int32_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
     cdef int i, j
-
+
     for i in range(1, m + 1):
         for j in range(1, n + 1):
             if words1[i - 1] == words2[j - 1]:
-
+                curr_row[j] = prev_row[j - 1] + 1
             else:
-
-
-
+                curr_row[j] = max(prev_row[j], curr_row[j - 1])
+
+        # Copy current row to previous row for the next iteration
+        prev_row = curr_row.copy()
+
+    return int(prev_row[n])
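The heart of this change is the classic two-row dynamic-programming formulation of LCS length, which needs O(min(m, n)) memory instead of a full m-by-n table. Below is a pure-Python illustration of the same recurrence; the committed version is compiled Cython over typed NumPy buffers, and this sketch additionally swaps the two row buffers instead of copying:

# Pure-Python sketch of the two-row LCS length computation (illustrative only).
def lcs_length(words1, words2):
    # Keep the second sequence as the shorter one so the rows stay small.
    if len(words1) < len(words2):
        words1, words2 = words2, words1
    m, n = len(words1), len(words2)
    prev_row = [0] * (n + 1)
    curr_row = [0] * (n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if words1[i - 1] == words2[j - 1]:
                curr_row[j] = prev_row[j - 1] + 1
            else:
                curr_row[j] = max(prev_row[j], curr_row[j - 1])
        # Reuse the buffers rather than allocating or copying a new row.
        prev_row, curr_row = curr_row, prev_row
    return prev_row[n]

# Classic textbook example: LCS("ABCBDAB", "BDCABA") has length 4 (e.g. "BCBA").
assert lcs_length(list("ABCBDAB"), list("BDCABA")) == 4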
pipeline/fasttext_embedding.py CHANGED

@@ -66,13 +66,13 @@ def train_fasttext_model(
     Args:
         corpus_path: Path to the corpus file
         model_path: Path where to save the trained model
-        dim: Embedding dimension (default:
-        epoch: Number of training epochs (default:
-        min_count: Minimum count of words (default:
+        dim: Embedding dimension (default: 100)
+        epoch: Number of training epochs (default: 5)
+        min_count: Minimum count of words (default: 5)
         window: Size of context window (default: 5)
         minn: Minimum length of char n-gram (default: 3)
         maxn: Maximum length of char n-gram (default: 6)
-        neg: Number of negatives in negative sampling (default:
+        neg: Number of negatives in negative sampling (default: 5)
         model_type: FastText model type ('skipgram' or 'cbow')
 
     Returns:
@@ -83,56 +83,48 @@ def train_fasttext_model(
     logger.info("Training FastText model with %s, dim=%d, epoch=%d, window=%d, minn=%d, maxn=%d...",
                 model_type, dim, epoch, window, minn, maxn)
 
-
-
+    processed_corpus_path = corpus_path + ".processed"
+    corpus_to_train = corpus_path
+    model = None
+
     try:
-
-
-
-
-
-
-
-
-
-
+        # Preprocess the corpus to a temporary file
+        with open(corpus_path, 'r', encoding='utf-8') as f_in, open(processed_corpus_path, 'w', encoding='utf-8') as f_out:
+            content = f_in.read()
+            processed_content = content.replace('་', '་ ')
+            f_out.write(processed_content)
+        logger.info("Corpus preprocessed to temporary file for Tibetan syllable segmentation.")
+        corpus_to_train = processed_corpus_path
+
+        # Train the model with optimized parameters
+        if model_type == "skipgram":
+            model = fasttext.train_unsupervised(
+                corpus_to_train,
+                model="skipgram",
+                dim=dim, epoch=epoch, minCount=min_count, wordNgrams=1,
+                minn=minn, maxn=maxn, neg=neg, window=window
+            )
+        else: # cbow
+            model = fasttext.train_unsupervised(
+                corpus_to_train,
+                model="cbow",
+                dim=dim, epoch=epoch, minCount=min_count, wordNgrams=1,
+                minn=minn, maxn=maxn, neg=neg, window=window
+            )
 
-
+        model.save_model(model_path)
+        logger.info("FastText model trained and saved to %s", model_path)
+
     except Exception as e:
-        logger.
-
-
-
-
-
-
-
-
-            minCount=min_count,
-            wordNgrams=1,
-            minn=minn,
-            maxn=maxn,
-            neg=neg,
-            window=window
-        )
-    else: # cbow
-        model = fasttext.train_unsupervised(
-            corpus_path,
-            model="cbow",
-            dim=dim,
-            epoch=epoch,
-            minCount=min_count,
-            wordNgrams=1,
-            minn=minn,
-            maxn=maxn,
-            neg=neg,
-            window=window
-        )
-
-    # Save the model
-    model.save_model(model_path)
-    logger.info("FastText model trained and saved to %s", model_path)
-
+        logger.error(f"An error occurred during model training: {e}", exc_info=True)
+        # Re-raise the exception after logging and cleanup
+        raise
+    finally:
+        # Clean up the temporary processed file
+        if os.path.exists(processed_corpus_path):
+            os.remove(processed_corpus_path)
+            logger.info(f"Cleaned up temporary file: {processed_corpus_path}")
+
     return model
 
 
@@ -200,6 +192,17 @@ def load_fasttext_model(model_path: str = DEFAULT_MODEL_PATH) -> Optional[fastte
         return None
 
 
+def _remove_stopwords_from_tokens(tokens: List[str], stopwords_set: Set[str]) -> List[str]:
+    """
+    Removes stopwords from a list of tokens using a list comprehension for efficiency.
+    Handles Tibetan punctuation by checking both the token itself and the token after
+    stripping trailing '།' or '༔'.
+    """
+    if not stopwords_set:
+        return tokens
+    return [token for token in tokens if token not in stopwords_set and token.rstrip('།༔') not in stopwords_set]
+
+
 def get_text_embedding(
     text: str,
     model: fasttext.FastText._FastText,
@@ -248,40 +251,6 @@ def get_text_embedding(
     if use_stopwords and stopwords_set:
         logger.debug(f"Original tokens before stopword check (first 20): {tokens[:20]}")
         original_token_count = len(tokens)
-
-        def _remove_stopwords_from_tokens(tokens: List[str], stopwords_set: Set[str]) -> List[str]:
-            """
-            Removes stopwords from a list of tokens.
-            Handles Tibetan punctuation by checking both the token itself and the token after
-            stripping trailing '།' or '༔'.
-            """
-            cleaned_tokens = []
-            removed_count = 0
-            for token in tokens:
-                # 1. Check if the original token itself is a stopword (e.g., standalone '།')
-                if token in stopwords_set:
-                    removed_count += 1
-                    continue  # Skip this token
-
-                # 2. If not a direct stopword, check if it becomes one after stripping trailing punctuation
-                # This handles cases like "གྲུབ་པའི་།" where "གྲུབ་པའི་" is the stopword.
-                token_for_check = token
-                punctuation_was_stripped = False
-                if token.endswith(('།', '༔')):
-                    stripped_token = token.rstrip('།༔')
-                    if stripped_token != token:  # Check if stripping actually changed the token
-                        token_for_check = stripped_token
-                        punctuation_was_stripped = True
-
-                if punctuation_was_stripped and token_for_check in stopwords_set:
-                    removed_count += 1
-                    continue  # Skip this token
-
-                # 3. If neither the original token nor its base form is a stopword, keep it.
-                cleaned_tokens.append(token)
-
-            return cleaned_tokens
-
         tokens = _remove_stopwords_from_tokens(tokens, stopwords_set)
         removed_count = original_token_count - len(tokens)
         logger.debug(f"Tokens after stopword removal (removed {removed_count}): {tokens[:20]}")
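Two details of this rewrite are easy to miss: the training corpus is pre-segmented by inserting a space after every tsek ('་') so whitespace tokenization yields syllables, and the new module-level stopword filter also drops tokens that only differ from a stopword by a trailing '།' or '༔'. A standalone illustration of both behaviours follows; the stopword set here is a made-up example, not the project's real list:

# Standalone sketch of the two text-handling steps from the diff above.
def segment_tibetan_syllables(text: str) -> str:
    """Insert a space after each tsek so whitespace tokenization yields syllables."""
    return text.replace('་', '་ ')

def remove_stopwords(tokens, stopwords_set):
    """Drop tokens that are stopwords as-is or after stripping trailing '།'/'༔'."""
    if not stopwords_set:
        return tokens
    return [t for t in tokens
            if t not in stopwords_set and t.rstrip('།༔') not in stopwords_set]

sample = "བཀྲ་ཤིས་བདེ་ལེགས།"
print(segment_tibetan_syllables(sample))                      # 'བཀྲ་ ཤིས་ བདེ་ ལེགས།'
print(remove_stopwords(["ནི", "བདེ་ལེགས", "ནི།"], {"ནི"}))      # ['བདེ་ལེགས']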
pipeline/hf_embedding.py CHANGED

@@ -33,26 +33,41 @@ def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[st
         logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
         return None, None
 
-def generate_embeddings(
+def generate_embeddings(
+    texts: List[str],
+    model: SentenceTransformer,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
+) -> np.ndarray:
     """
     Generates embeddings for a list of texts using a SentenceTransformer model.
 
     Args:
         texts (list[str]): A list of texts to embed.
         model (SentenceTransformer): The loaded SentenceTransformer model.
+        batch_size (int): The batch size for encoding.
+        show_progress_bar (bool): Whether to display a progress bar.
 
     Returns:
-
+        np.ndarray: A numpy array containing the embeddings. Returns an empty array of the correct shape on failure.
     """
     if not texts or not isinstance(model, SentenceTransformer):
-        logger.warning("Invalid input for generating embeddings.
-
+        logger.warning("Invalid input for generating embeddings. Returning empty array.")
+        # Return a correctly shaped empty array
+        embedding_dim = model.get_sentence_embedding_dimension() if isinstance(model, SentenceTransformer) else 768  # Fallback
+        return np.zeros((len(texts), embedding_dim))
 
     logger.info(f"Generating embeddings for {len(texts)} texts with {type(model).__name__}...")
     try:
-        embeddings = model.encode(
+        embeddings = model.encode(
+            texts,
+            batch_size=batch_size,
+            convert_to_numpy=True,
+            show_progress_bar=show_progress_bar
+        )
         logger.info(f"Embeddings generated with shape: {embeddings.shape}")
         return embeddings
     except Exception as e:
         logger.error(f"An unexpected error occurred during embedding generation: {e}", exc_info=True)
-
+        embedding_dim = model.get_sentence_embedding_dimension()
+        return np.zeros((len(texts), embedding_dim))

pipeline/metrics.py CHANGED

@@ -55,16 +55,18 @@ def compute_normalized_lcs(words1: List[str], words2: List[str]) -> float:
 def compute_semantic_similarity(
     text1_segment: str,
     text2_segment: str,
-    tokens1: List[str],
-    tokens2: List[str],
-    model,
-    model_type: str = "fasttext",
+    tokens1: List[str],
+    tokens2: List[str],
+    model,
+    model_type: str = "fasttext",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
     fasttext_tokenize_fn=None,
     term_freq_corpus=None,
     doc_freq_map=None,
-    total_docs_in_corpus=0
+    total_docs_in_corpus=0,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> float:
     """Computes semantic similarity using a FastText model."""
     if model_type != "fasttext":
@@ -92,7 +94,9 @@ def compute_semantic_similarity(
         tokenize_fn_param,
         term_freq_corpus_param,
         doc_freq_map_param,
-        total_docs_in_corpus_param
+        total_docs_in_corpus_param,
+        batch_size_param: int,
+        show_progress_bar_param: bool
     ) -> Union[np.ndarray, None]:
         """Helper to get a single embedding for a text using FastText."""
         if not raw_text_segment.strip():
@@ -109,7 +113,12 @@
                 use_lite_stopwords=use_lite_stopwords_param
             )
         elif model_type == "sentence-transformer":
-            embedding = generate_hf_embeddings(
+            embedding = generate_hf_embeddings(
+                texts=[raw_text_segment],
+                model=model_obj,
+                batch_size=batch_size_param,
+                show_progress_bar=show_progress_bar_param
+            )
 
         if embedding is None or embedding.size == 0:
             logger.error(
@@ -120,12 +129,12 @@
 
     try:
         # Pass all relevant parameters to _get_aggregated_embedding
-        emb1 = _get_aggregated_embedding(text1_segment, tokens1, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus)
-        emb2 = _get_aggregated_embedding(text2_segment, tokens2, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus)
+        emb1 = _get_aggregated_embedding(text1_segment, tokens1, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus, batch_size, show_progress_bar)
+        emb2 = _get_aggregated_embedding(text2_segment, tokens2, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus, batch_size, show_progress_bar)
 
         if emb1 is None or emb2 is None or emb1.size == 0 or emb2.size == 0:
             logger.error(
-                "Failed to obtain one or both
+                "Failed to obtain one or both embeddings for semantic similarity."
             )
             return np.nan
 
@@ -135,16 +144,16 @@
 
         # Handle cases where embeddings are all zeros
         if np.all(emb1 == 0) and np.all(emb2 == 0):
-            logger.info("Both
+            logger.info("Both embeddings are zero. Semantic similarity is 0.0.")
             return 0.0
         if np.all(emb1 == 0) or np.all(emb2 == 0):
-            logger.info("One of the
+            logger.info("One of the embeddings is zero. Semantic similarity is 0.0.")
             return 0.0
 
         # Handle NaN or Inf in embeddings
        if np.isnan(emb1).any() or np.isinf(emb1).any() or \
           np.isnan(emb2).any() or np.isinf(emb2).any():
-            logger.warning("NaN or Inf found in
+            logger.warning("NaN or Inf found in embeddings. Semantic similarity set to 0.0.")
             return 0.0
 
         # Ensure embeddings are 2D for cosine_similarity: [1, dim]
@@ -159,17 +168,22 @@
         safe_text1 = str(text1_segment)[:100] if text1_segment is not None else "N/A"
         safe_text2 = str(text2_segment)[:100] if text2_segment is not None else "N/A"
         logger.error(
-            f"Error during
+            f"Error during semantic similarity calculation:\nText1: {safe_text1}...\nText2: {safe_text2}...\nError: {e}"
         )
-        logger.exception("Traceback for
+        logger.exception("Traceback for semantic similarity calculation error:")
         return np.nan
 
 
 def compute_all_metrics(
-    texts: Dict[str, str],
-
+    texts: Dict[str, str],
+    model=None,
+    enable_semantic: bool = True,
+    model_type: str = "fasttext",
+    use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
-    fasttext_tokenize_fn=None
+    fasttext_tokenize_fn=None,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> pd.DataFrame:
     """
     Computes all selected similarity metrics between pairs of texts.
@@ -318,11 +332,13 @@
         if enable_semantic:
             # Pass raw texts and their pre-computed botok tokens
             semantic_sim = compute_semantic_similarity(
-                texts[f1], texts[f2], words1_raw, words2_raw, model, model_type, use_stopwords, use_lite_stopwords,
+                texts[f1], texts[f2], words1_raw, words2_raw, model, model_type, use_stopwords, use_lite_stopwords,
                 fasttext_tokenize_fn=fasttext_tokenize_fn,
                 term_freq_corpus=term_freq_corpus_for_fasttext if model_type == "fasttext" else None,
                 doc_freq_map=document_frequency_map_for_fasttext if model_type == "fasttext" else None,
-                total_docs_in_corpus=total_num_documents_for_fasttext if model_type == "fasttext" else 0
+                total_docs_in_corpus=total_num_documents_for_fasttext if model_type == "fasttext" else 0,
+                batch_size=batch_size,
+                show_progress_bar=show_progress_bar
             )
         else:
             semantic_sim = np.nan

pipeline/process.py CHANGED

@@ -55,7 +55,9 @@ def process_texts(
     model_name: str = "facebook-fasttext-pretrained",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
-    progress_callback = None
+    progress_callback = None,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
     """
     Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
@@ -279,12 +281,17 @@
                 logger.info("Using botok word-level tokenization for FastText model.")
 
             pair_metrics = compute_all_metrics(
-                {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
+                texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
+                token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
+                metrics_to_compute=["jaccard", "lcs", "tfidf"],
                 model=model,
                 enable_semantic=enable_semantic,
                 model_type=model_type,
                 use_stopwords=use_stopwords,
-                use_lite_stopwords=use_lite_stopwords
+                use_lite_stopwords=use_lite_stopwords,
+                fasttext_tokenize_fn=tokenizer_for_fasttext,
+                batch_size=batch_size,
+                show_progress_bar=show_progress_bar
            )
 
            # Rename 'Text Pair' to show file stems and chapter number
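To make the new batch_size and show_progress_bar parameters concrete, here is a short sketch of the batched encoding that generate_embeddings() in pipeline/hf_embedding.py wraps. The model id below is an arbitrary public example, not necessarily the one the app resolves:

# Sketch: batched encoding with sentence-transformers (example model id).
import numpy as np
from sentence_transformers import SentenceTransformer

texts = ["བཀྲ་ཤིས་བདེ་ལེགས།", "ཆོས་ཀྱི་རྒྱལ་པོ།"]
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

embeddings = model.encode(
    texts,
    batch_size=8,              # smaller batches need less VRAM
    convert_to_numpy=True,
    show_progress_bar=True,    # useful for large corpora
)
print(embeddings.shape)        # (2, model.get_sentence_embedding_dimension())
assert isinstance(embeddings, np.ndarray)

And a compact restatement of the defensive pattern that pipeline/metrics.py applies before trusting an embedding pair: zero-norm and non-finite vectors are mapped to 0.0 instead of letting cosine similarity produce NaN. This is illustrative only, not the module's exact code path:

# Sketch of the guarded cosine-similarity used around compute_semantic_similarity().
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def safe_cosine(emb1: np.ndarray, emb2: np.ndarray) -> float:
    # Zero vectors have no direction; define the similarity as 0.0 instead of NaN.
    if np.all(emb1 == 0) or np.all(emb2 == 0):
        return 0.0
    # NaN/Inf would poison the result; treat them as "no reliable signal".
    if not (np.isfinite(emb1).all() and np.isfinite(emb2).all()):
        return 0.0
    # cosine_similarity expects 2D inputs of shape [1, dim].
    return float(cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0, 0])

print(safe_cosine(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # ~0.7071
print(safe_cosine(np.zeros(2), np.array([1.0, 1.0])))           # 0.0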