daniel-wojahn committed
Commit b44d470 · 1 Parent(s): b2ce320

bugs fixed
app.py CHANGED
@@ -5,6 +5,7 @@ from pipeline.visualize import generate_visualizations, generate_word_count_char
 from pipeline.llm_service import get_interpretation
 import logging
 import pandas as pd
+from datetime import datetime
 
 from dotenv import load_dotenv
 
@@ -14,8 +15,6 @@ load_dotenv()
 from theme import tibetan_theme
 
 logger = logging.getLogger(__name__)
-
-# Main interface logic
 def main_interface():
     with gr.Blocks(
         theme=tibetan_theme,
@@ -24,8 +23,9 @@ def main_interface():
     ) as demo:
         gr.Markdown(
             """# Tibetan Text Metrics Web App
-<span style='font-size:18px;'>A user-friendly web application for analyzing textual similarities and variations in Tibetan manuscripts, providing a graphical interface to the core functionalities of the [Tibetan Text Metrics (TTM)](https://github.com/daniel-wojahn/tibetan-text-metrics) project. Powered by Mistral 7B via OpenRouter for advanced text analysis.</span>
+<span style='font-size:18px;'>A user-friendly web application for analyzing textual similarities and variations in Tibetan manuscripts, providing a graphical interface to the core functionalities of the [Tibetan Text Metrics (TTM)](https://github.com/daniel-wojahn/tibetan-text-metrics) project. Powered by advanced language models via OpenRouter for in-depth text analysis.</span>
             """,
+
             elem_classes="gr-markdown",
         )
 
@@ -75,6 +75,21 @@ def main_interface():
                     info="Select the embedding model to use for semantic similarity analysis."
                 )
 
+                with gr.Accordion("Advanced Options", open=False):
+                    batch_size_slider = gr.Slider(
+                        minimum=1,
+                        maximum=64,
+                        value=8,
+                        step=1,
+                        label="Batch Size (for Hugging Face models)",
+                        info="Adjust based on your hardware (VRAM). Lower this if you encounter memory issues."
+                    )
+                    progress_bar_checkbox = gr.Checkbox(
+                        label="Show Embedding Progress Bar",
+                        value=False,
+                        info="Display a progress bar during embedding generation. Useful for large datasets."
+                    )
+
                 stopwords_dropdown = gr.Dropdown(
                     label="Stopword Filtering",
                     choices=[
@@ -258,7 +273,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
        warning_box = gr.Markdown(visible=False)
 
-        def run_pipeline(files, enable_semantic, model_name, stopwords_option="Aggressive (All function words)", progress=gr.Progress()):
+        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, progress=gr.Progress()):
            """Run the text analysis pipeline on the uploaded files.
 
            Args:
@@ -389,12 +404,15 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                internal_model_id = "facebook-fasttext-pretrained"
 
            df_results, word_counts_df_data, warning_raw = process_texts(
-                text_data, filenames,
-                enable_semantic=enable_semantic_bool,
-                model_name=internal_model_id,  # Use the mapped internal ID
+                text_data,
+                filenames,
+                enable_semantic=enable_semantic_bool,
+                model_name=internal_model_id,
                use_stopwords=use_stopwords,
                use_lite_stopwords=use_lite_stopwords,
-                progress_callback=progress_tracker
+                progress_callback=progress_tracker,
+                batch_size=batch_size,
+                show_progress_bar=show_progress
            )
 
            if df_results.empty:
@@ -493,7 +511,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
            progress(1.0, desc="Analysis complete!")
 
            # Add a timestamp to the interpretation
-            from datetime import datetime
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
            interpretation = f"{interpretation}\n\n<small>Analysis generated on {timestamp}</small>"
            return interpretation
@@ -503,7 +520,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
        process_btn.click(
            fn=run_pipeline,
-            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown],
+            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown, batch_size_slider, progress_bar_checkbox],
            outputs=[
                csv_output,
                metrics_preview,
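
The two new controls only reach the pipeline because they are appended to the inputs list of process_btn.click: Gradio passes each listed component's current value to the callback positionally, so batch_size_slider and progress_bar_checkbox line up with the new batch_size and show_progress parameters of run_pipeline. A minimal, self-contained sketch of that wiring (the component set and choice values are simplified placeholders and the callback body is a stub, not the app's actual code):

import gradio as gr

def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress):
    # Stub body: the real app forwards these values to pipeline.process.process_texts.
    return f"batch_size={batch_size}, show_progress={show_progress}"

with gr.Blocks() as demo:
    file_input = gr.File(file_count="multiple", label="Upload files")
    semantic_toggle_radio = gr.Radio(["Yes", "No"], value="Yes", label="Semantic similarity")
    model_dropdown = gr.Dropdown(["default"], value="default", label="Model")
    stopwords_dropdown = gr.Dropdown(["default"], value="default", label="Stopword Filtering")
    batch_size_slider = gr.Slider(1, 64, value=8, step=1, label="Batch Size (for Hugging Face models)")
    progress_bar_checkbox = gr.Checkbox(value=False, label="Show Embedding Progress Bar")
    result_box = gr.Textbox(label="Result")
    gr.Button("Run Analysis").click(
        fn=run_pipeline,
        inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown,
                batch_size_slider, progress_bar_checkbox],
        outputs=[result_box],
    )

if __name__ == "__main__":
    demo.launch()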
pipeline/fast_lcs.pyx CHANGED
@@ -1,4 +1,3 @@
-# fast_lcs.pyx
 import numpy as np
 
 cimport cython
@@ -8,16 +7,39 @@ cimport numpy as np
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def compute_lcs_fast(list words1, list words2):
+    """
+    Computes the Longest Common Subsequence (LCS) of two lists of words.
+
+    This implementation is memory-optimized and uses O(min(m, n)) space, where
+    m and n are the lengths of the word lists.
+
+    Args:
+        words1 (list): The first list of words.
+        words2 (list): The second list of words.
+
+    Returns:
+        int: The length of the Longest Common Subsequence.
+    """
     cdef int m = len(words1)
     cdef int n = len(words2)
-    cdef np.ndarray[np.int32_t, ndim=2] dp = np.zeros((m + 1, n + 1), dtype=np.int32)
+
+    # Ensure words2 is the shorter sequence to optimize memory usage
+    if m < n:
+        return compute_lcs_fast(words2, words1)
+
+    # We only need two rows for the DP table
+    cdef np.ndarray[np.int32_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
+    cdef np.ndarray[np.int32_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
     cdef int i, j
-
+
     for i in range(1, m + 1):
         for j in range(1, n + 1):
             if words1[i - 1] == words2[j - 1]:
-                dp[i, j] = dp[i - 1, j - 1] + 1
+                curr_row[j] = prev_row[j - 1] + 1
             else:
-                dp[i, j] = max(dp[i - 1, j], dp[i, j - 1])
-
-    return int(dp[m, n])
+                curr_row[j] = max(prev_row[j], curr_row[j - 1])
+
+        # Copy current row to previous row for the next iteration
+        prev_row = curr_row.copy()
+
+    return int(prev_row[n])
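
For reference, the same two-row recurrence in plain Python (an illustrative sketch, not part of the repository; the compiled Cython function above is what the pipeline calls). Swapping the row buffers instead of copying gives the same result with one less allocation per row:

def lcs_length(words1, words2):
    # Keep the DP rows sized by the shorter sequence: O(min(m, n)) memory.
    if len(words1) < len(words2):
        words1, words2 = words2, words1
    n = len(words2)
    prev_row = [0] * (n + 1)
    curr_row = [0] * (n + 1)
    for w1 in words1:
        for j in range(1, n + 1):
            if w1 == words2[j - 1]:
                curr_row[j] = prev_row[j - 1] + 1
            else:
                curr_row[j] = max(prev_row[j], curr_row[j - 1])
        prev_row, curr_row = curr_row, prev_row  # reuse buffers instead of copying
    return prev_row[-1]

assert lcs_length(list("ABCBDAB"), list("BDCABA")) == 4  # classic textbook example, e.g. "BCBA"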
pipeline/fasttext_embedding.py CHANGED
@@ -66,13 +66,13 @@ def train_fasttext_model(
     Args:
         corpus_path: Path to the corpus file
         model_path: Path where to save the trained model
-        dim: Embedding dimension (default: 300)
-        epoch: Number of training epochs (default: 15)
-        min_count: Minimum count of words (default: 3)
+        dim: Embedding dimension (default: 100)
+        epoch: Number of training epochs (default: 5)
+        min_count: Minimum count of words (default: 5)
         window: Size of context window (default: 5)
         minn: Minimum length of char n-gram (default: 3)
         maxn: Maximum length of char n-gram (default: 6)
-        neg: Number of negatives in negative sampling (default: 10)
+        neg: Number of negatives in negative sampling (default: 5)
         model_type: FastText model type ('skipgram' or 'cbow')
 
     Returns:
@@ -83,56 +83,48 @@
     logger.info("Training FastText model with %s, dim=%d, epoch=%d, window=%d, minn=%d, maxn=%d...",
                 model_type, dim, epoch, window, minn, maxn)
 
-    # Preprocess corpus for Tibetan - segment by syllable points
-    # This is based on research showing syllable segmentation works better for Tibetan
+    processed_corpus_path = corpus_path + ".processed"
+    corpus_to_train = corpus_path
+    model = None
+
     try:
-        with open(corpus_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-
-        # Ensure syllable segmentation by adding spaces after Tibetan syllable markers (if not already present)
-        # This improves model quality for Tibetan text according to research
-        processed_content = content.replace('་', '་ ')
-
-        # Write back the processed content
-        with open(corpus_path, 'w', encoding='utf-8') as f:
-            f.write(processed_content)
+        # Preprocess the corpus to a temporary file
+        with open(corpus_path, 'r', encoding='utf-8') as f_in, open(processed_corpus_path, 'w', encoding='utf-8') as f_out:
+            content = f_in.read()
+            processed_content = content.replace('་', '་ ')
+            f_out.write(processed_content)
+        logger.info("Corpus preprocessed to temporary file for Tibetan syllable segmentation.")
+        corpus_to_train = processed_corpus_path
+
+        # Train the model with optimized parameters
+        if model_type == "skipgram":
+            model = fasttext.train_unsupervised(
+                corpus_to_train,
+                model="skipgram",
+                dim=dim, epoch=epoch, minCount=min_count, wordNgrams=1,
+                minn=minn, maxn=maxn, neg=neg, window=window
+            )
+        else:  # cbow
+            model = fasttext.train_unsupervised(
+                corpus_to_train,
+                model="cbow",
+                dim=dim, epoch=epoch, minCount=min_count, wordNgrams=1,
+                minn=minn, maxn=maxn, neg=neg, window=window
+            )
 
-        logger.info("Preprocessed corpus with syllable segmentation for Tibetan text")
+        model.save_model(model_path)
+        logger.info("FastText model trained and saved to %s", model_path)
+
     except Exception as e:
-        logger.warning("Could not preprocess corpus for syllable segmentation: %s", str(e))
-
-    # Train the model with optimized parameters
-    if model_type == "skipgram":
-        model = fasttext.train_unsupervised(
-            corpus_path,
-            model="skipgram",
-            dim=dim,
-            epoch=epoch,
-            minCount=min_count,
-            wordNgrams=1,
-            minn=minn,
-            maxn=maxn,
-            neg=neg,
-            window=window
-        )
-    else:  # cbow
-        model = fasttext.train_unsupervised(
-            corpus_path,
-            model="cbow",
-            dim=dim,
-            epoch=epoch,
-            minCount=min_count,
-            wordNgrams=1,
-            minn=minn,
-            maxn=maxn,
-            neg=neg,
-            window=window
-        )
-
-    # Save the model
-    model.save_model(model_path)
-    logger.info("FastText model trained and saved to %s", model_path)
-
+        logger.error(f"An error occurred during model training: {e}", exc_info=True)
+        # Re-raise the exception after logging and cleanup
+        raise
+    finally:
+        # Clean up the temporary processed file
+        if os.path.exists(processed_corpus_path):
+            os.remove(processed_corpus_path)
+            logger.info(f"Cleaned up temporary file: {processed_corpus_path}")
+
     return model
 
 
@@ -200,6 +192,17 @@ def load_fasttext_model(model_path: str = DEFAULT_MODEL_PATH) -> Optional[fastte
     return None
 
 
+def _remove_stopwords_from_tokens(tokens: List[str], stopwords_set: Set[str]) -> List[str]:
+    """
+    Removes stopwords from a list of tokens using a list comprehension for efficiency.
+    Handles Tibetan punctuation by checking both the token itself and the token after
+    stripping trailing '།' or '༔'.
+    """
+    if not stopwords_set:
+        return tokens
+    return [token for token in tokens if token not in stopwords_set and token.rstrip('།༔') not in stopwords_set]
+
+
 def get_text_embedding(
     text: str,
     model: fasttext.FastText._FastText,
@@ -248,40 +251,6 @@ def get_text_embedding(
     if use_stopwords and stopwords_set:
         logger.debug(f"Original tokens before stopword check (first 20): {tokens[:20]}")
         original_token_count = len(tokens)
-
-        def _remove_stopwords_from_tokens(tokens: List[str], stopwords_set: Set[str]) -> List[str]:
-            """
-            Removes stopwords from a list of tokens.
-            Handles Tibetan punctuation by checking both the token itself and the token after
-            stripping trailing '།' or '༔'.
-            """
-            cleaned_tokens = []
-            removed_count = 0
-            for token in tokens:
-                # 1. Check if the original token itself is a stopword (e.g., standalone '།')
-                if token in stopwords_set:
-                    removed_count += 1
-                    continue  # Skip this token
-
-                # 2. If not a direct stopword, check if it becomes one after stripping trailing punctuation
-                # This handles cases like "གྲུབ་པའི་།" where "གྲུབ་པའི་" is the stopword.
-                token_for_check = token
-                punctuation_was_stripped = False
-                if token.endswith(('།', '༔')):
-                    stripped_token = token.rstrip('།༔')
-                    if stripped_token != token:  # Check if stripping actually changed the token
-                        token_for_check = stripped_token
-                        punctuation_was_stripped = True
-
-                if punctuation_was_stripped and token_for_check in stopwords_set:
-                    removed_count += 1
-                    continue  # Skip this token
-
-                # 3. If neither the original token nor its base form is a stopword, keep it.
-                cleaned_tokens.append(token)
-
-            return cleaned_tokens
-
         tokens = _remove_stopwords_from_tokens(tokens, stopwords_set)
         removed_count = original_token_count - len(tokens)
         logger.debug(f"Tokens after stopword removal (removed {removed_count}): {tokens[:20]}")
pipeline/hf_embedding.py CHANGED
@@ -33,26 +33,41 @@ def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[st
         logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
         return None, None
 
-def generate_embeddings(texts: List[str], model: SentenceTransformer) -> Optional[np.ndarray]:
+def generate_embeddings(
+    texts: List[str],
+    model: SentenceTransformer,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
+) -> np.ndarray:
     """
     Generates embeddings for a list of texts using a SentenceTransformer model.
 
     Args:
         texts (list[str]): A list of texts to embed.
         model (SentenceTransformer): The loaded SentenceTransformer model.
+        batch_size (int): The batch size for encoding.
+        show_progress_bar (bool): Whether to display a progress bar.
 
     Returns:
-        Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
+        np.ndarray: A numpy array containing the embeddings. Returns an empty array of the correct shape on failure.
     """
     if not texts or not isinstance(model, SentenceTransformer):
-        logger.warning("Invalid input for generating embeddings. Texts list is empty or model is not a SentenceTransformer.")
-        return None
+        logger.warning("Invalid input for generating embeddings. Returning empty array.")
+        # Return a correctly shaped empty array
+        embedding_dim = model.get_sentence_embedding_dimension() if isinstance(model, SentenceTransformer) else 768  # Fallback
+        return np.zeros((len(texts), embedding_dim))
 
     logger.info(f"Generating embeddings for {len(texts)} texts with {type(model).__name__}...")
     try:
-        embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+        embeddings = model.encode(
+            texts,
+            batch_size=batch_size,
+            convert_to_numpy=True,
+            show_progress_bar=show_progress_bar
+        )
         logger.info(f"Embeddings generated with shape: {embeddings.shape}")
         return embeddings
     except Exception as e:
         logger.error(f"An unexpected error occurred during embedding generation: {e}", exc_info=True)
-        return None
+        embedding_dim = model.get_sentence_embedding_dimension()
+        return np.zeros((len(texts), embedding_dim))
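
With the new keyword arguments, callers can trade memory for speed when encoding with a Hugging Face model. A usage sketch (the model id is only an example of a SentenceTransformer checkpoint, not necessarily one the app bundles):

from pipeline.hf_embedding import get_model, generate_embeddings

model, _ = get_model("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
if model is not None:
    embeddings = generate_embeddings(
        ["བོད་ཡིག་གི་དཔེ་ཚན་དང་པོ།", "བོད་ཡིག་གི་དཔེ་ཚན་གཉིས་པ།"],
        model,
        batch_size=8,            # mirrors the new UI slider default
        show_progress_bar=True,  # mirrors the new UI checkbox
    )
    print(embeddings.shape)      # (2, embedding_dim); an all-zero array signals failure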
pipeline/metrics.py CHANGED
@@ -55,16 +55,18 @@ def compute_normalized_lcs(words1: List[str], words2: List[str]) -> float:
 def compute_semantic_similarity(
     text1_segment: str,
     text2_segment: str,
-    tokens1: List[str],  # botok tokens for text1, not directly used by FastText path but kept for signature
-    tokens2: List[str],  # botok tokens for text2, not directly used by FastText path but kept for signature
-    model,  # FastText model object
-    model_type: str = "fasttext",  # Should always be 'fasttext' when called
+    tokens1: List[str],
+    tokens2: List[str],
+    model,
+    model_type: str = "fasttext",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
     fasttext_tokenize_fn=None,
     term_freq_corpus=None,
     doc_freq_map=None,
-    total_docs_in_corpus=0
+    total_docs_in_corpus=0,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> float:
     """Computes semantic similarity using a FastText model."""
     if model_type != "fasttext":
@@ -92,7 +94,9 @@
         tokenize_fn_param,
         term_freq_corpus_param,
         doc_freq_map_param,
-        total_docs_in_corpus_param
+        total_docs_in_corpus_param,
+        batch_size_param: int,
+        show_progress_bar_param: bool
     ) -> Union[np.ndarray, None]:
         """Helper to get a single embedding for a text using FastText."""
         if not raw_text_segment.strip():
@@ -109,7 +113,12 @@
                 use_lite_stopwords=use_lite_stopwords_param
             )
         elif model_type == "sentence-transformer":
-            embedding = generate_hf_embeddings(texts=[raw_text_segment], model=model_obj)
+            embedding = generate_hf_embeddings(
+                texts=[raw_text_segment],
+                model=model_obj,
+                batch_size=batch_size_param,
+                show_progress_bar=show_progress_bar_param
+            )
 
         if embedding is None or embedding.size == 0:
             logger.error(
@@ -120,12 +129,12 @@
 
     try:
         # Pass all relevant parameters to _get_aggregated_embedding
-        emb1 = _get_aggregated_embedding(text1_segment, tokens1, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus)
-        emb2 = _get_aggregated_embedding(text2_segment, tokens2, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus)
+        emb1 = _get_aggregated_embedding(text1_segment, tokens1, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus, batch_size, show_progress_bar)
+        emb2 = _get_aggregated_embedding(text2_segment, tokens2, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus, batch_size, show_progress_bar)
 
         if emb1 is None or emb2 is None or emb1.size == 0 or emb2.size == 0:
             logger.error(
-                "Failed to obtain one or both FastText embeddings for semantic similarity."
+                "Failed to obtain one or both embeddings for semantic similarity."
             )
             return np.nan
 
@@ -135,16 +144,16 @@
 
         # Handle cases where embeddings are all zeros
         if np.all(emb1 == 0) and np.all(emb2 == 0):
-            logger.info("Both FastText embeddings are zero. Semantic similarity is 0.0.")
+            logger.info("Both embeddings are zero. Semantic similarity is 0.0.")
            return 0.0
         if np.all(emb1 == 0) or np.all(emb2 == 0):
-            logger.info("One of the FastText embeddings is zero. Semantic similarity is 0.0.")
+            logger.info("One of the embeddings is zero. Semantic similarity is 0.0.")
            return 0.0
 
         # Handle NaN or Inf in embeddings
         if np.isnan(emb1).any() or np.isinf(emb1).any() or \
            np.isnan(emb2).any() or np.isinf(emb2).any():
-            logger.warning("NaN or Inf found in FastText embeddings. Semantic similarity set to 0.0.")
+            logger.warning("NaN or Inf found in embeddings. Semantic similarity set to 0.0.")
            return 0.0
 
         # Ensure embeddings are 2D for cosine_similarity: [1, dim]
@@ -159,17 +168,22 @@
         safe_text1 = str(text1_segment)[:100] if text1_segment is not None else "N/A"
         safe_text2 = str(text2_segment)[:100] if text2_segment is not None else "N/A"
         logger.error(
-            f"Error during FastText semantic similarity calculation:\nText1: {safe_text1}...\nText2: {safe_text2}...\nError: {e}"
+            f"Error during semantic similarity calculation:\nText1: {safe_text1}...\nText2: {safe_text2}...\nError: {e}"
         )
-        logger.exception("Traceback for FastText semantic similarity calculation error:")
+        logger.exception("Traceback for semantic similarity calculation error:")
         return np.nan
 
 
 def compute_all_metrics(
-    texts: Dict[str, str], model=None, enable_semantic: bool = True,  # device=None removed
-    model_type: str = "fasttext", use_stopwords: bool = True,
+    texts: Dict[str, str],
+    model=None,
+    enable_semantic: bool = True,
+    model_type: str = "fasttext",
+    use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
-    fasttext_tokenize_fn=None  # Added for FastText specific tokenizer
+    fasttext_tokenize_fn=None,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> pd.DataFrame:
     """
     Computes all selected similarity metrics between pairs of texts.
@@ -318,11 +332,13 @@
             if enable_semantic:
                 # Pass raw texts and their pre-computed botok tokens
                 semantic_sim = compute_semantic_similarity(
-                    texts[f1], texts[f2], words1_raw, words2_raw, model, model_type, use_stopwords, use_lite_stopwords,  # device removed
+                    texts[f1], texts[f2], words1_raw, words2_raw, model, model_type, use_stopwords, use_lite_stopwords,
                    fasttext_tokenize_fn=fasttext_tokenize_fn,
                    term_freq_corpus=term_freq_corpus_for_fasttext if model_type == "fasttext" else None,
                    doc_freq_map=document_frequency_map_for_fasttext if model_type == "fasttext" else None,
-                    total_docs_in_corpus=total_num_documents_for_fasttext if model_type == "fasttext" else 0
+                    total_docs_in_corpus=total_num_documents_for_fasttext if model_type == "fasttext" else 0,
+                    batch_size=batch_size,
+                    show_progress_bar=show_progress_bar
                 )
             else:
                 semantic_sim = np.nan
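
The final score is still plain cosine similarity on the two aggregated vectors. A short sketch of that last step, assuming scikit-learn's cosine_similarity (consistent with the "[1, dim]" reshape comment in the hunk above; the random vectors are stand-ins for real embeddings):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

emb1 = np.random.rand(300)   # stand-in for an aggregated FastText or sentence-transformer vector
emb2 = np.random.rand(300)
score = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]
print(float(score))          # a single value, bounded by [-1, 1]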
pipeline/process.py CHANGED
@@ -55,7 +55,9 @@ def process_texts(
     model_name: str = "facebook-fasttext-pretrained",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
-    progress_callback = None
+    progress_callback = None,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
     """
     Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
@@ -279,12 +281,17 @@
                 logger.info("Using botok word-level tokenization for FastText model.")
 
             pair_metrics = compute_all_metrics(
-                {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
+                texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
+                token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
+                metrics_to_compute=["jaccard", "lcs", "tfidf"],
                 model=model,
                 enable_semantic=enable_semantic,
                 model_type=model_type,
                 use_stopwords=use_stopwords,
-                use_lite_stopwords=use_lite_stopwords
+                use_lite_stopwords=use_lite_stopwords,
+                fasttext_tokenize_fn=tokenizer_for_fasttext,
+                batch_size=batch_size,
+                show_progress_bar=show_progress_bar
             )
 
             # Rename 'Text Pair' to show file stems and chapter number
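
Taken together, the new keyword arguments thread from the UI through process_texts down to the embedding call. A hypothetical end-to-end invocation matching the updated signature (file names and text contents are placeholders):

from pipeline.process import process_texts

text_data = {"witness_A.txt": "...", "witness_B.txt": "..."}   # chapter-marked Tibetan texts
filenames = list(text_data.keys())

df_results, word_counts_df, warning = process_texts(
    text_data,
    filenames,
    enable_semantic=True,
    model_name="facebook-fasttext-pretrained",
    use_stopwords=True,
    use_lite_stopwords=False,
    progress_callback=None,
    batch_size=8,
    show_progress_bar=False,
)
print(df_results.head())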