daniel-wojahn committed on
Commit 59185b1 · 1 Parent(s): 36f4f52

remove naming

Files changed (2)
  1. app.py +7 -69
  2. pipeline/process.py +12 -21
app.py CHANGED

@@ -48,18 +48,6 @@ def main_interface():
                     "<small>Note: Maximum file size: 10MB per file. For optimal performance, use files under 1MB.</small>",
                     elem_classes="gr-markdown"
                 )
-
-            with gr.Column(scale=1, elem_classes="step-column", elem_id="chapter-rename-column"):
-                chapter_rename_group = gr.Group(visible=False)
-                with chapter_rename_group:
-                    gr.Markdown(
-                        """## Step 1.5: Name Your Chapters (Optional)
-                        <span style='font-size:16px;'>Provide a name for each chapter below. These names will be used in the heatmaps and results.</span>
-                        """,
-                        elem_classes="gr-markdown",
-                    )
-                    chapter_names_ui = gr.Column()
-
             with gr.Column(scale=1, elem_classes="step-column"):
                 with gr.Group():
                     gr.Markdown(
@@ -271,43 +259,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
         warning_box = gr.Markdown(visible=False)
 
-        # State to hold file info and chapter names
-        file_data_state = gr.State(value={})
-
-        def setup_chapter_rename_ui(files):
-            if not files:
-                return gr.update(visible=False), [], {}
-
-            file_data = {}
-            chapter_name_inputs = []
-            for file in files:
-                try:
-                    file_path = Path(file.name)
-                    content = file_path.read_text(encoding="utf-8-sig")
-                    segments = [seg for seg in content.split('༈') if seg.strip()]
-                    num_chapters = len(segments)
-                    file_data[file_path.name] = {'path': file.name, 'chapters': num_chapters}
-
-                    for i in range(num_chapters):
-                        default_name = f"{file_path.name} - Chapter {i + 1}"
-                        chapter_name_inputs.append(gr.Textbox(label=f"Name for Chapter {i+1} in '{file_path.name}'", value=default_name))
-                except Exception as e:
-                    logger.error(f"Error processing file {file.name} for chapter renaming: {e}")
-                    pass
-
-            if not chapter_name_inputs:
-                return gr.update(visible=False), [], {}
-
-            return gr.update(visible=True), chapter_name_inputs, file_data
-
-        # Wire up the chapter renaming UI
-        file_input.upload(
-            setup_chapter_rename_ui,
-            inputs=[file_input],
-            outputs=[chapter_rename_group, chapter_names_ui, file_data_state]
-        )
-
-        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, chapter_names_list, progress=gr.Progress()):
+        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, progress=gr.Progress()):
             """Run the text analysis pipeline on the uploaded files.
 
             Args:
@@ -388,8 +340,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 Path(file.name).name for file in files
             ]  # Use Path().name to get just the filename
             text_data = {}
-            # The chapter_names_list is a list of lists, flatten it
-            flat_chapter_names = [name for sublist in chapter_names_list for name in sublist]
 
             # Read files with progress updates
            for i, file in enumerate(files):
@@ -440,16 +390,15 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 internal_model_id = "facebook-fasttext-pretrained"
 
             df_results, word_counts_df_data, warning_raw = process_texts(
-                text_data=text_data,
-                filenames=filenames,
+                text_data,
+                filenames,
                 enable_semantic=enable_semantic_bool,
                 model_name=internal_model_id,
                 use_stopwords=use_stopwords,
                 use_lite_stopwords=use_lite_stopwords,
                 progress_callback=progress_tracker,
                 batch_size=batch_size,
-                show_progress_bar=show_progress,
-                chapter_names=flat_chapter_names
+                show_progress_bar=show_progress
             )
 
             if df_results.empty:
@@ -555,20 +504,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 logger.error(f"Error in interpret_results: {e}", exc_info=True)
                 return f"Error interpreting results: {str(e)}"
 
-        # The `process_btn.click` call needs to be defined here.
-        # It will take inputs from all the configuration UI elements and the dynamic chapter name fields.
-        # The `chapter_names_ui` component, being a `gr.Column`, will pass the values of its children as a list.
         process_btn.click(
             fn=run_pipeline,
-            inputs=[
-                file_input,
-                semantic_toggle_radio,
-                model_dropdown,
-                stopwords_dropdown,
-                batch_size_slider,
-                progress_bar_checkbox,
-                chapter_names_ui,  # Pass the column containing dynamic textboxes
-            ],
+            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown, batch_size_slider, progress_bar_checkbox],
             outputs=[
                 csv_output,
                 metrics_preview,
@@ -578,7 +516,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 heatmap_tabs["Semantic Similarity"],
                 heatmap_tabs["TF-IDF Cosine Sim"],
                 warning_box,
-            ],
+            ]
         )
 
         # Connect the interpret button
@@ -593,4 +531,4 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
 if __name__ == "__main__":
     demo = main_interface()
-    demo.launch(share=True)
+    demo.launch()
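
A side note on the last hunk: share=True asks Gradio to open a temporary public *.gradio.live tunnel in addition to the local server, which is unnecessary when the app is already hosted (for example on a Space). A minimal sketch of the two launch modes, using a placeholder app rather than this repository's interface:

import gradio as gr

demo = gr.Interface(fn=lambda x: x, inputs="text", outputs="text")  # placeholder app, illustration only

# demo.launch(share=True)  # old behaviour: also creates a temporary public share link
demo.launch()              # new behaviour: serve only on the default local address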
pipeline/process.py CHANGED

@@ -57,8 +57,7 @@ def process_texts(
     use_lite_stopwords: bool = False,
     progress_callback = None,
     batch_size: int = 32,
-    show_progress_bar: bool = False,
-    chapter_names: List[str] = None
+    show_progress_bar: bool = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
     """
     Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
@@ -153,7 +152,6 @@ def process_texts(
     chapter_marker = "༈"
     fallback = False
     segment_texts = {}
-    chapter_name_counter = 0
 
     # Process each file
     for i, fname in enumerate(filenames):
@@ -183,19 +181,14 @@ def process_texts(
                 continue
 
             for idx, seg in enumerate(segments):
-                # Use custom chapter name if available
-                custom_name = chapter_names[chapter_name_counter] if chapter_names and chapter_name_counter < len(chapter_names) else f"Chapter {idx + 1}"
-                seg_id = f"{fname}|{custom_name}"
+                seg_id = f"{fname}|chapter {idx+1}"
                 cleaned_seg = clean_tibetan_text_for_fasttext(seg)
-                segment_texts[seg_id] = (cleaned_seg, idx + 1)  # Store text and original number
-                chapter_name_counter += 1
+                segment_texts[seg_id] = cleaned_seg
         else:
             # No chapter markers found, treat entire file as one segment
-            custom_name = chapter_names[chapter_name_counter] if chapter_names and chapter_name_counter < len(chapter_names) else "Chapter 1"
-            seg_id = f"{fname}|{custom_name}"
+            seg_id = f"{fname}|chapter 1"
             cleaned_content = clean_tibetan_text_for_fasttext(content.strip())
-            segment_texts[seg_id] = (cleaned_content, 1)
-            chapter_name_counter += 1
+            segment_texts[seg_id] = cleaned_content
             fallback = True
 
         # Generate warning if no chapter markers found
@@ -220,7 +213,7 @@ def process_texts(
                 logger.warning(f"Progress callback error (non-critical): {e}")
 
     all_segment_ids = list(segment_texts.keys())
-    all_segment_contents = [data[0] for data in segment_texts.values()]
+    all_segment_contents = list(segment_texts.values())
     tokenized_segments_list = tokenize_texts(all_segment_contents)
 
     segment_tokens = dict(zip(all_segment_ids, tokenized_segments_list))
@@ -301,7 +294,7 @@ def process_texts(
                     logger.info("Using botok word-level tokenization for FastText model.")
 
                 pair_metrics = compute_all_metrics(
-                    texts={seg1: segment_texts[seg1][0], seg2: segment_texts[seg2][0]},
+                    texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                     token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
                     model=model,
                     enable_semantic=enable_semantic,
@@ -313,10 +306,9 @@ def process_texts(
                     show_progress_bar=show_progress_bar
                 )
 
-                # Rename 'Text Pair' to show file stems and chapter name
-                chapter_name = seg1.split('|', 1)[1]
+                # Rename 'Text Pair' to show file stems and chapter number
                 pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
-                pair_metrics.loc[:, "Chapter"] = chapter_name
+                pair_metrics.loc[:, "Chapter"] = idx + 1
                 results.append(pair_metrics)
 
             except Exception as e:
@@ -341,7 +333,7 @@ def process_texts(
     word_counts_data = []
 
     # Process each segment
-    for i, (seg_id, (text_content, chapter_num)) in enumerate(segment_texts.items()):
+    for i, (seg_id, text_content) in enumerate(segment_texts.items()):
         # Update progress
         if progress_callback is not None and len(segment_texts) > 0:
             try:
@@ -350,7 +342,8 @@ def process_texts(
             except Exception as e:
                 logger.warning(f"Progress callback error (non-critical): {e}")
 
-        fname, chapter_name = seg_id.split("|", 1)
+        fname, chapter_info = seg_id.split("|", 1)
+        chapter_num = int(chapter_info.replace("chapter ", ""))
 
         try:
             # Use botok for accurate word count for raw Tibetan text
@@ -363,7 +356,6 @@ def process_texts(
                 word_counts_data.append(
                     {
                         "Filename": fname.replace(".txt", ""),
-                        "ChapterName": chapter_name,
                         "ChapterNumber": chapter_num,
                         "SegmentID": seg_id,
                         "WordCount": word_count,
@@ -375,7 +367,6 @@ def process_texts(
                 word_counts_data.append(
                     {
                         "Filename": fname.replace(".txt", ""),
-                        "ChapterName": chapter_name,
                        "ChapterNumber": chapter_num,
                         "SegmentID": seg_id,
                         "WordCount": 0,