Commit · 59185b1
Parent(s): 36f4f52
remove naming

Files changed:
- app.py +7 -69
- pipeline/process.py +12 -21
app.py
CHANGED
@@ -48,18 +48,6 @@ def main_interface():
                     "<small>Note: Maximum file size: 10MB per file. For optimal performance, use files under 1MB.</small>",
                     elem_classes="gr-markdown"
                 )
-
-            with gr.Column(scale=1, elem_classes="step-column", elem_id="chapter-rename-column"):
-                chapter_rename_group = gr.Group(visible=False)
-                with chapter_rename_group:
-                    gr.Markdown(
-                        """## Step 1.5: Name Your Chapters (Optional)
-<span style='font-size:16px;'>Provide a name for each chapter below. These names will be used in the heatmaps and results.</span>
-                        """,
-                        elem_classes="gr-markdown",
-                    )
-                    chapter_names_ui = gr.Column()
-
             with gr.Column(scale=1, elem_classes="step-column"):
                 with gr.Group():
                     gr.Markdown(
@@ -271,43 +259,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s

         warning_box = gr.Markdown(visible=False)

-
-        file_data_state = gr.State(value={})
-
-        def setup_chapter_rename_ui(files):
-            if not files:
-                return gr.update(visible=False), [], {}
-
-            file_data = {}
-            chapter_name_inputs = []
-            for file in files:
-                try:
-                    file_path = Path(file.name)
-                    content = file_path.read_text(encoding="utf-8-sig")
-                    segments = [seg for seg in content.split('༈') if seg.strip()]
-                    num_chapters = len(segments)
-                    file_data[file_path.name] = {'path': file.name, 'chapters': num_chapters}
-
-                    for i in range(num_chapters):
-                        default_name = f"{file_path.name} - Chapter {i + 1}"
-                        chapter_name_inputs.append(gr.Textbox(label=f"Name for Chapter {i+1} in '{file_path.name}'", value=default_name))
-                except Exception as e:
-                    logger.error(f"Error processing file {file.name} for chapter renaming: {e}")
-                    pass
-
-            if not chapter_name_inputs:
-                return gr.update(visible=False), [], {}
-
-            return gr.update(visible=True), chapter_name_inputs, file_data
-
-        # Wire up the chapter renaming UI
-        file_input.upload(
-            setup_chapter_rename_ui,
-            inputs=[file_input],
-            outputs=[chapter_rename_group, chapter_names_ui, file_data_state]
-        )
-
-        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, chapter_names_list, progress=gr.Progress()):
+        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, progress=gr.Progress()):
             """Run the text analysis pipeline on the uploaded files.

             Args:
@@ -388,8 +340,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 Path(file.name).name for file in files
             ] # Use Path().name to get just the filename
             text_data = {}
-            # The chapter_names_list is a list of lists, flatten it
-            flat_chapter_names = [name for sublist in chapter_names_list for name in sublist]

             # Read files with progress updates
             for i, file in enumerate(files):
@@ -440,16 +390,15 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 internal_model_id = "facebook-fasttext-pretrained"

             df_results, word_counts_df_data, warning_raw = process_texts(
-                text_data
-                filenames
+                text_data,
+                filenames,
                 enable_semantic=enable_semantic_bool,
                 model_name=internal_model_id,
                 use_stopwords=use_stopwords,
                 use_lite_stopwords=use_lite_stopwords,
                 progress_callback=progress_tracker,
                 batch_size=batch_size,
-                show_progress_bar=show_progress,
-                chapter_names=flat_chapter_names
+                show_progress_bar=show_progress
             )

             if df_results.empty:
@@ -555,20 +504,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
             logger.error(f"Error in interpret_results: {e}", exc_info=True)
             return f"Error interpreting results: {str(e)}"

-        # The `process_btn.click` call needs to be defined here.
-        # It will take inputs from all the configuration UI elements and the dynamic chapter name fields.
-        # The `chapter_names_ui` component, being a `gr.Column`, will pass the values of its children as a list.
         process_btn.click(
             fn=run_pipeline,
-            inputs=[
-                file_input,
-                semantic_toggle_radio,
-                model_dropdown,
-                stopwords_dropdown,
-                batch_size_slider,
-                progress_bar_checkbox,
-                chapter_names_ui, # Pass the column containing dynamic textboxes
-            ],
+            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown, batch_size_slider, progress_bar_checkbox],
             outputs=[
                 csv_output,
                 metrics_preview,
@@ -578,7 +516,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 heatmap_tabs["Semantic Similarity"],
                 heatmap_tabs["TF-IDF Cosine Sim"],
                 warning_box,
-            ]
+            ]
         )

         # Connect the interpret button
@@ -593,4 +531,4 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s

 if __name__ == "__main__":
     demo = main_interface()
-    demo.launch(
+    demo.launch()
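Taken together, the app.py hunks strip the chapter-naming plumbing: run_pipeline loses its chapter_names_list parameter, the click wiring passes only the six remaining components, and the process_texts call no longer sends chapter_names. A minimal sketch of the resulting call shape follows; the stub below is hypothetical and only mirrors the argument shape visible in the diff, it is not the real pipeline.process.process_texts (its defaults are assumptions).

# Hypothetical stand-in for pipeline.process.process_texts; real defaults may differ.
def process_texts(text_data, filenames, enable_semantic=False, model_name="",
                  use_stopwords=True, use_lite_stopwords=False,
                  progress_callback=None, batch_size=32, show_progress_bar=False):
    return f"{len(filenames)} file(s), semantic={enable_semantic}"

result = process_texts(
    {"a.txt": "..."},   # text_data: filename -> raw file content
    ["a.txt"],          # filenames
    enable_semantic=True,
    model_name="facebook-fasttext-pretrained",
    use_stopwords=True,
    use_lite_stopwords=False,
    progress_callback=None,
    batch_size=32,
    show_progress_bar=False,  # chapter_names=... would now raise a TypeError
)
print(result)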
pipeline/process.py
CHANGED
@@ -57,8 +57,7 @@ def process_texts(
     use_lite_stopwords: bool = False,
     progress_callback = None,
     batch_size: int = 32,
-    show_progress_bar: bool = False,
-    chapter_names: List[str] = None
+    show_progress_bar: bool = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
     """
     Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
@@ -153,7 +152,6 @@ def process_texts(
     chapter_marker = "༈"
     fallback = False
     segment_texts = {}
-    chapter_name_counter = 0

     # Process each file
     for i, fname in enumerate(filenames):
@@ -183,19 +181,14 @@ def process_texts(
                     continue

             for idx, seg in enumerate(segments):
-
-                custom_name = chapter_names[chapter_name_counter] if chapter_names and chapter_name_counter < len(chapter_names) else f"Chapter {idx + 1}"
-                seg_id = f"{fname}|{custom_name}"
+                seg_id = f"{fname}|chapter {idx+1}"
                 cleaned_seg = clean_tibetan_text_for_fasttext(seg)
-                segment_texts[seg_id] =
-                chapter_name_counter += 1
+                segment_texts[seg_id] = cleaned_seg
         else:
             # No chapter markers found, treat entire file as one segment
-
-            seg_id = f"{fname}|{custom_name}"
+            seg_id = f"{fname}|chapter 1"
             cleaned_content = clean_tibetan_text_for_fasttext(content.strip())
-            segment_texts[seg_id] =
-            chapter_name_counter += 1
+            segment_texts[seg_id] = cleaned_content
             fallback = True

     # Generate warning if no chapter markers found
@@ -220,7 +213,7 @@ def process_texts(
             logger.warning(f"Progress callback error (non-critical): {e}")

     all_segment_ids = list(segment_texts.keys())
-    all_segment_contents =
+    all_segment_contents = list(segment_texts.values())
     tokenized_segments_list = tokenize_texts(all_segment_contents)

     segment_tokens = dict(zip(all_segment_ids, tokenized_segments_list))
@@ -301,7 +294,7 @@ def process_texts(
                     logger.info("Using botok word-level tokenization for FastText model.")

                 pair_metrics = compute_all_metrics(
-                    texts={seg1: segment_texts[seg1]
+                    texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                     token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
                     model=model,
                     enable_semantic=enable_semantic,
@@ -313,10 +306,9 @@ def process_texts(
                     show_progress_bar=show_progress_bar
                 )

-                # Rename 'Text Pair' to show file stems and chapter
-                chapter_name = seg1.split('|', 1)[1]
+                # Rename 'Text Pair' to show file stems and chapter number
                 pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
-                pair_metrics.loc[:, "Chapter"] = chapter_name
+                pair_metrics.loc[:, "Chapter"] = idx + 1
                 results.append(pair_metrics)

             except Exception as e:
@@ -341,7 +333,7 @@ def process_texts(
     word_counts_data = []

     # Process each segment
-    for i, (seg_id,
+    for i, (seg_id, text_content) in enumerate(segment_texts.items()):
         # Update progress
         if progress_callback is not None and len(segment_texts) > 0:
             try:
@@ -350,7 +342,8 @@ def process_texts(
         except Exception as e:
             logger.warning(f"Progress callback error (non-critical): {e}")

-        fname,
+        fname, chapter_info = seg_id.split("|", 1)
+        chapter_num = int(chapter_info.replace("chapter ", ""))

         try:
             # Use botok for accurate word count for raw Tibetan text
@@ -363,7 +356,6 @@ def process_texts(
                 word_counts_data.append(
                     {
                         "Filename": fname.replace(".txt", ""),
-                        "ChapterName": chapter_name,
                         "ChapterNumber": chapter_num,
                         "SegmentID": seg_id,
                         "WordCount": word_count,
@@ -375,7 +367,6 @@ def process_texts(
                 word_counts_data.append(
                     {
                         "Filename": fname.replace(".txt", ""),
-                        "ChapterName": chapter_name,
                         "ChapterNumber": chapter_num,
                         "SegmentID": seg_id,
                         "WordCount": 0,
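The process.py hunks restore numeric chapter identifiers in place of user-supplied names: segment IDs go back to the form "<filename>|chapter <n>", and the word-count loop parses the number back out of the ID. A small, self-contained sketch of that round trip; the helper name and sample data are illustrative only, while the split/parse lines mirror the added lines in the diff.

# Hypothetical helper showing the segment-ID convention this commit reverts to.
def build_segment_ids(files_to_segments):
    segment_texts = {}
    for fname, segments in files_to_segments.items():
        for idx, seg in enumerate(segments):
            seg_id = f"{fname}|chapter {idx+1}"  # same format as in process.py
            segment_texts[seg_id] = seg
    return segment_texts

segment_texts = build_segment_ids({"text_a.txt": ["first segment", "second segment"]})

# Recover filename and chapter number exactly as the word-count loop does:
for seg_id in segment_texts:
    fname, chapter_info = seg_id.split("|", 1)
    chapter_num = int(chapter_info.replace("chapter ", ""))
    print(fname, chapter_num)  # -> text_a.txt 1, then text_a.txt 2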