Commit · 59185b1
Parent(s): 36f4f52
remove naming

Files changed:
- app.py +7 -69
- pipeline/process.py +12 -21
app.py
CHANGED
@@ -48,18 +48,6 @@ def main_interface():
                     "<small>Note: Maximum file size: 10MB per file. For optimal performance, use files under 1MB.</small>",
                     elem_classes="gr-markdown"
                 )
-
-            with gr.Column(scale=1, elem_classes="step-column", elem_id="chapter-rename-column"):
-                chapter_rename_group = gr.Group(visible=False)
-                with chapter_rename_group:
-                    gr.Markdown(
-                        """## Step 1.5: Name Your Chapters (Optional)
-<span style='font-size:16px;'>Provide a name for each chapter below. These names will be used in the heatmaps and results.</span>
-                        """,
-                        elem_classes="gr-markdown",
-                    )
-                    chapter_names_ui = gr.Column()
-
             with gr.Column(scale=1, elem_classes="step-column"):
                 with gr.Group():
                     gr.Markdown(
@@ -271,43 +259,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s

         warning_box = gr.Markdown(visible=False)

-
-        file_data_state = gr.State(value={})
-
-        def setup_chapter_rename_ui(files):
-            if not files:
-                return gr.update(visible=False), [], {}
-
-            file_data = {}
-            chapter_name_inputs = []
-            for file in files:
-                try:
-                    file_path = Path(file.name)
-                    content = file_path.read_text(encoding="utf-8-sig")
-                    segments = [seg for seg in content.split('༈') if seg.strip()]
-                    num_chapters = len(segments)
-                    file_data[file_path.name] = {'path': file.name, 'chapters': num_chapters}
-
-                    for i in range(num_chapters):
-                        default_name = f"{file_path.name} - Chapter {i + 1}"
-                        chapter_name_inputs.append(gr.Textbox(label=f"Name for Chapter {i+1} in '{file_path.name}'", value=default_name))
-                except Exception as e:
-                    logger.error(f"Error processing file {file.name} for chapter renaming: {e}")
-                    pass
-
-            if not chapter_name_inputs:
-                return gr.update(visible=False), [], {}
-
-            return gr.update(visible=True), chapter_name_inputs, file_data
-
-        # Wire up the chapter renaming UI
-        file_input.upload(
-            setup_chapter_rename_ui,
-            inputs=[file_input],
-            outputs=[chapter_rename_group, chapter_names_ui, file_data_state]
-        )
-
-        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, chapter_names_list, progress=gr.Progress()):
+        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, progress=gr.Progress()):
             """Run the text analysis pipeline on the uploaded files.

             Args:
@@ -388,8 +340,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 Path(file.name).name for file in files
             ] # Use Path().name to get just the filename
             text_data = {}
-            # The chapter_names_list is a list of lists, flatten it
-            flat_chapter_names = [name for sublist in chapter_names_list for name in sublist]

             # Read files with progress updates
             for i, file in enumerate(files):
@@ -440,16 +390,15 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 internal_model_id = "facebook-fasttext-pretrained"

             df_results, word_counts_df_data, warning_raw = process_texts(
-                text_data
-                filenames
+                text_data,
+                filenames,
                 enable_semantic=enable_semantic_bool,
                 model_name=internal_model_id,
                 use_stopwords=use_stopwords,
                 use_lite_stopwords=use_lite_stopwords,
                 progress_callback=progress_tracker,
                 batch_size=batch_size,
-                show_progress_bar=show_progress,
-                chapter_names=flat_chapter_names
+                show_progress_bar=show_progress
             )

             if df_results.empty:
@@ -555,20 +504,9 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
             logger.error(f"Error in interpret_results: {e}", exc_info=True)
             return f"Error interpreting results: {str(e)}"

-        # The `process_btn.click` call needs to be defined here.
-        # It will take inputs from all the configuration UI elements and the dynamic chapter name fields.
-        # The `chapter_names_ui` component, being a `gr.Column`, will pass the values of its children as a list.
         process_btn.click(
             fn=run_pipeline,
-            inputs=[
-                file_input,
-                semantic_toggle_radio,
-                model_dropdown,
-                stopwords_dropdown,
-                batch_size_slider,
-                progress_bar_checkbox,
-                chapter_names_ui, # Pass the column containing dynamic textboxes
-            ],
+            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown, batch_size_slider, progress_bar_checkbox],
             outputs=[
                 csv_output,
                 metrics_preview,
@@ -578,7 +516,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                 heatmap_tabs["Semantic Similarity"],
                 heatmap_tabs["TF-IDF Cosine Sim"],
                 warning_box,
-            ]
+            ]
         )

         # Connect the interpret button
@@ -593,4 +531,4 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s

 if __name__ == "__main__":
     demo = main_interface()
-    demo.launch(
+    demo.launch()
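Taken together, the app.py hunks strip the chapter-naming plumbing: run_pipeline loses its chapter_names_list parameter, the click wiring passes only the six remaining components, and the process_texts call no longer sends chapter_names. A minimal sketch of the resulting call shape follows; the stub below is hypothetical and only mirrors the argument shape visible in the diff, it is not the real pipeline.process.process_texts (its defaults are assumptions).

# Hypothetical stand-in for pipeline.process.process_texts; real defaults may differ.
def process_texts(text_data, filenames, enable_semantic=False, model_name="",
                  use_stopwords=True, use_lite_stopwords=False,
                  progress_callback=None, batch_size=32, show_progress_bar=False):
    return f"{len(filenames)} file(s), semantic={enable_semantic}"

result = process_texts(
    {"a.txt": "..."},   # text_data: filename -> raw file content
    ["a.txt"],          # filenames
    enable_semantic=True,
    model_name="facebook-fasttext-pretrained",
    use_stopwords=True,
    use_lite_stopwords=False,
    progress_callback=None,
    batch_size=32,
    show_progress_bar=False,  # chapter_names=... would now raise a TypeError
)
print(result)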
pipeline/process.py
CHANGED
@@ -57,8 +57,7 @@ def process_texts(
     use_lite_stopwords: bool = False,
     progress_callback = None,
     batch_size: int = 32,
-    show_progress_bar: bool = False,
-    chapter_names: List[str] = None
+    show_progress_bar: bool = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
     """
     Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
@@ -153,7 +152,6 @@ def process_texts(
     chapter_marker = "༈"
     fallback = False
     segment_texts = {}
-    chapter_name_counter = 0

     # Process each file
     for i, fname in enumerate(filenames):
@@ -183,19 +181,14 @@ def process_texts(
                     continue

             for idx, seg in enumerate(segments):
-
-                custom_name = chapter_names[chapter_name_counter] if chapter_names and chapter_name_counter < len(chapter_names) else f"Chapter {idx + 1}"
-                seg_id = f"{fname}|{custom_name}"
+                seg_id = f"{fname}|chapter {idx+1}"
                 cleaned_seg = clean_tibetan_text_for_fasttext(seg)
-                segment_texts[seg_id] =
-                chapter_name_counter += 1
+                segment_texts[seg_id] = cleaned_seg
         else:
             # No chapter markers found, treat entire file as one segment
-
-            seg_id = f"{fname}|{custom_name}"
+            seg_id = f"{fname}|chapter 1"
             cleaned_content = clean_tibetan_text_for_fasttext(content.strip())
-            segment_texts[seg_id] =
-            chapter_name_counter += 1
+            segment_texts[seg_id] = cleaned_content
             fallback = True

     # Generate warning if no chapter markers found
@@ -220,7 +213,7 @@ def process_texts(
             logger.warning(f"Progress callback error (non-critical): {e}")

     all_segment_ids = list(segment_texts.keys())
-    all_segment_contents =
+    all_segment_contents = list(segment_texts.values())
     tokenized_segments_list = tokenize_texts(all_segment_contents)

     segment_tokens = dict(zip(all_segment_ids, tokenized_segments_list))
@@ -301,7 +294,7 @@ def process_texts(
                     logger.info("Using botok word-level tokenization for FastText model.")

                 pair_metrics = compute_all_metrics(
-                    texts={seg1: segment_texts[seg1]
+                    texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                     token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
                     model=model,
                     enable_semantic=enable_semantic,
@@ -313,10 +306,9 @@ def process_texts(
                     show_progress_bar=show_progress_bar
                 )

-                # Rename 'Text Pair' to show file stems and chapter
-                chapter_name = seg1.split('|', 1)[1]
+                # Rename 'Text Pair' to show file stems and chapter number
                 pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
-                pair_metrics.loc[:, "Chapter"] = chapter_name
+                pair_metrics.loc[:, "Chapter"] = idx + 1
                 results.append(pair_metrics)

             except Exception as e:
@@ -341,7 +333,7 @@ def process_texts(
     word_counts_data = []

     # Process each segment
-    for i, (seg_id,
+    for i, (seg_id, text_content) in enumerate(segment_texts.items()):
         # Update progress
         if progress_callback is not None and len(segment_texts) > 0:
             try:
@@ -350,7 +342,8 @@ def process_texts(
         except Exception as e:
             logger.warning(f"Progress callback error (non-critical): {e}")

-        fname,
+        fname, chapter_info = seg_id.split("|", 1)
+        chapter_num = int(chapter_info.replace("chapter ", ""))

         try:
             # Use botok for accurate word count for raw Tibetan text
@@ -363,7 +356,6 @@ def process_texts(
                 word_counts_data.append(
                     {
                         "Filename": fname.replace(".txt", ""),
-                        "ChapterName": chapter_name,
                         "ChapterNumber": chapter_num,
                         "SegmentID": seg_id,
                         "WordCount": word_count,
@@ -375,7 +367,6 @@ def process_texts(
                 word_counts_data.append(
                     {
                         "Filename": fname.replace(".txt", ""),
-                        "ChapterName": chapter_name,
                         "ChapterNumber": chapter_num,
                         "SegmentID": seg_id,
                         "WordCount": 0,
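The process.py hunks restore numeric chapter identifiers in place of user-supplied names: segment IDs go back to the form "<filename>|chapter <n>", and the word-count loop parses the number back out of the ID. A small, self-contained sketch of that round trip; the helper name and sample data are illustrative only, while the split/parse lines mirror the added lines in the diff.

# Hypothetical helper showing the segment-ID convention this commit reverts to.
def build_segment_ids(files_to_segments):
    segment_texts = {}
    for fname, segments in files_to_segments.items():
        for idx, seg in enumerate(segments):
            seg_id = f"{fname}|chapter {idx+1}"  # same format as in process.py
            segment_texts[seg_id] = seg
    return segment_texts

segment_texts = build_segment_ids({"text_a.txt": ["first segment", "second segment"]})

# Recover filename and chapter number exactly as the word-count loop does:
for seg_id in segment_texts:
    fname, chapter_info = seg_id.split("|", 1)
    chapter_num = int(chapter_info.replace("chapter ", ""))
    print(fname, chapter_num)  # -> text_a.txt 1, then text_a.txt 2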