"""Gradio demo exposing Paraphrase Mining and Semantic Textual Similarity."""

import logging
import multiprocessing
import threading

import gradio as gr

from mining import mining
from sts import sts
from utils import getDataFrame, save_to_csv, delete_folder_periodically

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

CONCURRENCY_LIMIT = 5

AVAILABLE_MODELS = [
    "Lajavaness/bilingual-embedding-large",
    "sentence-transformers/all-mpnet-base-v2",
    "intfloat/multilingual-e5-large-instruct"
]

MODEL_DESCRIPTIONS = {
    "Lajavaness/bilingual-embedding-large": "Multilingual model optimized for multiple languages. [More info](https://huggingface.co/Lajavaness/bilingual-embedding-large)",
    "sentence-transformers/all-mpnet-base-v2": "High-quality general-purpose model. [More info](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)",
    "intfloat/multilingual-e5-large-instruct": "Multilingual model with instructions. [More info](https://huggingface.co/intfloat/multilingual-e5-large-instruct)"
}

PROCESS_OK_MESSAGE = "Processing completed successfully!"
PROCESS_FAILED_MESSAGE = "Error processing data. Please check the logs for details."
SAVE_OK_MESSAGE = "Results saved successfully!"
SAVE_FAILED_MESSAGE = "Error saving results. Please check the logs for details."


def _describe_model(model_name):
    """Return the Markdown description for *model_name* ("" if unknown)."""
    return MODEL_DESCRIPTIONS.get(model_name, "")


def _make_notifier(success_message, failure_message):
    """Build an event callback that reports the outcome of the previous step.

    The returned callable takes the value of the component produced by the
    previous step. If that value is ``None`` it raises ``gr.Error`` so the
    failure is actually shown to the user; otherwise it shows an info toast.
    """

    def notify(result):
        # NOTE(review): for a Dataframe input Gradio may hand over an empty
        # frame rather than None after a failed step - confirm that None is
        # what the upstream functions produce on failure.
        if result is None:
            # gr.Error is an exception class: it only reaches the UI when
            # raised, merely constructing it has no visible effect.
            raise gr.Error(failure_message)
        gr.Info(success_message)

    return notify


def _csv_upload_section(heading, button_label, table_label):
    """Render a heading, a CSV upload button and a preview table.

    Returns the upload button so callers can use it as an event input.
    """
    gr.Markdown(heading)
    upload_button = gr.UploadButton(
        label=button_label,
        file_types=['.csv'],
        file_count="single",
        variant="primary"
    )
    preview = gr.Dataframe(
        headers=["_id", "text"],
        col_count=2,
        label=table_label,
        interactive=False
    )
    upload_button.upload(
        fn=getDataFrame,
        inputs=upload_button,
        outputs=preview,
        concurrency_limit=CONCURRENCY_LIMIT
    )
    return upload_button


def _model_controls():
    """Render the model dropdown, its description, the threshold and submit.

    Returns ``(model_dropdown, threshold_slider, submit_button)``.
    """
    model = gr.Dropdown(
        choices=AVAILABLE_MODELS,
        label="Select Model",
        value=AVAILABLE_MODELS[0],
        interactive=True
    )
    model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
    model.change(
        fn=_describe_model,
        inputs=model,
        outputs=model_description
    )

    threshold = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=0.96,
        step=0.01,
        label="Similarity Threshold",
        interactive=True
    )
    submit_button = gr.Button("Process", variant="primary")
    return model, threshold, submit_button


def _results_section(submit_button, process_fn, inputs, headers, label):
    """Render the results table and download controls and wire their events.

    ``submit_button`` triggers ``process_fn(*inputs)`` whose output fills the
    results table; the download button passes that table to ``save_to_csv``.
    """
    results = gr.Dataframe(
        headers=headers,
        type="polars",
        label=label
    )
    submit_button.click(
        fn=process_fn,
        inputs=inputs,
        outputs=results
    ).then(
        fn=_make_notifier(PROCESS_OK_MESSAGE, PROCESS_FAILED_MESSAGE),
        inputs=[results],
        outputs=[]
    )

    download_button = gr.Button("Download Results as CSV", variant="secondary")
    download_file = gr.File(label="Downloadable File")
    download_button.click(
        fn=save_to_csv,
        inputs=results,
        outputs=download_file
    ).then(
        fn=_make_notifier(SAVE_OK_MESSAGE, SAVE_FAILED_MESSAGE),
        inputs=[download_file],
        outputs=[]
    )


def _build_mining_tab():
    """Populate the "Paraphrase Mining" tab."""
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "### Paraphrase Mining\n"
                "Find paraphrases (texts with identical/similar meaning) in a large corpus of sentences.\n"
                "Upload a CSV file containing your sentences and select a model to begin."
            )

    with gr.Row():
        with gr.Column():
            upload_button_sentences = _csv_upload_section(
                "#### Input Sentences",
                "Upload Sentences CSV",
                "Sentences Data",
            )

    with gr.Row():
        with gr.Column():
            model, score_mining, submit_button_mining = _model_controls()

    with gr.Row():
        with gr.Column():
            _results_section(
                submit_button_mining,
                mining,
                [model, upload_button_sentences, score_mining],
                ["score", "sentence_1", "sentence_2"],
                "Mining Results",
            )


def _build_sts_tab():
    """Populate the "Semantic Textual Similarity" tab."""
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "### Semantic Textual Similarity (STS)\n"
                "Calculate semantic similarity between two sets of sentences.\n"
                "Upload two CSV files containing your sentences and select a model to begin."
            )

    with gr.Row():
        with gr.Column():
            upload_button_sentences1 = _csv_upload_section(
                "#### First Set of Sentences",
                "Upload First Set CSV",
                "First Set Data",
            )
        with gr.Column():
            upload_button_sentences2 = _csv_upload_section(
                "#### Second Set of Sentences",
                "Upload Second Set CSV",
                "Second Set Data",
            )

    with gr.Row():
        with gr.Column():
            model, score_sts, submit_button_sts = _model_controls()

    with gr.Row():
        with gr.Column():
            _results_section(
                submit_button_sts,
                sts,
                [model, upload_button_sentences1, upload_button_sentences2, score_sts],
                ["score", "sentences1", "sentences2"],
                "Similarity Results",
            )


def create_interface():
    """Build and return the Gradio ``Blocks`` app with both demo tabs."""
    with gr.Blocks(title="Sentence Transformers Demo") as demo:
        gr.Markdown("# Sentence Transformers Demo")
        gr.Markdown("This application provides two main functionalities: Paraphrase Mining and Semantic Textual Similarity (STS).")

        with gr.Tab("Paraphrase Mining"):
            _build_mining_tab()

        with gr.Tab("Semantic Textual Similarity"):
            _build_sts_tab()

    return demo


def main():
    """Start the background cleanup thread and launch the web interface."""
    multiprocessing.set_start_method("spawn")

    # Start cleanup thread
    # NOTE(review): 1800 is presumably the cleanup interval in seconds -
    # confirm against utils.delete_folder_periodically.
    folder_path = "data"
    thread = threading.Thread(
        target=delete_folder_periodically,
        args=(folder_path, 1800),
        daemon=True
    )
    thread.start()

    # Create and launch interface
    demo = create_interface()
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        show_api=False
    )


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # logger.exception keeps the traceback in the log before re-raising.
        logger.exception("Error starting application: %s", e)
        raise