"""Gradio demo comparing Italian sentence-transformer models with Annoy search.

For each selected model the app retrieves the nearest neighbour of the user's
sentence from the ``sentence1`` and ``sentence2`` columns of the Italian STS-B
test split, then reports the cosine similarity between the two retrieved
sentences.
"""

import os

import gradio as gr
import pandas as pd
from annoy import AnnoyIndex
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

# Sentence-transformers models to test.
model_names = [
    "nickprock/multi-sentence-BERTino",
    "nickprock/sentence-bert-base-italian-uncased",
    "nickprock/sentence-bert-base-italian-xxl-uncased",
    "nickprock/Italian-ModernBERT-base-embed-mmarco-mnrl",
]

# Number of trees built per Annoy index (trades build time for recall).
ANNOY_N_TREES = 10

# The containers below are filled in by the start-up section at the bottom of
# the file; the functions read them at call time.
sentences1 = []
sentences2 = []
models = {}
annoy_indexes1 = {}  # model name -> Annoy index over ``sentences1``
annoy_indexes2 = {}  # model name -> Annoy index over ``sentences2``


def _build_annoy_index(model, sentence_list, n_trees=ANNOY_N_TREES):
    """Embed ``sentence_list`` with ``model`` and return a built Annoy index.

    Item ``i`` of the index is the embedding of ``sentence_list[i]``, so a
    neighbour id returned by the index can be used directly as a list index.
    """
    embeddings = model.encode(sentence_list)
    # "angular" makes Annoy rank neighbours consistently with cosine
    # similarity, matching the measure used in ``calculate_similarity``.
    index = AnnoyIndex(embeddings.shape[1], "angular")
    for item_id, vector in enumerate(embeddings):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index


def find_similar_sentence_annoy(
    sentence, model_name, sentence_list, annoy_index, sentence_embedding=None
):
    """Find the sentence in ``sentence_list`` closest to ``sentence``.

    Args:
        sentence: Query text.
        model_name: Key into ``models`` and into ``annoy_index``.
        sentence_list: Sentences the index was built from, in insertion order.
        annoy_index: Mapping of model name to the Annoy index for
            ``sentence_list``.
        sentence_embedding: Optional precomputed embedding of ``sentence``
            for ``model_name``; lets callers that query several indexes
            avoid encoding the same text repeatedly.

    Returns:
        The entry of ``sentence_list`` whose embedding is nearest to the query.

    Raises:
        KeyError: If ``model_name`` has no loaded model or no index.
        ValueError: If the index returns no neighbour (e.g. it is empty).
    """
    if sentence_embedding is None:
        sentence_embedding = models[model_name].encode(sentence)
    nearest_neighbors = annoy_index[model_name].get_nns_by_vector(
        sentence_embedding, 1
    )
    if not nearest_neighbors:
        raise ValueError(f"Annoy index for {model_name!r} returned no neighbours")
    return sentence_list[nearest_neighbors[0]]


def calculate_similarity(sentence1, sentence2, model):
    """Return the cosine similarity of two sentences under ``model`` as a float."""
    embedding1 = model.encode(sentence1, convert_to_tensor=True)
    embedding2 = model.encode(sentence2, convert_to_tensor=True)
    return util.cos_sim(embedding1, embedding2).item()


def compare_models_annoy(sentence, model1_name, model2_name, model3_name, model4_name):
    """Run the nearest-neighbour lookup for every selected model.

    The same model may be chosen in more than one dropdown; it is evaluated
    once and appears once in each result dict.

    Returns:
        A ``(sentence1_results, sentence2_results, similarity_results)`` tuple
        of dicts keyed by model name: the nearest ``sentence1`` entry, the
        nearest ``sentence2`` entry, and the cosine similarity between those
        two retrieved sentences.
    """
    # ``dict.fromkeys`` removes duplicate selections while keeping their order.
    selected = list(
        dict.fromkeys((model1_name, model2_name, model3_name, model4_name))
    )

    sentence1_results = {}
    sentence2_results = {}
    similarity_results = {}
    for name in selected:
        # Encode the query once per model and reuse it for both indexes.
        query_embedding = models[name].encode(sentence)
        sentence1_results[name] = find_similar_sentence_annoy(
            sentence, name, sentences1, annoy_indexes1, query_embedding
        )
        sentence2_results[name] = find_similar_sentence_annoy(
            sentence, name, sentences2, annoy_indexes2, query_embedding
        )
        similarity_results[name] = calculate_similarity(
            sentence1_results[name], sentence2_results[name], models[name]
        )
    return sentence1_results, sentence2_results, similarity_results


def format_results(sentence1_results, sentence2_results, similarity_results):
    """Render the per-model results as Markdown text for Gradio.

    Models are listed in ``model_names`` order; models with no entry in the
    result dicts (not selected by the user) are skipped.
    """
    sections = [
        f"**{model_name}**\n"
        f"Most Similar Sentence from sentence1: {sentence1_results[model_name]}\n"
        f"Most Similar Sentence from sentence2: {sentence2_results[model_name]}\n"
        f"Similarity between retrieved sentences: {similarity_results[model_name]:.4f}\n\n"
        for model_name in model_names
        if model_name in sentence1_results
    ]
    return "".join(sections)


def gradio_interface(sentence, model1_name, model2_name, model3_name, model4_name):
    """Gradio callback: compare the selected models and return Markdown."""
    if not sentence or not sentence.strip():
        # A blank query has no meaningful nearest neighbour.
        return "Please enter a sentence."
    sentence1_results, sentence2_results, similarity_results = compare_models_annoy(
        sentence, model1_name, model2_name, model3_name, model4_name
    )
    return format_results(sentence1_results, sentence2_results, similarity_results)


try:
    # Load the dataset (Italian subset, test split).
    dataset = load_dataset("PhilipMay/stsb_multi_mt", name="it", split="test")
    df = pd.DataFrame(dataset)

    # Extract sentences (sentence1 and sentence2).
    sentences1 = df["sentence1"].tolist()
    sentences2 = df["sentence2"].tolist()

    models = {name: SentenceTransformer(name) for name in model_names}

    # Build one Annoy index per model and per sentence column so the request
    # handler only has to embed the query.
    annoy_indexes1 = {
        name: _build_annoy_index(model, sentences1) for name, model in models.items()
    }
    annoy_indexes2 = {
        name: _build_annoy_index(model, sentences2) for name, model in models.items()
    }

    iface = gr.Interface(
        fn=gradio_interface,
        inputs=[
            gr.Textbox(lines=2, placeholder="Enter your sentence here..."),
            gr.Dropdown(model_names, value=model_names[0], label="Model 1"),
            gr.Dropdown(model_names, value=model_names[1], label="Model 2"),
            gr.Dropdown(model_names, value=model_names[2], label="Model 3"),
            gr.Dropdown(model_names, value=model_names[3], label="Model 4"),
        ],
        outputs=gr.Markdown(),
        title="Sentence Transformer Model Comparison (Annoy)",
        description="Enter a sentence and compare the most similar sentences generated by different sentence-transformer models (using Annoy for faster search) from both sentence1 and sentence2.",
    )
except Exception as e:
    # Top-level boundary: any start-up failure (dataset download, model
    # download, index build) falls back to a placeholder UI instead of exiting.
    print(f"Error loading dataset: {e}")
    iface = gr.Interface(
        fn=lambda: "Dataset loading failed. Check console for details.",
        inputs=[],
        outputs=gr.Textbox(),
        title="Dataset Loading Error",
        description="There was an error loading the dataset.",
    )

# Launch outside the ``try`` so a server/launch error is not misreported as a
# dataset-loading error and does not trigger a second launch attempt.
iface.launch()