import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-transformer model once, at import time;
# compute_pca() reuses this shared instance on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the submitted texts, reduce the embeddings to two dimensions with PCA,
# and return a labelled scatter plot of the result.
def compute_pca(data):
    """Embed the submitted texts and plot their first two principal components.

    Args:
        data: Rows of identifier/text pairs in any form accepted by
            ``pd.DataFrame(data, columns=["Identifier", "Text"])``; the click
            handler in this file passes a list of ``[identifier, text]`` lists.

    Returns:
        A Plotly scatter figure with one labelled point per valid row, or a
        ``gr.update(...)`` that clears the plot and relabels it with an
        explanation when fewer than two rows have both an identifier and a
        text.
    """
    df = pd.DataFrame(data, columns=["Identifier", "Text"])

    # Normalise to stripped strings before filtering so that None/NaN cells
    # count as empty; compared directly, NaN != '' is True and such rows
    # would otherwise be passed on to the encoder.
    identifiers = df["Identifier"].fillna("").astype(str).str.strip()
    texts = df["Text"].fillna("").astype(str).str.strip()
    valid_entries = df[(identifiers != "") & (texts != "")].reset_index(drop=True)

    # gr.update() is the version-independent way to patch a component;
    # the per-component gr.Plot.update() classmethod no longer exists in
    # Gradio 4 and later.
    if valid_entries.empty:
        return gr.update(value=None, label="No data to process. Please fill in the boxes.")

    # PCA(n_components=2) needs at least two samples; with a single row
    # scikit-learn raises ValueError, so report the problem in the UI instead.
    if len(valid_entries) < 2:
        return gr.update(
            value=None,
            label="At least two identifier-text pairs are required for PCA.",
        )

    # Generate one embedding vector per text.
    embeddings = model.encode(valid_entries["Text"].tolist())

    # Project the embeddings onto their first two principal components.
    pca_result = PCA(n_components=2).fit_transform(embeddings)
    valid_entries["PC1"] = pca_result[:, 0]
    valid_entries["PC2"] = pca_result[:, 1]

    # Scatter plot with each point labelled by its identifier.
    fig = px.scatter(valid_entries, x='PC1', y='PC2', text='Identifier', title='PCA of Text Embeddings')
    return fig

def text_editor_app(num_entries: int = 4):
    """Build the Gradio Blocks UI for the PCA text-embedding demo.

    The UI shows ``num_entries`` identifier/text box pairs, a "Run Analysis"
    button, and a plot that displays whatever ``compute_pca`` returns.

    Args:
        num_entries: Number of identifier/text pairs to render. Defaults to
            4, the layout this app originally hard-coded.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.

    Raises:
        ValueError: If ``num_entries`` is less than 2, since the analysis
            needs at least two identifier-text pairs.
    """
    if num_entries < 2:
        raise ValueError("num_entries must be at least 2")

    with gr.Blocks() as demo:
        gr.Markdown("### Enter at least two identifier-text pairs:")

        # Flat list of input components in the order the click handler
        # receives their values: identifier1, text1, identifier2, text2, ...
        inputs = []
        for number in range(1, num_entries + 1):
            with gr.Column():
                inputs.append(gr.Textbox(label=f"Identifier {number}"))
                inputs.append(gr.Textbox(label=f"Text {number}"))
            gr.Markdown("---")  # Horizontal rule between entries

        # Button to run the analysis
        analyze_button = gr.Button("Run Analysis")

        # Output plot
        output_plot = gr.Plot(label="PCA Visualization")

        def collect_inputs(*args):
            """Pair up the flat textbox values and run the PCA analysis."""
            # Even positions hold identifiers, odd positions the matching texts.
            data = [list(pair) for pair in zip(args[::2], args[1::2])]
            return compute_pca(data)

        analyze_button.click(fn=collect_inputs, inputs=inputs, outputs=output_plot)

    return demo


# Build the UI and start the Gradio server.
# NOTE(review): this runs on import as well as when the file is executed as a
# script; consider an `if __name__ == "__main__":` guard if this module is
# ever imported from elsewhere.
text_editor_app().launch()