File size: 2,980 Bytes
f4d46f6
76026d0
f4d46f6
 
 
 
 
 
 
 
250aae7
76026d0
 
 
f8825b7
76026d0
 
7fc6901
 
76026d0
 
7fc6901
 
1b5bc8f
aafdbf2
 
f4d46f6
1b5bc8f
f4d46f6
250aae7
f4d46f6
 
 
1b5bc8f
7fc6901
1b5bc8f
 
f4d46f6
68dda59
1b5bc8f
f4d46f6
 
 
 
5c5f1db
 
11f43b6
 
76026d0
 
 
2834d04
 
76026d0
 
 
 
 
 
6119c28
76026d0
 
 
11f43b6
47c5583
76026d0
 
11f43b6
 
 
 
 
76026d0
 
11f43b6
 
 
76026d0
47c5583
76026d0
f4d46f6
 
76026d0
 
 
f4d46f6
1014197
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence transformer model once at import time; the
# same instance is reused by compute_pca() for every analysis request.
model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_pca(data):
    """Embed the given texts and plot their first two principal components.

    Args:
        data: Rows of ``[identifier, text]`` pairs. Rows whose identifier or
            text is missing or contains only whitespace are ignored.

    Returns:
        A Plotly scatter figure with one point per valid row, labelled by its
        identifier. When fewer than two valid rows remain, a Gradio update
        that clears the plot and sets an explanatory label is returned
        instead.
    """
    df = pd.DataFrame(data, columns=["Identifier", "Text"])

    # Normalise missing cells to empty strings. Otherwise NaN/None would pass
    # the "not blank" test below (NaN != '' is True) and break the encoder.
    df = df.fillna("").astype(str)

    has_identifier = df["Identifier"].str.strip() != ""
    has_text = df["Text"].str.strip() != ""
    valid_entries = df[has_identifier & has_text].reset_index(drop=True)

    if valid_entries.empty:
        # gr.update is the component-agnostic form; the per-component
        # gr.Plot.update classmethod is not available in newer Gradio releases.
        return gr.update(value=None, label="No data to process. Please fill in the boxes.")

    # PCA(n_components=2) requires at least two samples; fail gracefully
    # instead of raising inside the event handler.
    if len(valid_entries) < 2:
        return gr.update(
            value=None,
            label="Not enough data. Please enter at least two identifier-text pairs.",
        )

    # Generate embeddings
    embeddings = model.encode(valid_entries["Text"].tolist())

    # Perform PCA to reduce to 2 dimensions
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(embeddings)

    # Add PCA results to the DataFrame (index was reset above, so rows of
    # pca_result line up positionally with valid_entries).
    valid_entries["PC1"] = pca_result[:, 0]
    valid_entries["PC2"] = pca_result[:, 1]

    # Plot the PCA result with identifiers as labels
    fig = px.scatter(valid_entries, x='PC1', y='PC2', text='Identifier', title='PCA of Text Embeddings')
    return fig

def text_editor_app(num_entries: int = 4):
    """Build the Gradio Blocks UI for entering identifier/text pairs.

    The UI shows ``num_entries`` pairs of textboxes, a "Run Analysis" button
    and a plot. Clicking the button passes the entered pairs to
    ``compute_pca`` and shows its result in the plot.

    Args:
        num_entries: Number of identifier/text pairs to render. Defaults to 4.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here.

    Raises:
        ValueError: If ``num_entries`` is smaller than 2, since the analysis
            needs at least two pairs.
    """
    if num_entries < 2:
        raise ValueError("num_entries must be at least 2")

    with gr.Blocks() as demo:
        gr.Markdown("### Enter at least two identifier-text pairs:")

        # Flat list in the order identifier1, text1, identifier2, text2, ...
        # This is the order in which Gradio passes values to the click handler.
        inputs = []
        for i in range(num_entries):
            with gr.Column():
                id_input = gr.Textbox(label=f"Identifier {i+1}")
                text_input = gr.Textbox(label=f"Text {i+1}")
            inputs.extend([id_input, text_input])
            gr.Markdown("---")  # Add a horizontal rule to create a break

        # Button to run the analysis
        analyze_button = gr.Button("Run Analysis")

        # Output plot
        output_plot = gr.Plot(label="PCA Visualization")

        def collect_inputs(*args):
            """Regroup the flat textbox values into [identifier, text] rows."""
            # Even positions hold identifiers, odd positions hold texts.
            data = [[identifier, text] for identifier, text in zip(args[::2], args[1::2])]
            return compute_pca(data)

        analyze_button.click(fn=collect_inputs, inputs=inputs, outputs=output_plot)

    return demo




# Launch the app only when this file is executed as a script, so that
# importing the module does not start a web server as a side effect.
if __name__ == "__main__":
    text_editor_app().launch()