File size: 2,341 Bytes
f4d46f6
 
 
 
 
 
 
 
 
 
250aae7
1b5bc8f
 
 
 
7fc6901
 
 
 
 
 
 
1b5bc8f
aafdbf2
 
f4d46f6
1b5bc8f
f4d46f6
250aae7
f4d46f6
 
 
1b5bc8f
7fc6901
1b5bc8f
 
f4d46f6
68dda59
1b5bc8f
f4d46f6
 
 
 
47c5583
 
 
 
 
 
 
 
 
807ad31
250aae7
f4d46f6
807ad31
47c5583
 
 
 
 
 
807ad31
f4d46f6
 
1b5bc8f
47c5583
f4d46f6
1014197
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-transformer once, at import
# time; compute_pca() below calls model.encode() on this shared instance.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the submitted texts and project the embeddings to 2-D with PCA.
def compute_pca(data):
    """Return a scatter plot of the 2-D PCA projection of the text embeddings.

    Args:
        data: Records describing the entries to analyse. Either a list of
            dictionaries with 'Identifier' and 'Text' keys, or a list of
            ``(identifier, text)`` pairs; both are normalised to the same
            two-column DataFrame.

    Returns:
        A Plotly figure with one labelled point per valid entry, or a Gradio
        update that clears the plot and shows a message in its label when
        there are not enough valid entries to run PCA.
    """
    # Passing `columns` names positional pairs and selects the matching keys
    # of dict records, so both input shapes yield the same frame (and an empty
    # input yields an empty frame instead of a KeyError further down).
    df = pd.DataFrame(data, columns=['Identifier', 'Text'])

    # Treat missing values like empty strings so they are filtered out too;
    # the comparison is done on stripped copies, the stored values are kept.
    identifiers = df['Identifier'].fillna('').astype(str).str.strip()
    texts = df['Text'].fillna('').astype(str).str.strip()
    valid_entries = df[(identifiers != '') & (texts != '')].reset_index(drop=True)

    if valid_entries.empty:
        # gr.update works on Gradio 3.x and 4.x+, unlike Component.update().
        return gr.update(value=None, label="No data to process. Please fill in the boxes.")

    if len(valid_entries) < 2:
        # PCA(n_components=2) rejects inputs with fewer than two samples.
        return gr.update(value=None, label="Please fill in at least two entries to run PCA.")

    # Generate embeddings
    embeddings = model.encode(valid_entries['Text'].tolist())

    # Perform PCA to reduce to 2 dimensions
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(embeddings)

    # Add PCA results to the DataFrame (index was reset above, so rows align)
    valid_entries['PC1'] = pca_result[:, 0]
    valid_entries['PC2'] = pca_result[:, 1]

    # Plot the PCA result with identifiers as labels
    fig = px.scatter(valid_entries, x='PC1', y='PC2', text='Identifier', title='PCA of Text Embeddings')
    return fig

def text_editor_app(num_entries=4):
    """Build the Gradio Blocks UI for entering texts and plotting their PCA.

    The layout is a single row holding one column per entry (an identifier
    textbox and a text textbox), followed by a "Run Analysis" button and a
    plot output. Clicking the button sends all textbox values to
    ``compute_pca``.

    Args:
        num_entries: Number of identifier/text input pairs to render.
            Defaults to 4.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks() as demo:
        identifiers = []
        texts = []
        with gr.Row():
            for i in range(1, num_entries + 1):
                with gr.Column():
                    identifiers.append(gr.Textbox(label=f"Identifier {i}"))
                    texts.append(gr.Textbox(label=f"Text {i}"))

        analyze_button = gr.Button("Run Analysis")
        output_plot = gr.Plot(label="PCA Visualization")

        def collect_inputs(*args):
            # Gradio passes the values in the order of `inputs` below:
            # first every identifier, then every text.
            id_values, text_values = args[:num_entries], args[num_entries:]
            # compute_pca expects records keyed by 'Identifier' and 'Text';
            # bare tuples would produce columns named 0 and 1.
            data = [
                {'Identifier': identifier, 'Text': text}
                for identifier, text in zip(id_values, text_values)
            ]
            return compute_pca(data)

        analyze_button.click(fn=collect_inputs, inputs=identifiers + texts, outputs=output_plot)

    return demo



# Build the Blocks app, then start serving it
demo = text_editor_app()
demo.launch()