File size: 4,284 Bytes
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99b887a
 
 
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4bc190
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4bc190
 
 
 
 
 
 
532392b
c4bc190
 
 
532392b
 
 
c4bc190
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4bc190
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
import pandas as pd
import json
import os
from sentence_transformers import SentenceTransformer, util
from loguru import logger

# ================== CONFIGURATION ==================
# "wide" layout gives the dataframe and cluster views more horizontal room.
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained model for embeddings
# NOTE(review): the model is loaded at module import time, i.e. on every
# Streamlit script rerun; consider st.cache_resource — TODO confirm intended.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Load preloaded dataset
@st.cache_data
def load_data():
    """Load the merged problems dataset from the bundled zip archive.

    Returns:
        pd.DataFrame: the dataset read from ``data/merged_dataset.csv.zip``.
    """
    file_path = "data/merged_dataset.csv.zip"
    # BUG FIX: the original opened a .zip archive with gzip.open (wrong
    # format, and `gzip` was never imported).  pandas infers zip
    # compression from the ".zip" suffix, so no manual decompression is
    # needed.
    df = pd.read_csv(file_path)
    # BUG FIX: the original function had no return statement, so `df`
    # below was None and every later df[...] access crashed.
    return df

df = load_data()

# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Encode problem strings into L2-normalized sentence embeddings."""
    # Normalized vectors let downstream cosine similarity reduce to a
    # plain dot product.
    encoded = model.encode(problems, normalize_embeddings=True)
    return encoded

def find_similar_problems(df, similarity_threshold=0.9):
    """Find near-duplicate problems via cosine similarity of embeddings.

    Args:
        df: DataFrame with at least "uuid" and "problem" columns.
        similarity_threshold: pairs scoring strictly above this value are
            grouped together (default 0.9).

    Returns:
        dict: maps a base problem's uuid to a list of
        ``(other_uuid, similarity)`` tuples for every *later* row in the
        DataFrame exceeding the threshold (each pair is reported once).
    """
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    # BUG FIX: the original used df["uuid"][i], which is *label* indexing
    # and raises KeyError whenever the index is not 0..n-1 (e.g. after a
    # filter).  A plain list is positional and also hoists the Series
    # lookup out of the O(n^2) loop.
    uuids = df["uuid"].tolist()
    n = len(uuids)
    clusters = {}
    for i in range(n):
        similar_items = [
            (uuids[j], float(similarity_matrix[i][j]))  # float32 -> float for JSON
            for j in range(i + 1, n)
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[uuids[i]] = similar_items
    return clusters

def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters column-by-column.

    Args:
        df: DataFrame with a "uuid" column identifying each problem.
        similarity_threshold: forwarded to :func:`find_similar_problems`.

    Returns:
        dict: base uuid -> list of dicts, one per similar problem, each
        carrying its uuid, similarity score, and a per-column comparison
        against the base row (values converted to native Python types so
        the result is JSON-serializable).
    """
    clusters = find_similar_problems(df, similarity_threshold)
    # PERF: the original re-filtered the whole DataFrame (O(n)) for every
    # cluster member.  Build a uuid-indexed lookup once instead;
    # drop_duplicates keeps the first occurrence, mirroring the original
    # `.iloc[0]` semantics.  drop=False keeps "uuid" as a column too.
    rows_by_uuid = df.drop_duplicates(subset="uuid").set_index("uuid", drop=False)
    value_columns = [col for col in df.columns if col != "uuid"]
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = rows_by_uuid.loc[key]
        cluster_details = []
        for val, score in values:
            comparison_row = rows_by_uuid.loc[val]

            column_differences = {}
            for col in value_columns:
                base_val = base_row[col]
                comp_val = comparison_row[col]
                # Convert numpy scalars to native Python types for JSON
                if hasattr(base_val, 'item'):
                    base_val = base_val.item()
                if hasattr(comp_val, 'item'):
                    comp_val = comp_val.item()
                column_differences[col] = {
                    'base': base_val,
                    'comparison': comp_val,
                    'match': bool(base_val == comp_val)  # numpy bool -> Python bool
                }
            cluster_details.append({
                'uuid': val,
                'similarity_score': float(score),  # float32 -> float
                'column_differences': column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis

# ================== STREAMLIT UI ==================
st.title("πŸ” Problem Deduplication Explorer")

st.sidebar.header("Settings")
# Cosine-similarity cutoff forwarded to analyze_clusters when the user
# runs the analysis below.
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Run the full deduplication pass when the sidebar button is pressed;
# results are rendered inline and also offered as a JSON download.
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        cluster_report = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

    st.subheader("πŸ“Š Duplicate Problem Clusters")
    for anchor_uuid, members in cluster_report.items():
        anchor_text = df[df["uuid"] == anchor_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {anchor_text}")
        for member in members:
            member_text = df[df["uuid"] == member["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {member_text}")
            st.write(f"**Similarity Score:** {member['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(member["column_differences"])
            st.markdown("---")

    # Let the user save the full analysis from the sidebar.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(cluster_report, indent=2),
        file_name="deduplication_results.json",
        mime="application/json"
    )

# ================== DATAFRAME DISPLAY ==================
# Always show the raw dataset so users can browse it before/after analysis.
st.subheader("πŸ“„ Explore the Dataset")
st.dataframe(df)