import streamlit as st
import pandas as pd
import json
import os
import zipfile
from sentence_transformers import SentenceTransformer, util
from loguru import logger

# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Pre-trained model used to embed problem statements for similarity scoring.
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def load_model() -> SentenceTransformer:
    """Load the embedding model once per server process, not on every rerun."""
    return SentenceTransformer(MODEL_NAME)


model = load_model()


# Load preloaded dataset
@st.cache_data
def load_data() -> pd.DataFrame:
    """Extract the bundled zip archive and return the merged dataset.

    The archive is unpacked into ``data/extracted`` as a side effect; the
    CSV inside is read into a DataFrame. Cached so the extraction runs once.
    """
    file_path = "data/merged_dataset.csv.zip"
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        # Log (rather than print) the archive contents for debuggability.
        logger.info("Extracting {} -> data/extracted: {}", file_path, zip_ref.namelist())
        zip_ref.extractall("data/extracted")
    return pd.read_csv("data/extracted/merged_dataset.csv")


df = load_data()

display_columns = ["uuid", "problem", "source", "question_type", "problem_type"]
df_filtered = df[display_columns]

# uuid -> row lookup built once; replaces repeated O(n) boolean-mask scans.
# keep="first" on drop_duplicates matches the original `.iloc[0]` behavior
# in the (unexpected) case of duplicate uuids.
_rows_by_uuid = df.drop_duplicates("uuid").set_index("uuid", drop=False)


# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Return L2-normalized sentence embeddings for a list of problem texts."""
    return model.encode(problems, normalize_embeddings=True)


def find_similar_problems(df, similarity_threshold=0.9):
    """Find near-duplicate problems via cosine similarity.

    Returns a list of ``(base_uuid, comp_uuid, score)`` tuples for every
    upper-triangle pair whose similarity exceeds ``similarity_threshold``,
    sorted by score, highest first.
    """
    embeddings = compute_embeddings(df["problem"].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    # Hoist uuid extraction out of the O(n^2) loop — per-pair .iloc lookups
    # dominated the original runtime.
    uuids = df["uuid"].tolist()
    pairs = [
        (uuids[i], uuids[j], float(similarity_matrix[i][j]))
        for i in range(len(uuids))
        for j in range(i + 1, len(uuids))
        if similarity_matrix[i][j] > similarity_threshold
    ]
    return sorted(pairs, key=lambda x: x[2], reverse=True)  # Sort by similarity score


def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    For each near-duplicate pair, records the similarity score and a
    per-column diff (base value, comparison value, exact-match flag) over
    every non-uuid column.
    """
    pairs = find_similar_problems(df, similarity_threshold)
    diff_columns = [col for col in df.columns if col != "uuid"]
    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = _rows_by_uuid.loc[base_uuid]
        comp_row = _rows_by_uuid.loc[comp_uuid]
        column_differences = {
            col: {
                "base": base_row[col],
                "comparison": comp_row[col],
                "match": bool(base_row[col] == comp_row[col]),
            }
            for col in diff_columns
        }
        detailed_analysis.append(
            {
                "base_uuid": base_uuid,
                "comp_uuid": comp_uuid,
                "similarity_score": score,
                "column_differences": column_differences,
            }
        )
    return detailed_analysis


def _render_pair(entry):
    """Render one duplicate pair: both problem texts, score, and column diff."""
    base_problem = _rows_by_uuid.loc[entry["base_uuid"], "problem"]
    similar_problem = _rows_by_uuid.loc[entry["comp_uuid"], "problem"]
    st.markdown(f"### Problem: {base_problem}")
    st.write(f"**Similar to:** {similar_problem}")
    st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
    with st.expander("Show Column Differences"):
        st.json(entry["column_differences"])
    st.markdown("---")


# ================== STREAMLIT UI ==================
st.title("🔍 Problem Deduplication Explorer")
st.sidebar.header("Settings")

similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Display first 5 rows of dataset
st.subheader("📄 Explore the Dataset")
st.dataframe(df_filtered.head(5))

if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        # Persist results in session_state: every widget click reruns the
        # script top-to-bottom, and the original code lost the results (and
        # broke "Show More") because they lived inside this button branch.
        st.session_state["dedup_results"] = analyze_clusters(df, similarity_threshold)
        st.session_state["num_display"] = 5
    st.success("Analysis Complete!")

if "dedup_results" in st.session_state:
    results = st.session_state["dedup_results"]
    st.subheader("📊 Duplicate Problem Pairs")

    # Filtering options
    sources = df["source"].unique().tolist()
    question_types = df["question_type"].unique().tolist()
    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)

    if selected_source:
        results = [
            r for r in results
            if _rows_by_uuid.loc[r["base_uuid"], "source"] == selected_source
        ]
    if selected_qtype:
        results = [
            r for r in results
            if _rows_by_uuid.loc[r["base_uuid"], "question_type"] == selected_qtype
        ]

    num_display = st.session_state.get("num_display", 5)
    for entry in results[:num_display]:
        _render_pair(entry)

    if len(results) > num_display:
        if st.button("Show More Results"):
            # Doubling matches the original paging (5 -> 10); rerun so the
            # newly raised limit is rendered immediately.
            st.session_state["num_display"] = num_display * 2
            st.rerun()