math-dedup / app.py
dsleo's picture
optim + logging
f83d20c verified
raw
history blame
5.93 kB
import streamlit as st
import pandas as pd
import numpy as np
import json
import os
import time
import zipfile
from sentence_transformers import SentenceTransformer, util
from loguru import logger
# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Name of the pre-trained sentence-transformers checkpoint used for embeddings.
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def _load_model(name):
    """Instantiate the embedding model once and reuse it across script reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; without the resource cache the model would be constructed
    again on each rerun.

    Args:
        name: Checkpoint name handed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance for ``name``.
    """
    return SentenceTransformer(name)


model = _load_model(MODEL_NAME)
# Load preloaded dataset
@st.cache_data
def load_data():
    """Unpack the bundled archive and load the merged dataset.

    The archive listing is printed to stdout and every member is extracted
    into ``data/extracted`` before the CSV is parsed.

    Returns:
        The DataFrame parsed from ``data/extracted/merged_dataset.csv``.
    """
    archive_path = "data/merged_dataset.csv.zip"
    with zipfile.ZipFile(archive_path, 'r') as archive:
        # Listing the members makes the archive contents visible in the logs.
        archive.printdir()
        archive.extractall("data/extracted")
    return pd.read_csv("data/extracted/merged_dataset.csv")
# Columns kept for display and for the pairwise comparison; everything else
# in the loaded dataset is dropped.
display_columns = [
    "uuid",
    "problem",
    "source",
    "question_type",
    "problem_type",
]
df = load_data()[display_columns]
# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Encode the given problem texts with the module-level model.

    Args:
        problems: The texts to embed.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``normalize_embeddings=True``.
    """
    encoded = model.encode(problems, normalize_embeddings=True)
    return encoded
def find_similar_problems(df, similarity_threshold=0.9):
    """Find pairs of problems whose embeddings are nearly identical.

    Embeds every entry of ``df["problem"]``, builds the full cosine
    similarity matrix and keeps each unordered pair whose similarity is
    strictly greater than ``similarity_threshold``. Progress is reported
    through Streamlit status/success elements.

    Args:
        df: DataFrame with at least the columns ``"problem"`` and ``"uuid"``.
        similarity_threshold: Pairs must score strictly above this value.

    Returns:
        A list of ``(base_uuid, comp_uuid, score)`` tuples sorted by
        descending score, where the base row precedes the compared row in
        ``df``. Empty when ``df`` has fewer than two rows.
    """
    # Fewer than two problems cannot form a pair; skip the model entirely.
    if len(df) < 2:
        return []

    st.status("🔄 Computing problem embeddings...")
    start_time = time.time()
    embeddings = compute_embeddings(df["problem"].tolist())
    st.success("✅ Embeddings computed!", icon="✅")

    st.status("🔄 Computing cosine similarity matrix...")
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    st.success("✅ Similarity matrix computed!", icon="✅")

    st.status("🔄 Filtering similar problems...")
    # Mask the strict upper triangle (k=1) so each unordered pair is reported
    # once and self-matches on the diagonal are excluded.
    above_threshold = np.triu(similarity_matrix > similarity_threshold, k=1)
    row_idx, col_idx = np.nonzero(above_threshold)
    scores = similarity_matrix[row_idx, col_idx]

    # Resolve positions to uuids in one vectorised step instead of one
    # DataFrame row lookup per pair.
    uuids = df["uuid"].to_numpy()
    pairs = list(
        zip(uuids[row_idx].tolist(), uuids[col_idx].tolist(), scores.tolist())
    )

    # sorted() is stable, so ties keep their row-major discovery order.
    sorted_pairs = sorted(pairs, key=lambda pair: pair[2], reverse=True)

    st.success(
        f"✅ Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s",
        icon="🎉",
    )
    return sorted_pairs
def analyze_clusters(df, similarity_threshold=0.9):
    """Describe, column by column, how each near-duplicate pair differs.

    Args:
        df: DataFrame with a ``"uuid"`` column plus the columns to compare.
        similarity_threshold: Forwarded to ``find_similar_problems``.

    Returns:
        One dict per pair, in the order ``find_similar_problems`` returned
        them, with the keys ``base_uuid``, ``comp_uuid``,
        ``similarity_score`` and ``column_differences``. The last maps every
        non-uuid column to ``{"base", "comparison", "match"}``. Two missing
        values in the same column count as a match.
    """
    pairs = find_similar_problems(df, similarity_threshold)
    if not pairs:
        return []

    # Index the rows by uuid once so each lookup is a hash probe rather than
    # a boolean scan of the whole frame. keep="first" mirrors taking the
    # first matching row when a uuid occurs more than once.
    rows_by_uuid = df.drop_duplicates(subset="uuid", keep="first").set_index("uuid")
    compared_columns = [col for col in df.columns if col != "uuid"]

    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = rows_by_uuid.loc[base_uuid]
        comp_row = rows_by_uuid.loc[comp_uuid]

        column_differences = {}
        for col in compared_columns:
            base_val = base_row[col]
            comp_val = comp_row[col]
            # NaN never compares equal to itself, so handle "both missing"
            # explicitly instead of reporting it as a difference.
            both_missing = bool(pd.isna(base_val)) and bool(pd.isna(comp_val))
            column_differences[col] = {
                'base': base_val,
                'comparison': comp_val,
                'match': both_missing or bool(base_val == comp_val),
            }

        detailed_analysis.append({
            'base_uuid': base_uuid,
            'comp_uuid': comp_uuid,
            'similarity_score': score,
            'column_differences': column_differences,
        })
    return detailed_analysis
# ================== STREAMLIT UI ==================
# Number of duplicate pairs shown initially and added per "Show More" click.
RESULTS_PAGE_SIZE = 5

# st.session_state keys. Results must outlive the button click that produced
# them: a button is only True on the rerun triggered by its own click, and
# every later widget interaction (filter change, "Show More") reruns the
# script with the button False again.
_RESULTS_KEY = "dedup_results"
_VISIBLE_KEY = "dedup_visible_count"


def _render_pair(entry):
    """Render one duplicate pair: both problem texts, the score and the diff."""
    # analyze_clusters records every non-uuid column of df, including
    # "problem", so the texts are read from the entry rather than re-queried.
    problem_diff = entry["column_differences"]["problem"]
    st.markdown(f"### Problem: {problem_diff['base']}")
    st.write(f"**Similar to:** {problem_diff['comparison']}")
    st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
    with st.expander("Show Column Differences"):
        st.json(entry["column_differences"])
    st.markdown("---")


def _show_more_results():
    """Button callback: reveal one more page of results on the next rerun."""
    current = st.session_state.get(_VISIBLE_KEY, RESULTS_PAGE_SIZE)
    st.session_state[_VISIBLE_KEY] = current + RESULTS_PAGE_SIZE


st.title("🔍 Problem Deduplication Explorer")
st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Display first 5 rows of dataset
st.subheader("📄 Explore the Dataset")
st.dataframe(df.head(5))

if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        st.session_state[_RESULTS_KEY] = analyze_clusters(df, similarity_threshold)
    # A fresh analysis starts again from the first page.
    st.session_state[_VISIBLE_KEY] = RESULTS_PAGE_SIZE
    st.success("Analysis Complete!")

results = st.session_state.get(_RESULTS_KEY)
if results is not None:
    st.subheader("📊 Duplicate Problem Pairs")

    # Filtering options
    sources = df["source"].unique().tolist()
    question_types = df["question_type"].unique().tolist()
    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)

    # Filters apply to the base problem of each pair.
    if selected_source:
        results = [
            r for r in results
            if r["column_differences"]["source"]["base"] == selected_source
        ]
    if selected_qtype:
        results = [
            r for r in results
            if r["column_differences"]["question_type"]["base"] == selected_qtype
        ]

    visible_count = st.session_state.get(_VISIBLE_KEY, RESULTS_PAGE_SIZE)
    for entry in results[:visible_count]:
        _render_pair(entry)

    if len(results) > visible_count:
        # The callback runs before the rerun, so the larger count is already
        # in session state when the results are rendered again.
        st.button("Show More Results", on_click=_show_more_results)