math-dedup / app.py
dsleo's picture
Update app.py
c6f32b1 verified
raw
history blame
5.18 kB
import streamlit as st
import pandas as pd
import json
import os
import zipfile
from sentence_transformers import SentenceTransformer, util
from loguru import logger
# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Name of the pre-trained sentence-transformers checkpoint used for embeddings.
MODEL_NAME = "all-MiniLM-L6-v2"


@st.cache_resource
def _load_model(model_name):
    """Build the embedding model for ``model_name``.

    Streamlit re-executes this script on every widget interaction; wrapping
    the constructor in ``st.cache_resource`` lets reruns reuse the same model
    object instead of constructing it again each time.
    """
    return SentenceTransformer(model_name)


# Shared embedding model used by compute_embeddings().
model = _load_model(MODEL_NAME)
# Dataset loader (result is memoised by Streamlit's data cache).
@st.cache_data
def load_data():
    """Unpack the bundled archive and return the merged dataset.

    Opens ``data/merged_dataset.csv.zip``, lists its contents via
    ``ZipFile.printdir()``, extracts every member into ``data/extracted``
    and then reads ``data/extracted/merged_dataset.csv`` with pandas.

    Returns:
        The DataFrame parsed from the extracted CSV file.
    """
    archive = zipfile.ZipFile("data/merged_dataset.csv.zip", 'r')
    try:
        # Emit the archive's table of contents before unpacking it.
        archive.printdir()
        archive.extractall("data/extracted")
    finally:
        archive.close()
    return pd.read_csv("data/extracted/merged_dataset.csv")
# Columns shown in the dataset preview table.
display_columns = ["uuid","problem", "source", "question_type", "problem_type"]
# Full dataset, used by the analysis and by the UI lookups below.
df = load_data()
# Preview restricted to the display columns.
df_filtered = df[display_columns]
# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Encode the given problem texts with the module-level ``model``.

    ``normalize_embeddings=True`` is passed through to ``model.encode``.

    Args:
        problems: The problem texts to embed.

    Returns:
        Whatever ``model.encode`` returns for ``problems``.
    """
    vectors = model.encode(problems, normalize_embeddings=True)
    return vectors
def find_similar_problems(df, similarity_threshold=0.9):
    """Find pairs of problems whose embeddings are highly similar.

    Args:
        df: DataFrame with a ``problem`` column (the text that is embedded)
            and a ``uuid`` column (the identifier reported for each row).
        similarity_threshold: A pair is kept only when its cosine similarity
            is strictly greater than this value.

    Returns:
        A list of ``(uuid_i, uuid_j, score)`` tuples, one per row pair with
        ``i < j`` that passes the threshold, sorted by ``score`` from highest
        to lowest. ``score`` is a plain Python ``float``. The list is empty
        when ``df`` has fewer than two rows.
    """
    # A pair needs two rows; skip the encoder entirely for smaller inputs.
    if len(df) < 2:
        return []

    embeddings = compute_embeddings(df['problem'].tolist())
    # NOTE: this materialises a full len(df) x len(df) similarity matrix.
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()

    # Pull the identifiers out once instead of doing a row lookup per pair.
    uuids = df["uuid"].tolist()

    pairs = []
    for i in range(len(uuids) - 1):
        # Only entries to the right of the diagonal: each unordered pair is
        # visited once and a row is never compared with itself.
        upper_row = similarity_matrix[i, i + 1:]
        for offset in (upper_row > similarity_threshold).nonzero()[0]:
            j = i + 1 + int(offset)
            pairs.append((uuids[i], uuids[j], float(upper_row[offset])))

    # Sort by similarity score, most similar first.
    return sorted(pairs, key=lambda pair: pair[2], reverse=True)
def _values_match(left, right):
    """Return True when two cell values are equal; two missing values count as equal."""
    left_missing = pd.isna(left)
    right_missing = pd.isna(right)
    if left_missing or right_missing:
        # NaN == NaN is False, so missing values need an explicit check.
        return bool(left_missing and right_missing)
    return bool(left == right)


def analyze_clusters(df, similarity_threshold=0.9):
    """Describe every similar-problem pair column by column.

    Args:
        df: DataFrame with at least the ``uuid`` and ``problem`` columns.
        similarity_threshold: Forwarded to ``find_similar_problems``.

    Returns:
        A list with one dict per pair returned by ``find_similar_problems``,
        in the same order. Each dict has the keys ``base_uuid``,
        ``comp_uuid``, ``similarity_score`` and ``column_differences``; the
        latter maps every column except ``uuid`` to a dict with the keys
        ``base``, ``comparison`` and ``match``. The list is empty when ``df``
        has fewer than two rows.
    """
    # No pair can exist with fewer than two rows.
    if len(df) < 2:
        return []

    pairs = find_similar_problems(df, similarity_threshold)

    # Map each uuid to the position of its first row once, instead of
    # scanning the whole frame with a boolean mask for every pair.
    first_position = {}
    for position, uuid in enumerate(df["uuid"].tolist()):
        first_position.setdefault(uuid, position)

    compared_columns = [col for col in df.columns if col != "uuid"]

    detailed_analysis = []
    for base_uuid, comp_uuid, score in pairs:
        base_row = df.iloc[first_position[base_uuid]]
        comp_row = df.iloc[first_position[comp_uuid]]
        column_differences = {
            col: {
                'base': base_row[col],
                'comparison': comp_row[col],
                'match': _values_match(base_row[col], comp_row[col]),
            }
            for col in compared_columns
        }
        detailed_analysis.append({
            'base_uuid': base_uuid,
            'comp_uuid': comp_uuid,
            'similarity_score': score,
            'column_differences': column_differences,
        })
    return detailed_analysis
# ================== STREAMLIT UI ==================
# Number of result pairs shown initially and added per "Show More Results" click.
RESULTS_PAGE_SIZE = 5


def _show_more_results():
    """Button callback: reveal one more page of result pairs on the next rerun."""
    st.session_state["num_display"] = (
        st.session_state.get("num_display", RESULTS_PAGE_SIZE) + RESULTS_PAGE_SIZE
    )


def _render_pair(entry, rows_by_uuid):
    """Render one similar-problem pair: both texts, the score and the column diff."""
    base_problem = rows_by_uuid.at[entry["base_uuid"], "problem"]
    similar_problem = rows_by_uuid.at[entry["comp_uuid"], "problem"]
    st.markdown(f"### Problem: {base_problem}")
    st.write(f"**Similar to:** {similar_problem}")
    st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
    with st.expander("Show Column Differences"):
        st.json(entry["column_differences"])
    st.markdown("---")


st.title("🔍 Problem Deduplication Explorer")
st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# Display first 5 rows of dataset
st.subheader("📄 Explore the Dataset")
st.dataframe(df_filtered.head(5))

if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        # Keep the results in session state: a button is only True during the
        # rerun triggered by its own click, so anything rendered solely inside
        # this branch would disappear on the next widget interaction.
        st.session_state["dedup_results"] = analyze_clusters(df, similarity_threshold)
    st.session_state["num_display"] = RESULTS_PAGE_SIZE
    st.success("Analysis Complete!")

results = st.session_state.get("dedup_results")
if results is not None:
    st.subheader("📊 Duplicate Problem Pairs")

    # First row per uuid, indexed for direct lookups while filtering/rendering.
    rows_by_uuid = df.drop_duplicates(subset="uuid").set_index("uuid")

    # Filtering options
    sources = df["source"].unique().tolist()
    question_types = df["question_type"].unique().tolist()
    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)
    if selected_source:
        results = [
            r for r in results
            if rows_by_uuid.at[r["base_uuid"], "source"] == selected_source
        ]
    if selected_qtype:
        results = [
            r for r in results
            if rows_by_uuid.at[r["base_uuid"], "question_type"] == selected_qtype
        ]

    if not results:
        st.info("No problem pairs match the current threshold and filters.")

    # Show the first page, plus one extra page per "Show More Results" click.
    num_display = st.session_state.get("num_display", RESULTS_PAGE_SIZE)
    for entry in results[:num_display]:
        _render_pair(entry, rows_by_uuid)

    if len(results) > num_display:
        st.button("Show More Results", on_click=_show_more_results)