import streamlit as st
import pandas as pd
import json
import os
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
from loguru import logger
# ================== CONFIGURATION ==================
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained embedding model; cache it so it is loaded only once per session
MODEL_NAME = "all-MiniLM-L6-v2"

@st.cache_resource
def load_model():
    return SentenceTransformer(MODEL_NAME)

model = load_model()
# Load preloaded dataset
@st.cache_data
def load_data():
    data = [
        {
            "uuid": "350d6834-3231-5d23-89e9-c7dc0f3fde0b",
            "problem": "A function $f$ has the property that $f(3x-1)=x^2+x+1$ for all real numbers $x$. What is $f(5)$?",
            "source": "aops-wiki",
            "question_type": "MCQ",
            "problem_type": "Algebra"
        },
        {
            "uuid": "b67e9cf9-8b3a-5a34-a118-4ce2aeb2c3d8",
            "problem": "A function $f$ has the property that $f(3x-1)=x^2+x+1$ for all real numbers $x$. What is $f(5)$?",
            "source": "MATH-train",
            "question_type": "math-word-problem",
            "problem_type": "Algebra"
        },
    ]
    return pd.DataFrame(data)

df = load_data()
# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Compute L2-normalized sentence embeddings, so cosine similarity equals the dot product."""
    return model.encode(problems, normalize_embeddings=True)

def find_similar_problems(df, similarity_threshold=0.9):
    """Find similar problems using cosine similarity.

    Returns a dict mapping a problem's uuid to the later rows whose similarity
    exceeds the threshold; only the upper triangle of the similarity matrix is
    scanned, so each pair is reported once.
    """
    embeddings = compute_embeddings(df['problem'].tolist())
    similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
    clusters = {}
    for i in range(len(df)):
        current_uuid = df["uuid"][i]
        similar_items = [
            # Cast to float so the scores stay JSON-serializable for the export button.
            (df["uuid"][j], float(similarity_matrix[i][j]))
            for j in range(i + 1, len(df))
            if similarity_matrix[i][j] > similarity_threshold
        ]
        if similar_items:
            clusters[current_uuid] = similar_items
    return clusters
def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters."""
    clusters = find_similar_problems(df, similarity_threshold)
    detailed_analysis = {}
    for key, values in clusters.items():
        base_row = df[df["uuid"] == key].iloc[0]
        cluster_details = []
        for val, score in values:
            comparison_row = df[df["uuid"] == val].iloc[0]
            column_differences = {}
            for col in df.columns:
                if col != "uuid":
                    column_differences[col] = {
                        'base': base_row[col],
                        'comparison': comparison_row[col],
                        'match': base_row[col] == comparison_row[col]
                    }
            cluster_details.append({
                'uuid': val,
                'similarity_score': score,
                'column_differences': column_differences,
            })
        detailed_analysis[key] = cluster_details
    return detailed_analysis
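# Illustrative sketch (an assumption, not captured output) of what analyze_clusters
# returns for the two sample rows above, which share identical problem text:
# {
#     "350d6834-3231-5d23-89e9-c7dc0f3fde0b": [
#         {
#             "uuid": "b67e9cf9-8b3a-5a34-a118-4ce2aeb2c3d8",
#             "similarity_score": 1.0,   # ~1.0 for identical problem text
#             "column_differences": {
#                 "problem": {"base": "...", "comparison": "...", "match": True},
#                 "source": {"base": "aops-wiki", "comparison": "MATH-train", "match": False},
#                 "question_type": {"base": "MCQ", "comparison": "math-word-problem", "match": False},
#                 "problem_type": {"base": "Algebra", "comparison": "Algebra", "match": True},
#             },
#         }
#     ]
# }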
# ================== STREAMLIT UI ==================
st.title("Problem Deduplication Explorer")
st.sidebar.header("Settings")
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")

    st.subheader("Duplicate Problem Clusters")
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
            st.markdown("---")

    # Export results (kept inside the button branch so `results` is always defined)
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2),
        file_name="deduplication_results.json",
        mime="application/json"
    )
# ================== DATAFRAME DISPLAY ==================
st.subheader("Explore the Dataset")
st.dataframe(df)
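
# To run this app locally (assuming the file is saved as app.py and the imported
# packages are installed):
#   pip install streamlit pandas sentence-transformers openai loguru
#   streamlit run app.py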