dsleo committed on
Commit 17186a1 · verified · 1 Parent(s): 7362def
Files changed (1)
  1. app.py +185 -116
app.py CHANGED
@@ -9,98 +9,110 @@ from sentence_transformers import SentenceTransformer, util
 from loguru import logger

 # ================== CONFIGURATION ==================
-st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")
+st.set_page_config(
+    page_title="Problem Deduplication Explorer",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# Load a pre-trained model for embeddings with HF caching
+@st.cache_resource
+def load_model():
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    try:
+        return SentenceTransformer(model_name, cache_folder="/tmp/sentence_transformers")
+    except Exception as e:
+        st.error(f"Error loading model: {e}")
+        return None

-# Load a pre-trained model for embeddings
-MODEL_NAME = "all-MiniLM-L6-v2"
-model = SentenceTransformer(MODEL_NAME)
+model = load_model()

 # Load preloaded dataset
 @st.cache_data
 def load_data():
-    file_path = "data/merged_dataset.csv.zip"
-    with zipfile.ZipFile(file_path, 'r') as zip_ref:
-        zip_ref.printdir()
-        zip_ref.extractall("data/extracted")
-    df = pd.read_csv("data/extracted/merged_dataset.csv")
-    return df
-
-df = load_data()
-
-display_columns = ["uuid","problem", "source", "question_type", "problem_type"]
-df = df[display_columns]
-
-# ================== FUNCTION DEFINITIONS ==================
+    try:
+        file_path = "data/merged_dataset.csv.zip"
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            with zip_ref.open(zip_ref.namelist()[0]) as file:
+                df = pd.read_csv(file)
+        return df[["uuid", "problem", "source", "question_type", "problem_type"]]
+    except Exception as e:
+        st.error(f"Error loading dataset: {e}")
+        # Return empty DataFrame with correct columns if loading fails
+        return pd.DataFrame(columns=["uuid", "problem", "source", "question_type", "problem_type"])
+
+# Cache embeddings computation with error handling
+@st.cache_data
 def compute_embeddings(problems):
-    """Compute sentence embeddings."""
-    return model.encode(problems, normalize_embeddings=True)
+    """Compute and cache sentence embeddings."""
+    try:
+        return model.encode(problems, normalize_embeddings=True)
+    except Exception as e:
+        st.error(f"Error computing embeddings: {e}")
+        return np.array([])

-def find_similar_problems(df, similarity_threshold=0.9):
+# ================== FUNCTION DEFINITIONS ==================
+def find_similar_problems(df, similarity_threshold=0.9, progress_bar=None):
     """Find similar problems using cosine similarity, optimized for speed."""
-
-    status_msgs = []  # Store status messages to clear later
-
-    # Step 1: Compute embeddings
-    msg = st.status("🔄 Computing problem embeddings...")
-    status_msgs.append(msg)
-    start_time = time.time()
+    if df.empty:
+        return []
+
+    # Compute embeddings with progress tracking
     embeddings = compute_embeddings(df['problem'].tolist())
+    if embeddings.size == 0:
+        return []
+
+    if progress_bar:
+        progress_bar.progress(0.33, "Computing similarity matrix...")

-    # Step 2: Compute similarity matrix
-    msg = st.status("🔄 Computing cosine similarity matrix...")
-    status_msgs.append(msg)
+    # Compute similarity matrix
     similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
+    if progress_bar:
+        progress_bar.progress(0.66, "Finding similar pairs...")

-    # Step 3: Filter top similarities
-    msg = st.status("🔄 Filtering similar problems...")
-    status_msgs.append(msg)
-
+    # Use numpy operations for better performance
     num_problems = len(df)
     upper_triangle_indices = np.triu_indices(num_problems, k=1)
+    similarity_scores = similarity_matrix[upper_triangle_indices]

-    i_indices, j_indices = upper_triangle_indices
-    similarity_scores = similarity_matrix[i_indices, j_indices]
-
     # Filter based on threshold
     mask = similarity_scores > similarity_threshold
-    filtered_i = i_indices[mask]
-    filtered_j = j_indices[mask]
-    filtered_scores = similarity_scores[mask]
-
+    filtered_indices = np.where(mask)[0]
+
     pairs = [
-        (df.iloc[i]["uuid"], df.iloc[j]["uuid"], float(score))
-        for i, j, score in zip(filtered_i, filtered_j, filtered_scores)
+        (df.iloc[upper_triangle_indices[0][i]]["uuid"],
+         df.iloc[upper_triangle_indices[1][i]]["uuid"],
+         float(similarity_scores[i]))
+        for i in filtered_indices
     ]
-
-    sorted_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)

-    # Step 4: Remove intermediate messages
-    for msg in status_msgs:
-        msg.empty()  # Clear only the intermediate messages
+    if progress_bar:
+        progress_bar.progress(1.0, "Analysis complete!")
+        time.sleep(0.5)
+        progress_bar.empty()

-    # Step 5: Display final success message
-    st.success(f"✅ Analysis complete! Found {len(sorted_pairs)} similar problems in {time.time() - start_time:.2f}s", icon="🎉")
+    return sorted(pairs, key=lambda x: x[2], reverse=True)

-    return sorted_pairs
-
-def analyze_clusters(df, similarity_threshold=0.9):
-    """Analyze duplicate problem clusters."""
-    pairs = find_similar_problems(df, similarity_threshold)
+@st.cache_data
+def analyze_clusters(_df, pairs):
+    """Analyze duplicate problem clusters with caching."""
+    if not pairs or _df.empty:
+        return []
+
     detailed_analysis = []
     for base_uuid, comp_uuid, score in pairs:
-        base_row = df[df["uuid"] == base_uuid].iloc[0]
-        comp_row = df[df["uuid"] == comp_uuid].iloc[0]
-
-        column_differences = {}
-        for col in df.columns:
-            if col != "uuid":
-                base_val = base_row[col]
-                comp_val = comp_row[col]
-                column_differences[col] = {
-                    'base': base_val,
-                    'comparison': comp_val,
-                    'match': bool(base_val == comp_val)
-                }
+        base_row = _df[_df["uuid"] == base_uuid].iloc[0]
+        comp_row = _df[_df["uuid"] == comp_uuid].iloc[0]
+
+        column_differences = {
+            col: {
+                'base': base_row[col],
+                'comparison': comp_row[col],
+                'match': bool(base_row[col] == comp_row[col])
+            }
+            for col in _df.columns if col != "uuid"
+        }
+
         detailed_analysis.append({
             'base_uuid': base_uuid,
             'comp_uuid': comp_uuid,
@@ -110,61 +122,118 @@ def analyze_clusters(df, similarity_threshold=0.9):
     return detailed_analysis

 # ================== STREAMLIT UI ==================
-st.title("🔍 Problem Deduplication Explorer")
-
-st.sidebar.header("Settings")
-similarity_threshold = st.sidebar.slider(
-    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
-)
-
-# Display first 5 rows of dataset
-st.subheader("📄 Explore the Dataset")
-st.dataframe(df.head(5))
-
-if st.sidebar.button("Run Deduplication Analysis"):
-    with st.spinner("Analyzing..."):
-        results = analyze_clusters(df, similarity_threshold)
-    st.success("Analysis Complete!")
-
-    st.subheader("📊 Duplicate Problem Pairs")
+def main():
+    st.title("🔍 Problem Deduplication Explorer")

-    # Filtering options
-    sources = df["source"].unique().tolist()
-    question_types = df["question_type"].unique().tolist()
+    # Check if model loaded successfully
+    if model is None:
+        st.error("Failed to load the model. Please try again later.")
+        return

-    selected_source = st.sidebar.selectbox("Filter by Source", [None] + sources)
-    selected_qtype = st.sidebar.selectbox("Filter by Question Type", [None] + question_types)
+    # Initialize session state for pagination
+    if 'page_number' not in st.session_state:
+        st.session_state.page_number = 0

-    if selected_source:
-        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
-    if selected_qtype:
-        results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
+    # Sidebar configuration
+    with st.sidebar:
+        st.header("Settings")
+        similarity_threshold = st.slider(
+            "Similarity Threshold",
+            min_value=0.5,
+            max_value=1.0,
+            value=0.9,
+            step=0.01,
+            help="Higher values mean more similar problems"
+        )
+
+        items_per_page = st.select_slider(
+            "Items per page",
+            options=[5, 10, 20, 50],
+            value=10,
+            help="Number of results to show per page"
+        )
+
+    # Load and display dataset
+    df = load_data()

-    # Display top 5 initially
-    num_display = 5
-    shown_results = results[:num_display]
+    if df.empty:
+        st.error("Failed to load the dataset. Please check if the data file exists in the correct location.")
+        return

-    for entry in shown_results:
-        base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
-        similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
+    with st.expander("📄 Dataset Preview", expanded=False):
+        st.dataframe(
+            df.head(),
+            use_container_width=True,
+            hide_index=True
+        )
+
+    # Analysis section
+    if st.sidebar.button("Run Deduplication Analysis", type="primary"):
+        progress_bar = st.progress(0, "Starting analysis...")

-        st.markdown(f"### Problem: {base_problem}")
-        st.write(f"**Similar to:** {similar_problem}")
-        st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
-        with st.expander("Show Column Differences"):
-            st.json(entry["column_differences"])
-        st.markdown("---")
-
-    if len(results) > num_display:
-        if st.button("Show More Results"):
-            extra_results = results[num_display:num_display * 2]
-            for entry in extra_results:
-                base_problem = df[df["uuid"] == entry["base_uuid"]]["problem"].values[0]
-                similar_problem = df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0]
+        # Run analysis
+        pairs = find_similar_problems(df, similarity_threshold, progress_bar)
+        results = analyze_clusters(df, pairs)
+
+        if not results:
+            st.warning("No similar problems found with the current threshold.")
+            return
+
+        # Filtering options
+        sources = sorted(df["source"].unique().tolist())
+        question_types = sorted(df["question_type"].unique().tolist())
+
+        col1, col2 = st.columns(2)
+        with col1:
+            selected_source = st.selectbox("Filter by Source", [None] + sources)
+        with col2:
+            selected_qtype = st.selectbox("Filter by Question Type", [None] + question_types)
+
+        # Apply filters
+        if selected_source:
+            results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["source"].values[0] == selected_source]
+        if selected_qtype:
+            results = [r for r in results if df[df["uuid"] == r["base_uuid"]]["question_type"].values[0] == selected_qtype]
+
+        if not results:
+            st.warning("No results found with the current filters.")
+            return
+
+        # Pagination
+        total_pages = len(results) // items_per_page
+
+        col1, col2, col3 = st.columns([1, 3, 1])
+        with col1:
+            if st.button("← Previous", disabled=st.session_state.page_number <= 0):
+                st.session_state.page_number -= 1
+        with col2:
+            st.write(f"Page {st.session_state.page_number + 1} of {total_pages + 1}")
+        with col3:
+            if st.button("Next →", disabled=st.session_state.page_number >= total_pages):
+                st.session_state.page_number += 1
+
+        # Display results
+        start_idx = st.session_state.page_number * items_per_page
+        end_idx = start_idx + items_per_page
+        page_results = results[start_idx:end_idx]
+
+        for entry in page_results:
+            with st.container():
+                col1, col2 = st.columns([1, 1])
+
+                with col1:
+                    st.markdown("### Original Problem")
+                    st.info(df[df["uuid"] == entry["base_uuid"]]["problem"].values[0])
+
+                with col2:
+                    st.markdown("### Similar Problem")
+                    st.info(df[df["uuid"] == entry["comp_uuid"]]["problem"].values[0])

-                st.markdown(f"### Problem: {base_problem}")
-                st.write(f"**Similar to:** {similar_problem}")
-                st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
-                with st.expander("Show Column Differences"):
+                st.metric("Similarity Score", f"{entry['similarity_score']:.4f}")
+
+                with st.expander("Show Details"):
                     st.json(entry["column_differences"])
                     st.markdown("---")
+
+if __name__ == "__main__":
+    main()
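
For readers who want to sanity-check the pair-finding step outside Streamlit, here is a minimal standalone sketch of the same idea (normalized embeddings, cosine similarity, upper-triangle thresholding). It is not part of the commit; the toy problem list and the 0.9 cutoff are illustrative assumptions, and only the model name matches the app.

# Standalone sketch, not part of the commit: exercises the same
# embeddings -> cosine similarity -> upper-triangle threshold logic on toy data.
import numpy as np
from sentence_transformers import SentenceTransformer, util

problems = [
    "What is 2 + 2?",
    "Compute the sum of 2 and 2.",
    "Prove that sqrt(2) is irrational.",
]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(problems, normalize_embeddings=True)

similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
i_idx, j_idx = np.triu_indices(len(problems), k=1)  # each unordered pair once
scores = similarity_matrix[i_idx, j_idx]
mask = scores > 0.9  # same default threshold as the app's slider

pairs = sorted(
    ((int(i), int(j), float(s)) for i, j, s in zip(i_idx[mask], j_idx[mask], scores[mask])),
    key=lambda x: x[2],
    reverse=True,
)
print(pairs)  # e.g. [(0, 1, ...)] if the first two problems exceed the threshold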