AnshulS committed on
Commit
85c1934
·
verified ·
1 Parent(s): e5d19f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -12
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import pandas as pd
2
  import gradio as gr
 
3
  from retriever import get_relevant_passages
4
  from reranker import rerank
5
 
@@ -11,7 +12,7 @@ def clean_df(df):
11
  print(f"Original columns: {df.columns}")
12
 
13
  # Ensure clean URLs from the second column
14
- second_col = df.iloc[:, 2].astype(str) # Pre-packaged Job Solutions column
15
 
16
  if second_col.str.contains('http').any() or second_col.str.contains('www').any():
17
  df["url"] = second_col # Already has full URLs
@@ -20,18 +21,18 @@ def clean_df(df):
20
  df["url"] = "https://www.shl.com" + second_col.str.replace(r'^(?!/)', '/', regex=True)
21
 
22
  # Map T/F to Yes/No for remote testing and adaptive support
23
- df["remote_support"] = df.iloc[:, 3].map(lambda x: "Yes" if x == "T" else "No")
24
- df["adaptive_support"] = df.iloc[:, 4].map(lambda x: "Yes" if x == "T" else "No")
25
 
26
  # Handle test_type properly - convert string representation of list to actual list
27
- df["test_type"] = df.iloc[:, 5].apply(lambda x: eval(x) if isinstance(x, str) else x)
28
 
29
  # Get description from column 7
30
- df["description"] = df.iloc[:, 6]
31
 
32
  # Extract duration with error handling from column 10
33
  df["duration"] = pd.to_numeric(
34
- df.iloc[:, 9].astype(str).str.extract(r'(\d+)')[0],
35
  errors='coerce'
36
  )
37
 
@@ -84,15 +85,19 @@ def recommend(query):
84
  # Print some debug info
85
  print(f"Processing query: {query[:50]}...")
86
 
87
- # Get relevant passages
88
- top_k_df = get_relevant_passages(query, df_clean, top_k=20)
89
 
90
  # Debug: Check if we got any results
91
  print(f"Retrieved {len(top_k_df)} assessments")
92
 
93
  if top_k_df.empty:
94
  return {"error": "No matching assessments found"}
95
-
 
 
 
 
96
  # Convert test_type to list if it's not already
97
  top_k_df['test_type'] = top_k_df['test_type'].apply(
98
  lambda x: x if isinstance(x, list) else
@@ -103,6 +108,15 @@ def recommend(query):
103
  top_k_df['duration'] = top_k_df['duration'].fillna(-1).astype(int)
104
  top_k_df.loc[top_k_df['duration'] == -1, 'duration'] = None
105
 
 
 
 
 
 
 
 
 
 
106
  # Convert DataFrame to list of dictionaries
107
  candidates = top_k_df.to_dict(orient="records")
108
 
@@ -116,10 +130,22 @@ def recommend(query):
116
  # Get recommendations
117
  result = rerank(query, candidates)
118
 
119
- # Post-process result
120
  if 'recommended_assessments' in result:
121
- result['recommended_assessments'] = validate_and_fix_urls(result['recommended_assessments'])
122
- print(f"Returning {len(result['recommended_assessments'])} recommended assessments")
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  return result
125
  except Exception as e:
 
1
  import pandas as pd
2
  import gradio as gr
3
+ import numpy as np
4
  from retriever import get_relevant_passages
5
  from reranker import rerank
6
 
 
12
  print(f"Original columns: {df.columns}")
13
 
14
  # Ensure clean URLs from the second column
15
+ second_col = df.iloc[:, 3].astype(str) # Pre-packaged Job Solutions column
16
 
17
  if second_col.str.contains('http').any() or second_col.str.contains('www').any():
18
  df["url"] = second_col # Already has full URLs
 
21
  df["url"] = "https://www.shl.com" + second_col.str.replace(r'^(?!/)', '/', regex=True)
22
 
23
  # Map T/F to Yes/No for remote testing and adaptive support
24
+ df["remote_support"] = df.iloc[:, 4].map(lambda x: "Yes" if x == "T" else "No")
25
+ df["adaptive_support"] = df.iloc[:, 5].map(lambda x: "Yes" if x == "T" else "No")
26
 
27
  # Handle test_type properly - convert string representation of list to actual list
28
+ df["test_type"] = df.iloc[:, 6].apply(lambda x: eval(x) if isinstance(x, str) else x)
29
 
30
  # Get description from column 7
31
+ df["description"] = df.iloc[:, 7]
32
 
33
  # Extract duration with error handling from column 10
34
  df["duration"] = pd.to_numeric(
35
+ df.iloc[:, 10].astype(str).str.extract(r'(\d+)')[0],
36
  errors='coerce'
37
  )
38
 
 
85
  # Print some debug info
86
  print(f"Processing query: {query[:50]}...")
87
 
88
+ # Get relevant passages with increased diversity (more candidates)
89
+ top_k_df = get_relevant_passages(query, df_clean, top_k=30)
90
 
91
  # Debug: Check if we got any results
92
  print(f"Retrieved {len(top_k_df)} assessments")
93
 
94
  if top_k_df.empty:
95
  return {"error": "No matching assessments found"}
96
+
97
+ # Remove duplicates by URL - keep the first occurrence (highest ranked)
98
+ top_k_df = top_k_df.drop_duplicates(subset=['url'])
99
+ print(f"After deduplication: {len(top_k_df)} unique assessments")
100
+
101
  # Convert test_type to list if it's not already
102
  top_k_df['test_type'] = top_k_df['test_type'].apply(
103
  lambda x: x if isinstance(x, list) else
 
108
  top_k_df['duration'] = top_k_df['duration'].fillna(-1).astype(int)
109
  top_k_df.loc[top_k_df['duration'] == -1, 'duration'] = None
110
 
111
+ # Add a diverse set of assessment types - ensure we have different test types represented
112
+ test_types = set()
113
+ for _, row in top_k_df.iterrows():
114
+ if isinstance(row['test_type'], list):
115
+ for test_type in row['test_type']:
116
+ test_types.add(test_type)
117
+
118
+ print(f"Found assessments covering {len(test_types)} different test types")
119
+
120
  # Convert DataFrame to list of dictionaries
121
  candidates = top_k_df.to_dict(orient="records")
122
 
 
130
  # Get recommendations
131
  result = rerank(query, candidates)
132
 
133
+ # Post-process result to ensure no duplicates
134
  if 'recommended_assessments' in result:
135
+ recommendations = result['recommended_assessments']
136
+
137
+ # Deduplicate by URL
138
+ unique_urls = set()
139
+ unique_recommendations = []
140
+
141
+ for rec in recommendations:
142
+ if rec['url'] not in unique_urls:
143
+ unique_urls.add(rec['url'])
144
+ unique_recommendations.append(rec)
145
+
146
+ # Validate URLs
147
+ result['recommended_assessments'] = validate_and_fix_urls(unique_recommendations)
148
+ print(f"Returning {len(result['recommended_assessments'])} unique recommended assessments")
149
 
150
  return result
151
  except Exception as e: