Update app.py
app.py CHANGED
@@ -1,5 +1,6 @@
 import pandas as pd
 import gradio as gr
+import numpy as np
 from retriever import get_relevant_passages
 from reranker import rerank
 
@@ -11,7 +12,7 @@ def clean_df(df):
     print(f"Original columns: {df.columns}")
 
     # Ensure clean URLs from the second column
-    second_col = df.iloc[:,
+    second_col = df.iloc[:, 3].astype(str)  # Pre-packaged Job Solutions column
 
     if second_col.str.contains('http').any() or second_col.str.contains('www').any():
         df["url"] = second_col  # Already has full URLs
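One thing worth flagging in the hunk above: the comment still says "second column", but `df.iloc[:, 3]` selects by position and therefore reads the fourth column of the raw frame (presumably where the assessment links live; the source CSV's layout isn't visible in this diff). A tiny positional-indexing reminder, using an invented frame:

import pandas as pd

df = pd.DataFrame([["A", "B", "C", "/products/x"]], columns=["c0", "c1", "c2", "c3"])  # hypothetical layout
print(df.iloc[:, 3].tolist())  # ['/products/x'] - fourth column, counted from zero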
@@ -20,18 +21,18 @@ def clean_df(df):
         df["url"] = "https://www.shl.com" + second_col.str.replace(r'^(?!/)', '/', regex=True)
 
     # Map T/F to Yes/No for remote testing and adaptive support
-    df["remote_support"] = df.iloc[:,
-    df["adaptive_support"] = df.iloc[:,
+    df["remote_support"] = df.iloc[:, 4].map(lambda x: "Yes" if x == "T" else "No")
+    df["adaptive_support"] = df.iloc[:, 5].map(lambda x: "Yes" if x == "T" else "No")
 
     # Handle test_type properly - convert string representation of list to actual list
-    df["test_type"] = df.iloc[:,
+    df["test_type"] = df.iloc[:, 6].apply(lambda x: eval(x) if isinstance(x, str) else x)
 
     # Get description from column 7
-    df["description"] = df.iloc[:,
+    df["description"] = df.iloc[:, 7]
 
     # Extract duration with error handling from column 10
     df["duration"] = pd.to_numeric(
-        df.iloc[:,
+        df.iloc[:, 10].astype(str).str.extract(r'(\d+)')[0],
         errors='coerce'
     )
 
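Two expressions in this hunk are easy to misread. The context line `str.replace(r'^(?!/)', '/', regex=True)` inserts a leading slash only when one is missing, so relative paths concatenate cleanly onto the site root, and `pd.to_numeric(... .str.extract(r'(\d+)')[0], errors='coerce')` pulls the first run of digits out of free-text durations and turns everything else into NaN. A small sketch of both, with invented values:

import pandas as pd

paths = pd.Series(["products/assessment-a", "/products/assessment-b"])  # hypothetical relative paths
print(("https://www.shl.com" + paths.str.replace(r'^(?!/)', '/', regex=True)).tolist())
# ['https://www.shl.com/products/assessment-a', 'https://www.shl.com/products/assessment-b']

raw = pd.Series(["30 minutes", "max 45", "untimed"])  # hypothetical duration strings
print(pd.to_numeric(raw.astype(str).str.extract(r'(\d+)')[0], errors='coerce').tolist())
# [30.0, 45.0, nan]

The `eval` on the test_type column assumes those strings are trusted Python list literals; if that assumption ever needs hardening, `ast.literal_eval` is the usual safer drop-in.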
@@ -84,15 +85,19 @@ def recommend(query):
         # Print some debug info
         print(f"Processing query: {query[:50]}...")
 
-        # Get relevant passages
-        top_k_df = get_relevant_passages(query, df_clean, top_k=
+        # Get relevant passages with increased diversity (more candidates)
+        top_k_df = get_relevant_passages(query, df_clean, top_k=30)
 
         # Debug: Check if we got any results
         print(f"Retrieved {len(top_k_df)} assessments")
 
         if top_k_df.empty:
             return {"error": "No matching assessments found"}
-
+
+        # Remove duplicates by URL - keep the first occurrence (highest ranked)
+        top_k_df = top_k_df.drop_duplicates(subset=['url'])
+        print(f"After deduplication: {len(top_k_df)} unique assessments")
+
         # Convert test_type to list if it's not already
         top_k_df['test_type'] = top_k_df['test_type'].apply(
             lambda x: x if isinstance(x, list) else
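The deduplication added here relies on row order: `drop_duplicates(subset=['url'])` keeps the first occurrence, so assuming `get_relevant_passages` returns candidates sorted best-first, the highest-ranked row for each URL survives. A toy illustration (the score column is invented for the example):

import pandas as pd

ranked = pd.DataFrame({
    "url": ["https://www.shl.com/a", "https://www.shl.com/b", "https://www.shl.com/a"],
    "score": [0.92, 0.88, 0.75],  # hypothetical relevance, best first
})
print(ranked.drop_duplicates(subset=["url"]))
# keeps rows 0 and 1; the lower-scoring duplicate of /a is dropped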
@@ -103,6 +108,15 @@ def recommend(query):
         top_k_df['duration'] = top_k_df['duration'].fillna(-1).astype(int)
         top_k_df.loc[top_k_df['duration'] == -1, 'duration'] = None
 
+        # Add a diverse set of assessment types - ensure we have different test types represented
+        test_types = set()
+        for _, row in top_k_df.iterrows():
+            if isinstance(row['test_type'], list):
+                for test_type in row['test_type']:
+                    test_types.add(test_type)
+
+        print(f"Found assessments covering {len(test_types)} different test types")
+
         # Convert DataFrame to list of dictionaries
         candidates = top_k_df.to_dict(orient="records")
 
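The loop added above only feeds the debug print that follows it. Assuming every test_type cell is either a list or a scalar (which the earlier clean-up step should guarantee), the same set can be collected with a comprehension; this is just an equivalent phrasing, not a change the diff makes:

import pandas as pd

top_k_df = pd.DataFrame({"test_type": [["K", "P"], ["K"], "S"]})  # hypothetical mix of lists and scalars
test_types = {t for cell in top_k_df["test_type"] if isinstance(cell, list) for t in cell}
print(test_types)  # {'K', 'P'} (set order may vary)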
@@ -116,10 +130,22 @@ def recommend(query):
         # Get recommendations
         result = rerank(query, candidates)
 
-        # Post-process result
+        # Post-process result to ensure no duplicates
         if 'recommended_assessments' in result:
-
-
+            recommendations = result['recommended_assessments']
+
+            # Deduplicate by URL
+            unique_urls = set()
+            unique_recommendations = []
+
+            for rec in recommendations:
+                if rec['url'] not in unique_urls:
+                    unique_urls.add(rec['url'])
+                    unique_recommendations.append(rec)
+
+            # Validate URLs
+            result['recommended_assessments'] = validate_and_fix_urls(unique_recommendations)
+            print(f"Returning {len(result['recommended_assessments'])} unique recommended assessments")
 
         return result
     except Exception as e:
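`validate_and_fix_urls` is called in the new post-processing block but is not part of this diff, so its behaviour has to be assumed; presumably it is defined elsewhere in app.py. A rough sketch of the kind of normalisation such a helper might perform, offered purely as an assumption about intent rather than the Space's actual implementation:

def validate_and_fix_urls(recommendations):
    # Assumed helper: make sure every recommendation carries an absolute SHL URL.
    fixed = []
    for rec in recommendations:
        url = str(rec.get("url", ""))
        if url and not url.startswith("http"):
            # Relative path: prepend the site root (assumption about the intended fix)
            url = "https://www.shl.com" + (url if url.startswith("/") else "/" + url)
        rec["url"] = url
        fixed.append(rec)
    return fixed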