Spaces:

AnshulS
/

endpointwebappshl

Sleeping

App Files Files Community

AnshulS committed on May 9

Commit

85c1934

verified ·

1 Parent(s): e5d19f1

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -12

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import pandas as pd
 import gradio as gr
 from retriever import get_relevant_passages
 from reranker import rerank
@@ -11,7 +12,7 @@ def clean_df(df):
     print(f"Original columns: {df.columns}")
     # Ensure clean URLs from the second column
-    second_col = df.iloc[:, 2].astype(str)  # Pre-packaged Job Solutions column
     if second_col.str.contains('http').any() or second_col.str.contains('www').any():
         df["url"] = second_col  # Already has full URLs
@@ -20,18 +21,18 @@ def clean_df(df):
         df["url"] = "https://www.shl.com" + second_col.str.replace(r'^(?!/)', '/', regex=True)
     # Map T/F to Yes/No for remote testing and adaptive support
-    df["remote_support"] = df.iloc[:, 3].map(lambda x: "Yes" if x == "T" else "No")
-    df["adaptive_support"] = df.iloc[:, 4].map(lambda x: "Yes" if x == "T" else "No")
     # Handle test_type properly - convert string representation of list to actual list
-    df["test_type"] = df.iloc[:, 5].apply(lambda x: eval(x) if isinstance(x, str) else x)
     # Get description from column 7
-    df["description"] = df.iloc[:, 6]
     # Extract duration with error handling from column 10
     df["duration"] = pd.to_numeric(
-        df.iloc[:, 9].astype(str).str.extract(r'(\d+)')[0],
         errors='coerce'
     )
@@ -84,15 +85,19 @@ def recommend(query):
         # Print some debug info
         print(f"Processing query: {query[:50]}...")
-        # Get relevant passages
-        top_k_df = get_relevant_passages(query, df_clean, top_k=20)
         # Debug: Check if we got any results
         print(f"Retrieved {len(top_k_df)} assessments")
         if top_k_df.empty:
             return {"error": "No matching assessments found"}
         # Convert test_type to list if it's not already
         top_k_df['test_type'] = top_k_df['test_type'].apply(
             lambda x: x if isinstance(x, list) else
@@ -103,6 +108,15 @@ def recommend(query):
         top_k_df['duration'] = top_k_df['duration'].fillna(-1).astype(int)
         top_k_df.loc[top_k_df['duration'] == -1, 'duration'] = None
         # Convert DataFrame to list of dictionaries
         candidates = top_k_df.to_dict(orient="records")
@@ -116,10 +130,22 @@ def recommend(query):
         # Get recommendations
         result = rerank(query, candidates)
-        # Post-process result
         if 'recommended_assessments' in result:
-            result['recommended_assessments'] = validate_and_fix_urls(result['recommended_assessments'])
-            print(f"Returning {len(result['recommended_assessments'])} recommended assessments")
         return result
     except Exception as e:

 import pandas as pd
 import gradio as gr
+import numpy as np
 from retriever import get_relevant_passages
 from reranker import rerank
     print(f"Original columns: {df.columns}")
     # Ensure clean URLs from the second column
+    second_col = df.iloc[:, 3].astype(str)  # Pre-packaged Job Solutions column
     if second_col.str.contains('http').any() or second_col.str.contains('www').any():
         df["url"] = second_col  # Already has full URLs
         df["url"] = "https://www.shl.com" + second_col.str.replace(r'^(?!/)', '/', regex=True)
     # Map T/F to Yes/No for remote testing and adaptive support
+    df["remote_support"] = df.iloc[:, 4].map(lambda x: "Yes" if x == "T" else "No")
+    df["adaptive_support"] = df.iloc[:, 5].map(lambda x: "Yes" if x == "T" else "No")
     # Handle test_type properly - convert string representation of list to actual list
+    df["test_type"] = df.iloc[:, 6].apply(lambda x: eval(x) if isinstance(x, str) else x)
     # Get description from column 7
+    df["description"] = df.iloc[:, 7]
     # Extract duration with error handling from column 10
     df["duration"] = pd.to_numeric(
+        df.iloc[:, 10].astype(str).str.extract(r'(\d+)')[0],
         errors='coerce'
     )
         # Print some debug info
         print(f"Processing query: {query[:50]}...")
+        # Get relevant passages with increased diversity (more candidates)
+        top_k_df = get_relevant_passages(query, df_clean, top_k=30)
         # Debug: Check if we got any results
         print(f"Retrieved {len(top_k_df)} assessments")
         if top_k_df.empty:
             return {"error": "No matching assessments found"}
+        # Remove duplicates by URL - keep the first occurrence (highest ranked)
+        top_k_df = top_k_df.drop_duplicates(subset=['url'])
+        print(f"After deduplication: {len(top_k_df)} unique assessments")
         # Convert test_type to list if it's not already
         top_k_df['test_type'] = top_k_df['test_type'].apply(
             lambda x: x if isinstance(x, list) else
         top_k_df['duration'] = top_k_df['duration'].fillna(-1).astype(int)
         top_k_df.loc[top_k_df['duration'] == -1, 'duration'] = None
+        # Add a diverse set of assessment types - ensure we have different test types represented
+        test_types = set()
+        for _, row in top_k_df.iterrows():
+            if isinstance(row['test_type'], list):
+                for test_type in row['test_type']:
+                    test_types.add(test_type)
+        print(f"Found assessments covering {len(test_types)} different test types")
         # Convert DataFrame to list of dictionaries
         candidates = top_k_df.to_dict(orient="records")
         # Get recommendations
         result = rerank(query, candidates)
+        # Post-process result to ensure no duplicates
         if 'recommended_assessments' in result:
+            recommendations = result['recommended_assessments']
+            # Deduplicate by URL
+            unique_urls = set()
+            unique_recommendations = []
+            for rec in recommendations:
+                if rec['url'] not in unique_urls:
+                    unique_urls.add(rec['url'])
+                    unique_recommendations.append(rec)
+            # Validate URLs
+            result['recommended_assessments'] = validate_and_fix_urls(unique_recommendations)
+            print(f"Returning {len(result['recommended_assessments'])} unique recommended assessments")
         return result
     except Exception as e: