import pandas as pd
import gradio as gr
from retriever import get_relevant_passages
from reranker import rerank
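
# The two local modules above are not part of this file; the signatures below
# are a hedged sketch inferred from how the functions are called later, not a
# documented API:
#
#   get_relevant_passages(query: str, df: pd.DataFrame, top_k: int) -> pd.DataFrame
#       assumed: returns the top_k rows of df most relevant to the query
#   rerank(query: str, candidates: list[dict]) -> dict
#       assumed: returns a dict with a "recommended_assessments" list of candidates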
# Load and clean CSV
def clean_df(df):
    """Normalize the raw SHL assessments CSV into the columns the app uses."""
    df = df.copy()
    # Ensure clean URLs: check whether the second column holds full URLs or bare IDs
    second_col = df.iloc[:, 1].astype(str)
    if second_col.str.contains('http').any() or second_col.str.contains('www').any():
        df["url"] = second_col  # Already has full URLs
    else:
        # Create full URLs from IDs, stripping leading whitespace and slashes
        df["url"] = "https://www.shl.com/" + second_col.str.replace(r'^[\s/]*', '', regex=True)
    df["remote_support"] = df.iloc[:, 2].map(lambda x: "Yes" if x == "T" else "No")
    df["adaptive_support"] = df.iloc[:, 3].map(lambda x: "Yes" if x == "T" else "No")
    # Split test_type on newlines (pandas treats the multi-character pattern
    # "\\n" as the regular expression \n, which matches a newline)
    df["test_type"] = df.iloc[:, 4].astype(str).str.split("\\n")
    df["description"] = df.iloc[:, 5]
    # Extract the numeric duration, coercing unparseable values to NaN
    df["duration"] = pd.to_numeric(
        df.iloc[:, 8].astype(str).str.extract(r'(\d+)')[0],
        errors='coerce'
    )
    return df[["url", "adaptive_support", "remote_support", "description", "duration", "test_type"]]
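
# Illustrative example (hypothetical row values, not taken from the real CSV):
# a raw row like ["Some Test", "products/view/123", "T", "F", "K\nP",
# "Verbal reasoning", _, _, "25 mins"] would be cleaned to
#   url="https://www.shl.com/products/view/123", remote_support="Yes",
#   adaptive_support="No", test_type=["K", "P"], duration=25.0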
try:
    df = pd.read_csv("assesments.csv")
    df_clean = clean_df(df)
except Exception as e:
    print(f"Error loading or cleaning data: {e}")
    # Create an empty DataFrame with the required columns as a fallback
    df_clean = pd.DataFrame(columns=["url", "adaptive_support", "remote_support",
                                     "description", "duration", "test_type"])
def validate_and_fix_urls(candidates):
    """Validates and fixes URLs in candidate assessments."""
    for candidate in candidates:
        # Ensure a URL exists
        if 'url' not in candidate or not candidate['url']:
            candidate['url'] = 'https://www.shl.com/missing-url'
            continue
        url = str(candidate['url'])
        # Fix URLs that are just numbers (bare IDs, or an ID glued to the domain)
        if url.isdigit() or (url.startswith('https://www.shl.com') and url[len('https://www.shl.com'):].isdigit()):
            candidate['url'] = f"https://www.shl.com/{url.replace('https://www.shl.com', '')}"
            continue
        # Add a protocol if missing
        if not url.startswith(('http://', 'https://')):
            candidate['url'] = f"https://{url}"
    return candidates
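
# Illustrative calls (made-up values): {"url": "12345"} becomes
# {"url": "https://www.shl.com/12345"}, a schemeless {"url": "www.shl.com/x"}
# becomes {"url": "https://www.shl.com/x"}, and well-formed URLs pass through.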
def recommend(query):
    if not query.strip():
        return {"error": "Please enter a job description"}
    try:
        # Print some debug info
        print(f"Processing query: {query[:50]}...")
        top_k_df = get_relevant_passages(query, df_clean, top_k=20)
        # Debug: check URLs in the retrieved data
        print(f"Retrieved {len(top_k_df)} assessments")
        if not top_k_df.empty:
            print(f"Sample URLs from retrieval: {top_k_df['url'].iloc[:3].tolist()}")
        candidates = top_k_df.to_dict(orient="records")
        # Additional URL validation before sending candidates to the reranker
        for c in candidates:
            if 'url' in c and not str(c['url']).startswith(('http://', 'https://')):
                c['url'] = f"https://www.shl.com/{str(c['url']).lstrip('/')}"
        result = rerank(query, candidates)
        # Post-process the result to ensure URLs are properly formatted
        if 'recommended_assessments' in result:
            result['recommended_assessments'] = validate_and_fix_urls(result['recommended_assessments'])
        return result
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error: {str(e)}\n{error_details}")
        return {"error": f"Error processing request: {str(e)}"}
iface = gr.Interface(
    fn=recommend,
    inputs=gr.Textbox(label="Enter Job Description", lines=4),
    outputs="json",
    title="SHL Assessment Recommender",
    description="Paste a job description to get the most relevant SHL assessments."
)
if __name__ == "__main__":
    iface.launch()