Spaces:
Sleeping
Sleeping
File size: 5,342 Bytes
d93bcf7 3ed9ca7 d93bcf7 3ed9ca7 d93bcf7 3ed9ca7 d93bcf7 3ed9ca7 d93bcf7 3ed9ca7 d93bcf7 08dabce d93bcf7 08dabce d93bcf7 08dabce d93bcf7 08dabce d93bcf7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import ast

import gradio as gr
import pandas as pd

from retriever import get_relevant_passages
from reranker import rerank
# Load and clean CSV
def clean_df(df):
    """Normalise the raw assessment catalogue into the columns the app uses.

    Source columns are read by position (0-based indices 2, 3, 4, 5, 6 and
    9), so the input needs at least ten columns in that order.
    NOTE(review): the meaning of each position is taken from the original
    inline comments; confirm against the actual CSV header.

    Args:
        df: Raw catalogue as loaded from the CSV. It is copied, not modified.

    Returns:
        A new DataFrame with the columns ``url``, ``adaptive_support``,
        ``remote_support``, ``description``, ``duration`` and ``test_type``.
    """
    output_columns = ["url", "adaptive_support", "remote_support",
                      "description", "duration", "test_type"]

    df = df.copy()
    # Printed for reference because everything below is positional.
    print(f"Original columns: {df.columns}")

    # Position 2 (the third column, "Pre-packaged Job Solutions" per the
    # original comment) holds either full URLs or bare paths/IDs.
    link_col = df.iloc[:, 2].astype(str)
    if link_col.str.contains('http').any() or link_col.str.contains('www').any():
        # At least one value already looks like a URL: use the column as is.
        df["url"] = link_col
    else:
        # Build absolute URLs, adding the leading slash only where missing.
        has_slash = link_col.str.startswith('/', na=False)
        df["url"] = "https://www.shl.com" + link_col.where(has_slash, "/" + link_col)

    # Only the exact string "T" counts as "Yes"; anything else is "No".
    df["remote_support"] = df.iloc[:, 3].map(lambda x: "Yes" if x == "T" else "No")
    df["adaptive_support"] = df.iloc[:, 4].map(lambda x: "Yes" if x == "T" else "No")

    # test_type is stored as the text form of a list, e.g. "['K', 'P']".
    df["test_type"] = df.iloc[:, 5].apply(_parse_test_type)

    df["description"] = df.iloc[:, 6]

    # Duration is the first run of digits in position 9; rows without any
    # digits end up as NaN.
    df["duration"] = pd.to_numeric(
        df.iloc[:, 9].astype(str).str.extract(r'(\d+)')[0],
        errors='coerce'
    )

    # Print sample of cleaned data for debugging
    print(f"Sample of cleaned data: {df[['url', 'adaptive_support', 'remote_support', 'description', 'duration', 'test_type']].head(2)}")
    return df[output_columns]


def _parse_test_type(value):
    """Turn the text form of a Python literal into the literal itself.

    Non-string values are returned unchanged. ``ast.literal_eval`` is used
    instead of ``eval`` so that cell contents can never execute code; text
    that is not a valid literal is wrapped in a one-element list rather than
    aborting the whole load.
    """
    if not isinstance(value, str):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        return [value]
# Build the catalogue once at import time. Any failure while reading or
# cleaning the CSV is reported on stdout and replaced by an empty frame with
# the expected columns, so the rest of the module can still be imported.
try:
    df = pd.read_csv("assesments.csv", encoding="utf-8")
    print(f"CSV loaded successfully with {len(df)} rows")
    df_clean = clean_df(df)
except Exception as load_error:
    print(f"Error loading or cleaning data: {load_error}")
    df_clean = pd.DataFrame(
        columns=[
            "url",
            "adaptive_support",
            "remote_support",
            "description",
            "duration",
            "test_type",
        ]
    )
def validate_and_fix_urls(candidates):
    """Ensure every candidate dict carries an absolute URL.

    Entries that are not dicts are skipped. Dicts are updated in place:

    * a missing, empty or NaN ``url`` becomes a placeholder link;
    * a value already starting with ``http://`` or ``https://`` is left
      untouched;
    * a value starting with ``www.`` only gets the ``https://`` scheme;
    * anything else (paths, bare IDs, numbers) is appended to the SHL host.

    Args:
        candidates: Iterable of candidate records, normally dicts.

    Returns:
        The same ``candidates`` object that was passed in.
    """
    base = "https://www.shl.com"
    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue

        raw = candidate.get('url')
        # NaN is truthy, so it needs its own check (it is the only value
        # that is not equal to itself); otherwise it would turn into
        # ".../nan".
        is_nan = isinstance(raw, float) and raw != raw
        if not raw or is_nan:
            candidate['url'] = f"{base}/missing-url"
            continue

        url = str(raw)
        if url.startswith(('http://', 'https://')):
            continue

        if url.lower().startswith('www.'):
            # Already contains a host: prefixing the SHL host would yield
            # "https://www.shl.com/www.shl.com/...".
            candidate['url'] = f"https://{url}"
        elif url.startswith('/'):
            candidate['url'] = f"{base}{url}"
        else:
            candidate['url'] = f"{base}/{url}"
    return candidates
def recommend(query):
    """Return reranked assessment recommendations for a job description.

    Retrieves up to 20 candidate rows from ``df_clean`` with
    ``get_relevant_passages``, converts them to plain dicts, repairs their
    URLs and hands them to ``rerank``.

    Args:
        query: Free-text job description. ``None``, empty and
            whitespace-only input are rejected.

    Returns:
        Whatever ``rerank`` returns (with the URLs under
        ``recommended_assessments`` repaired when that key is present), or
        a dict of the form ``{"error": "<message>"}`` on bad input, an
        empty retrieval, or any exception raised along the way.
    """
    if not query or not query.strip():
        return {"error": "Please enter a job description"}
    try:
        print(f"Processing query: {query[:50]}...")

        top_k_df = get_relevant_passages(query, df_clean, top_k=20)
        print(f"Retrieved {len(top_k_df)} assessments")
        if top_k_df.empty:
            return {"error": "No matching assessments found"}

        # Normalise on the exported dicts rather than on the DataFrame: the
        # frame may be a view of df_clean, and writing None into an integer
        # column would turn it back into float/NaN.
        candidates = top_k_df.to_dict(orient="records")
        for candidate in candidates:
            candidate['test_type'] = _as_type_list(candidate['test_type'])
            candidate['duration'] = _as_duration(candidate['duration'])

        candidates = validate_and_fix_urls(candidates)

        if candidates:
            print(f"Sample candidate being sent to reranker: {candidates[0]}")

        result = rerank(query, candidates)

        # The reranker may hand back URLs in its own form, so repair again.
        if 'recommended_assessments' in result:
            result['recommended_assessments'] = validate_and_fix_urls(result['recommended_assessments'])
            print(f"Returning {len(result['recommended_assessments'])} recommended assessments")
        return result
    except Exception as e:
        # UI boundary: log the traceback and show a short message instead
        # of letting the exception reach the interface.
        import traceback
        error_details = traceback.format_exc()
        print(f"Error: {str(e)}\n{error_details}")
        return {"error": f"Error processing request: {str(e)}"}


def _as_type_list(value):
    """Coerce a ``test_type`` cell to a list.

    Lists pass through. Strings that look like a list literal are parsed
    with ``ast.literal_eval`` (never ``eval``); if parsing fails, or for
    any other value, the result is a one-element list of ``str(value)``.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.startswith('['):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            return [value]
        return parsed if isinstance(parsed, list) else [value]
    return [str(value)]


def _as_duration(value):
    """Return the duration as an ``int``, or ``None`` when it is missing."""
    if value is None or pd.isna(value):
        return None
    return int(value)
# Gradio front end: one multi-line text box in, the dict returned by
# recommend() rendered as JSON out.
iface = gr.Interface(
    fn=recommend,
    inputs=gr.Textbox(label="Enter Job Description", lines=4),
    outputs="json",
    title="SHL Assessment Recommender",
    description="Paste a job description to get the most relevant SHL assessments.",
)

if __name__ == "__main__":
    # Launch only when executed as a script, so importing this module does
    # not start the interface.
    iface.launch()