File size: 4,379 Bytes
8b6aa48
 
9e9d5ee
 
8b6aa48
 
9e9d5ee
 
bef0a51
d56d4c8
 
bef0a51
d56d4c8
 
bef0a51
d56d4c8
 
 
bef0a51
d56d4c8
 
bef0a51
d56d4c8
 
bef0a51
d56d4c8
 
 
 
 
 
8b6aa48
bef0a51
 
 
 
 
 
 
 
8b6aa48
cbb8b01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e9d5ee
bef0a51
 
 
 
cbb8b01
 
d56d4c8
 
 
 
 
 
 
 
 
cbb8b01
bef0a51
cbb8b01
d56d4c8
cbb8b01
 
 
 
bef0a51
cbb8b01
 
 
 
 
 
 
bef0a51
cbb8b01
 
 
 
 
bef0a51
 
cbb8b01
 
 
bef0a51
8b6aa48
d56d4c8
8b6aa48
9e9d5ee
8b6aa48
 
9e9d5ee
 
8b6aa48
 
 
bef0a51
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd
import gradio as gr
from retriever import get_relevant_passages
from reranker import rerank

# Load and clean CSV
def clean_df(df):
    """Normalize the raw SHL catalogue export into the columns the app uses.

    Expected input columns: "Pre-packaged Job Solutions" (site-relative URL
    path), "Remote Testing" and "Adaptive/IRT" ("T"/"F" flags), "Test Type",
    "Description", and "Assessment_Length" (free text containing a minutes
    figure).

    Returns a new DataFrame with exactly: assessment_name, url,
    remote_support, adaptive_support, description, duration, test_type.
    The input DataFrame is not modified.
    """
    df = df.copy()

    # The second-to-last path segment is the slug (paths end with '/'),
    # e.g. ".../view/python-developer/" -> "Python Developer".
    df["assessment_name"] = (
        df["Pre-packaged Job Solutions"]
        .str.split('/').str[-2]
        .str.replace('-', ' ')
        .str.title()
    )

    # The export stores site-relative paths; prepend the host to get full URLs.
    df["url"] = "https://www.shl.com" + df["Pre-packaged Job Solutions"]

    # "T"/"F" flags -> human-readable Yes/No (any non-"T" value counts as No).
    df["remote_support"] = df["Remote Testing"].map(lambda x: "Yes" if x == "T" else "No")
    df["adaptive_support"] = df["Adaptive/IRT"].map(lambda x: "Yes" if x == "T" else "No")

    # Test type is already a list-formatted string in the export; pass through.
    df["test_type"] = df["Test Type"]

    # Keep the description as is.
    df["description"] = df["Description"]

    # First run of digits is the duration in minutes; "N/A" when absent.
    # BUG FIX: expand=False makes extract() return a Series. The default
    # (expand=True) returns a one-column DataFrame, and assigning that to a
    # single column is fragile/inconsistent across pandas versions.
    df["duration"] = df["Assessment_Length"].str.extract(r'(\d+)', expand=False).fillna("N/A")

    # Select only the columns we need, in the order downstream code expects.
    return df[["assessment_name", "url", "remote_support", "adaptive_support",
               "description", "duration", "test_type"]]

# Load the assessment catalogue once at import time.
try:
    # NOTE: filename is misspelled on disk ("assesments.csv"); keep in sync
    # with the actual data file rather than "fixing" the spelling here.
    df = pd.read_csv("assesments.csv")
    df_clean = clean_df(df)
except Exception as e:
    # Boundary handler: keep the module importable even if the data file is
    # missing or malformed; recommend() then reports "No assessment data
    # available" because df_clean is empty.
    print(f"Error loading or cleaning data: {e}")
    # CONSISTENCY FIX: the fallback schema must mirror clean_df()'s output
    # columns — "assessment_name" was previously missing from this list.
    df_clean = pd.DataFrame(columns=["assessment_name", "url", "remote_support",
                                     "adaptive_support", "description",
                                     "duration", "test_type"])

def validate_and_fix_urls(candidates):
    """Normalize the 'url' field of each candidate dict, in place.

    Missing/empty URLs get a placeholder; bare numeric IDs become catalogue
    paths under shl.com; scheme-less URLs get https:// prepended.
    Returns the same list object for convenient chaining.
    """
    base = 'https://www.shl.com'

    for entry in candidates:
        raw = entry.get('url')

        # No usable URL at all -> placeholder.
        if not raw:
            entry['url'] = f"{base}/missing-url"
            continue

        raw = str(raw)
        tail = raw[len(base):] if raw.startswith(base) else raw

        # A bare numeric ID (optionally already prefixed with the host,
        # but missing the separating slash) -> rebuild as a proper path.
        if raw.isdigit() or (raw.startswith(base) and tail.isdigit()):
            entry['url'] = f"{base}/{raw.replace(base, '')}"
        # Scheme-less URL -> assume https.
        elif not raw.startswith(('http://', 'https://')):
            entry['url'] = f"https://{raw}"

    return candidates

def recommend(query):
    """Return SHL assessment recommendations for a job-description string.

    Pipeline: retrieve top-20 candidates from df_clean, normalize their URLs,
    rerank them, then validate URLs on the final list. Returns a dict that is
    either the reranker output or {"error": ...} on bad input/failure.
    """
    # Guard clause: reject blank queries up front.
    if not query.strip():
        return {"error": "Please enter a job description"}

    try:
        # Debug trace of the incoming query and catalogue state.
        print(f"Processing query: {query[:50]}...")
        print(f"DataFrame shape: {df_clean.shape}")
        print(f"DataFrame columns: {df_clean.columns.tolist()}")

        if df_clean.empty:
            return {"error": "No assessment data available"}

        print("Sample row:")
        print(df_clean.iloc[0].to_dict())

        top_k_df = get_relevant_passages(query, df_clean, top_k=20)

        # Debug trace of the retrieval step.
        print(f"Retrieved {len(top_k_df)} assessments")
        if not top_k_df.empty:
            print(f"Sample URLs from retrieval: {top_k_df['url'].iloc[:3].tolist()}")

        candidates = top_k_df.to_dict(orient="records")

        # Normalize scheme-less URLs before handing off to the reranker.
        for item in candidates:
            if 'url' in item and not str(item['url']).startswith(('http://', 'https://')):
                item['url'] = f"https://www.shl.com/{str(item['url']).lstrip('/')}"

        result = rerank(query, candidates)

        # Final pass: ensure every recommended assessment has a usable URL.
        if 'recommended_assessments' in result:
            result['recommended_assessments'] = validate_and_fix_urls(result['recommended_assessments'])

        return result
    except Exception as e:
        # Top-level boundary: log the full traceback, return a JSON-able error.
        import traceback
        print(f"Error: {str(e)}\n{traceback.format_exc()}")
        return {"error": f"Error processing request: {str(e)}"}


# UI wiring: one free-text box in, raw JSON out.
_query_box = gr.Textbox(label="Enter Job Description", lines=4)

iface = gr.Interface(
    fn=recommend,
    inputs=_query_box,
    outputs="json",
    title="SHL Assessment Recommender",
    description="Paste a job description to get the most relevant SHL assessments.",
)

if __name__ == "__main__":
    # Start the server only when run as a script, not on import.
    iface.launch()