File size: 4,082 Bytes
8b6aa48
 
9e9d5ee
 
8b6aa48
 
9e9d5ee
 
bef0a51
 
 
 
 
 
 
 
 
 
9e9d5ee
 
bef0a51
 
9e9d5ee
bef0a51
9e9d5ee
bef0a51
 
 
 
 
 
 
9e9d5ee
8b6aa48
bef0a51
 
 
 
 
 
 
 
8b6aa48
cbb8b01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e9d5ee
bef0a51
 
 
 
cbb8b01
 
 
bef0a51
cbb8b01
 
 
 
 
 
bef0a51
cbb8b01
 
 
 
 
 
 
bef0a51
cbb8b01
 
 
 
 
bef0a51
 
cbb8b01
 
 
bef0a51
8b6aa48
 
9e9d5ee
8b6aa48
 
9e9d5ee
 
8b6aa48
 
 
bef0a51
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pandas as pd
import gradio as gr
from retriever import get_relevant_passages
from reranker import rerank

# Load and clean CSV
def clean_df(df):
    """Normalise the raw assessment catalogue into the columns the app uses.

    Source columns are read by position, not by name: 1 = URL or ID,
    2 = remote-testing flag, 3 = adaptive flag, 4 = test types,
    5 = description, 8 = duration text.

    Args:
        df: Raw catalogue frame. It is not modified; a copy is edited.

    Returns:
        A new DataFrame with exactly the columns ``url``,
        ``adaptive_support``, ``remote_support``, ``description``,
        ``duration`` and ``test_type``, in that order.

    Raises:
        IndexError: If ``df`` has fewer than nine columns.
    """
    df = df.copy()

    # Column 1 holds either a full URL or a bare ID/path. The decision is made
    # per row so that a file mixing both forms still gets a usable link on
    # every row (a single column-wide check would leave the ID rows bare).
    second_col = df.iloc[:, 1].astype(str)
    looks_like_url = second_col.str.contains(r'http|www', regex=True).astype(bool)
    built_url = "https://www.shl.com/" + second_col.str.replace(r'^[\s/]*', '', regex=True)
    df["url"] = second_col.where(looks_like_url, built_url)

    # Only the exact string "T" counts as supported; anything else is "No".
    df["remote_support"] = df.iloc[:, 2].map(lambda x: "Yes" if x == "T" else "No")
    df["adaptive_support"] = df.iloc[:, 3].map(lambda x: "Yes" if x == "T" else "No")

    # The pattern is the two-character string backslash + n. pandas treats a
    # multi-character pattern as a regular expression, so this splits on
    # newline characters and yields a list of test types per row.
    df["test_type"] = df.iloc[:, 4].astype(str).str.split("\\n")

    df["description"] = df.iloc[:, 5]

    # Keep the first run of digits from the duration text; rows without any
    # digits become NaN instead of raising.
    df["duration"] = pd.to_numeric(
        df.iloc[:, 8].astype(str).str.extract(r'(\d+)')[0],
        errors='coerce'
    )

    return df[["url", "adaptive_support", "remote_support", "description", "duration", "test_type"]]

# Build the cleaned catalogue once at import time. If reading or cleaning
# fails for any reason, report it and continue with an empty frame that still
# has the expected columns, so the rest of the module can be imported.
try:
    df = pd.read_csv("assesments.csv")
    df_clean = clean_df(df)
except Exception as exc:
    print(f"Error loading or cleaning data: {exc}")
    df_clean = pd.DataFrame(
        columns=[
            "url",
            "adaptive_support",
            "remote_support",
            "description",
            "duration",
            "test_type",
        ]
    )

def validate_and_fix_urls(candidates):
    """Normalise the ``url`` field of each candidate assessment in place.

    Rules, applied to the whitespace-stripped string form of the value:

    * missing, falsy, NaN or blank -> ``https://www.shl.com/missing-url``
    * digits only, or the site prefix glued directly to digits ->
      ``https://www.shl.com/<digits>``
    * already ``http://`` or ``https://`` -> kept
    * protocol-relative (``//host/path``) -> ``https:`` is prepended
    * site-relative path (``/path``) -> joined onto ``https://www.shl.com``
    * anything else is taken to be a host without a scheme and gets
      ``https://`` prepended

    Args:
        candidates: Iterable of dicts; each one is mutated.

    Returns:
        The same ``candidates`` object that was passed in.
    """
    base = 'https://www.shl.com'
    placeholder = f'{base}/missing-url'

    for candidate in candidates:
        raw = candidate.get('url')

        # A float NaN is truthy, so it needs its own check; NaN != NaN.
        is_nan = isinstance(raw, float) and raw != raw
        if not raw or is_nan:
            candidate['url'] = placeholder
            continue

        url = str(raw).strip()
        if not url:
            candidate['url'] = placeholder
            continue

        remainder = url[len(base):] if url.startswith(base) else None

        if url.isdigit():
            fixed = f"{base}/{url}"
        elif remainder is not None and remainder.isdigit():
            # Prefix and ID were concatenated without the separating slash.
            fixed = f"{base}/{remainder}"
        elif url.startswith(('http://', 'https://')):
            fixed = url
        elif url.startswith('//'):
            fixed = f"https:{url}"
        elif url.startswith('/'):
            # Prepending only a scheme would give "https:///path".
            fixed = f"{base}{url}"
        else:
            fixed = f"https://{url}"

        candidate['url'] = fixed

    return candidates

def recommend(query):
    """Return reranked assessment recommendations for a job description.

    Retrieves up to 20 candidate rows from ``df_clean`` for ``query``, makes
    sure every candidate URL is absolute, passes the candidates to
    ``rerank`` and post-processes the URLs of the reranked result.

    Args:
        query: Job description text. ``None``, empty and whitespace-only
            values are rejected with an error payload.

    Returns:
        Whatever ``rerank`` returns (presumably a dict; if it has a
        ``recommended_assessments`` key, those entries get their URLs
        normalised), or ``{"error": <message>}`` for blank input or when any
        step raises.
    """
    # A missing value is handled like blank input rather than letting
    # ``.strip()`` raise AttributeError outside the try block below.
    if not query or not query.strip():
        return {"error": "Please enter a job description"}

    try:
        # Print some debug info
        print(f"Processing query: {query[:50]}...")

        top_k_df = get_relevant_passages(query, df_clean, top_k=20)

        # Debug: Check URLs in retrieved data
        print(f"Retrieved {len(top_k_df)} assessments")
        if not top_k_df.empty:
            print(f"Sample URLs from retrieval: {top_k_df['url'].iloc[:3].tolist()}")

        candidates = top_k_df.to_dict(orient="records")

        # Make every URL absolute before it reaches the reranker.
        for candidate in candidates:
            if 'url' not in candidate:
                continue
            url = str(candidate['url'])
            if url.startswith(('http://', 'https://')):
                continue
            if url.startswith('www.'):
                # Already a host name: it only lacks the scheme. Treating it
                # as a site path would give "https://www.shl.com/www...".
                candidate['url'] = f"https://{url}"
            else:
                candidate['url'] = f"https://www.shl.com/{url.lstrip('/')}"

        result = rerank(query, candidates)

        # Post-process result to ensure URLs are properly formatted
        if 'recommended_assessments' in result:
            result['recommended_assessments'] = validate_and_fix_urls(result['recommended_assessments'])

        return result
    except Exception as e:
        # UI boundary: log the full traceback and return a short message
        # instead of letting the exception propagate to the caller.
        import traceback
        error_details = traceback.format_exc()
        print(f"Error: {str(e)}\n{error_details}")
        return {"error": f"Error processing request: {str(e)}"}

# Gradio UI: one multi-line textbox whose contents are passed to `recommend`;
# the returned value is shown through the "json" output component.
job_description_input = gr.Textbox(label="Enter Job Description", lines=4)

iface = gr.Interface(
    title="SHL Assessment Recommender",
    description="Paste a job description to get the most relevant SHL assessments.",
    fn=recommend,
    inputs=job_description_input,
    outputs="json",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()