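"""Semantic retrieval of assessments with a lightweight diversity pass.

Embeds each catalog row with a SentenceTransformer model, ranks rows against
a keyword-expanded job-description query, then swaps in lower-ranked rows
that cover unseen test types. Expected columns (inferred from the code
below): description, test_type, adaptive_support, remote_support, duration.
"""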
import ast

import pandas as pd
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

def format_test_type(test_types):
    """Normalize a test_type value (list, stringified list, or scalar) to a comma-separated string."""
    if isinstance(test_types, list):
        return ', '.join(test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        try:
            # Parse string representations like "['Cognitive', 'Personality']"
            # safely, instead of eval()'ing arbitrary input.
            return ', '.join(ast.literal_eval(test_types))
        except (ValueError, SyntaxError):
            pass
    return str(test_types)
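
# A few illustrative inputs and what format_test_type returns for them
# (example values only, not drawn from any real catalog):
#   format_test_type(['Cognitive', 'Personality'])    -> 'Cognitive, Personality'
#   format_test_type("['Cognitive', 'Personality']")  -> 'Cognitive, Personality'
#   format_test_type('Cognitive')                     -> 'Cognitive'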

def get_relevant_passages(query, df, top_k=20):
    """Find most relevant and diverse assessments using semantic search with diversity enhancement."""
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()
    
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy
    
    # Display dataframe info for debugging
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")
    
    # Ensure test_type is a list of strings
    if 'test_type' in df_copy.columns:
        # Parse stringified lists (e.g. "['Cognitive']") safely; wrap scalars
        # in a single-element list
        df_copy['test_type'] = df_copy['test_type'].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[')
            else (x if isinstance(x, list) else [x])
        )
    
    # Extract keywords from query for better matching
    # This helps target specific skills mentioned in the job description
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")
    
    # Expand query with keywords for better semantic search
    expanded_query = f"{query} {' '.join(keywords)}"
    
    # Concatenate all fields into a single string per row for embedding
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            description = row['description'] if pd.notna(row['description']) else ""
            test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
            adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
            remote = row['remote_support'] if 'remote_support' in row else "Unknown"
            duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
            
            # Enhanced representation with more weight on description and test types
            text = (f"{description} {description} "  # Repeat description for more weight
                   f"Test types: {test_types} {test_types}. "  # Repeat test types for more weight
                   f"Adaptive support: {adaptive}. "
                   f"Remote support: {remote}. "
                   f"Duration: {duration}.")
            corpus.append(text)
        except Exception as e:
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")
    
    print(f"Created corpus with {len(corpus)} items")
    
    # Generate embeddings
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    
    # Find most similar
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
    
    # Get top matches
    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")
    
    # Add score for debugging
    result['score'] = [hit['score'] for hit in hits]
    
    # Apply diversity enhancement - get a mix of test types if possible
    if len(result) > top_k / 2:
        primary_results = result.head(int(top_k * 0.7))  # Keep top 70% based on relevance
        
        # Try to find diverse test types for remaining slots
        test_types_seen = set()
        for _, row in primary_results.iterrows():
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    test_types_seen.add(t)
            elif isinstance(row['test_type'], str):
                test_types_seen.add(row['test_type'])
        
        # Find assessments with different test types
        remaining = result.iloc[int(top_k * 0.7):]
        diverse_picks = []
        
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= (top_k - len(primary_results)):
                break
                
            new_type_found = False
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    if t not in test_types_seen:
                        new_type_found = True
                        test_types_seen.add(t)
            elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
                new_type_found = True
                test_types_seen.add(row['test_type'])
                
            if new_type_found:
                diverse_picks.append(row)
        
        # Combine primary results with diverse picks
        if diverse_picks:
            diverse_df = pd.DataFrame(diverse_picks)
            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)
    
    return result

def extract_job_keywords(query):
    """Extract relevant keywords from job description for better matching."""
    # Common job skill categories that might appear in descriptions
    skill_categories = [
        "competencies", "ability", "aptitude", "personality", "behavior", 
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive", "professional",
        "entry-level", "senior", "mid-level", "assessment", "test"
    ]
    
    # Look for these keywords in the query
    found_keywords = []
    query_lower = query.lower()
    
    for keyword in skill_categories:
        if keyword in query_lower:
            found_keywords.append(keyword)
    
    # Add any job titles found
    job_titles = [
        "manager", "director", "analyst", "developer", "engineer", "administrator",
        "assistant", "coordinator", "specialist", "supervisor", "consultant",
        "executive", "officer", "associate", "representative", "technician",
        "accountant", "designer", "sales", "support", "professional"
    ]
    
    for title in job_titles:
        if title in query_lower:
            found_keywords.append(title)
    
    return found_keywords
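

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The sample rows and
# query below are illustrative assumptions; only the column names mirror what
# get_relevant_passages expects.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = pd.DataFrame([
        {"description": "Timed numerical reasoning test for analysts.",
         "test_type": ["Numerical", "Cognitive"], "adaptive_support": "Yes",
         "remote_support": "Yes", "duration": 25},
        {"description": "Workplace personality questionnaire.",
         "test_type": ["Personality"], "adaptive_support": "No",
         "remote_support": "Yes", "duration": 30},
        {"description": "Coding simulation for software developers.",
         "test_type": ["Technical", "Coding"], "adaptive_support": "No",
         "remote_support": "Yes", "duration": 60},
    ])
    query = "Hiring a senior developer with strong analytical and coding skills"
    matches = get_relevant_passages(query, sample, top_k=3)
    print(matches[["description", "score"]])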