Spaces:
Sleeping
Sleeping
File size: 6,642 Bytes
2133db4 6e0aaf8 24f19c6 2133db4 6e0aaf8 24f19c6 6e0aaf8 e5d19f1 6e0aaf8 397f5c9 24f19c6 e5d19f1 24f19c6 e5d19f1 24f19c6 04fa7f5 24f19c6 6e0aaf8 e5d19f1 397f5c9 24f19c6 e5d19f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import ast
import json

import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model, loaded once at import time.
# all-MiniLM-L6-v2 is a small general-purpose model from sentence-transformers.
model = SentenceTransformer("all-MiniLM-L6-v2")
def format_test_type(test_types):
    """Return *test_types* as a single comma-separated string.

    Accepts a list of type names, a string that looks like a Python list
    literal (e.g. ``"['A', 'B']"``), or any other value, which is simply
    stringified.

    Args:
        test_types: List, list-literal string, or arbitrary value.

    Returns:
        A comma-separated string of test types.
    """
    if isinstance(test_types, list):
        return ', '.join(test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        # literal_eval instead of eval(): this value may come from an
        # external data file and must never be executed as code.
        try:
            parsed = ast.literal_eval(test_types)
            if isinstance(parsed, list):
                return ', '.join(str(t) for t in parsed)
        except (ValueError, SyntaxError):
            pass  # malformed literal — fall through to plain str()
    return str(test_types)
def get_relevant_passages(query, df, top_k=20):
    """Find the most relevant and diverse assessments for a query.

    Runs a semantic search (sentence-transformers) over a textual
    rendering of every assessment row, then reserves the last ~30% of
    the returned slots for assessments that introduce test types not
    already covered by the top hits.

    Args:
        query: Job-description text to match against.
        df: Assessment DataFrame; expects a 'description' column and
            optionally 'test_type', 'adaptive_support', 'remote_support'
            and 'duration'.
        top_k: Maximum number of assessments to return.

    Returns:
        A DataFrame of matching rows with an added 'score' column
        (semantic-search similarity).
    """
    # Work on a copy so the caller's dataframe is never mutated.
    df_copy = df.copy()
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy
    # Debug output: column layout and one sample record.
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

    def _to_type_list(value):
        """Coerce a test_type cell into a list of type names."""
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value.startswith('['):
            # literal_eval instead of eval(): the cell may originate from
            # an external file and must never be executed as code. A
            # malformed literal falls through to single-element wrapping
            # instead of raising.
            try:
                parsed = ast.literal_eval(value)
                if isinstance(parsed, list):
                    return parsed
            except (ValueError, SyntaxError):
                pass
        return [value]

    # Normalize test_type so downstream code can assume a list.
    if 'test_type' in df_copy.columns:
        df_copy['test_type'] = df_copy['test_type'].apply(_to_type_list)
    # Expand the query with extracted keywords so specific skills named
    # in the job description get extra weight in the semantic search.
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")
    expanded_query = f"{query} {' '.join(keywords)}"
    # Build one text document per row for embedding.
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            description = row['description'] if pd.notna(row['description']) else ""
            test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
            adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
            remote = row['remote_support'] if 'remote_support' in row else "Unknown"
            duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
            # Description and test types are repeated to give them more
            # weight in the embedding.
            text = (f"{description} {description} "
                    f"Test types: {test_types} {test_types}. "
                    f"Adaptive support: {adaptive}. "
                    f"Remote support: {remote}. "
                    f"Duration: {duration}.")
            corpus.append(text)
        except Exception as e:
            # Best-effort: a malformed row gets a placeholder so the
            # corpus stays index-aligned with df_copy.
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")
    print(f"Created corpus with {len(corpus)} items")
    # Embed corpus and query, then rank rows by similarity.
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")
    # Keep the similarity score for debugging/inspection.
    result['score'] = [hit['score'] for hit in hits]
    # Diversity pass: keep the top 70% by relevance, then fill remaining
    # slots with lower-ranked assessments that introduce unseen types.
    if len(result) > top_k / 2:
        primary_results = result.head(int(top_k * 0.7))
        # Collect every test type already represented in the top results.
        test_types_seen = set()
        for _, row in primary_results.iterrows():
            if isinstance(row['test_type'], list):
                test_types_seen.update(row['test_type'])
            elif isinstance(row['test_type'], str):
                test_types_seen.add(row['test_type'])
        remaining = result.iloc[int(top_k * 0.7):]
        diverse_picks = []
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= (top_k - len(primary_results)):
                break
            new_type_found = False
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    if t not in test_types_seen:
                        new_type_found = True
                        test_types_seen.add(t)
            elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
                new_type_found = True
                test_types_seen.add(row['test_type'])
            if new_type_found:
                diverse_picks.append(row)
        # Append the diversity picks after the relevance-ranked head.
        if diverse_picks:
            diverse_df = pd.DataFrame(diverse_picks)
            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)
    return result
def extract_job_keywords(query):
"""Extract relevant keywords from job description for better matching."""
# Common job skill categories that might appear in descriptions
skill_categories = [
"competencies", "ability", "aptitude", "personality", "behavior",
"leadership", "management", "technical", "analytical", "problem-solving",
"communication", "teamwork", "situational", "judgment", "cognitive",
"verbal", "numerical", "programming", "coding", "development",
"sales", "customer service", "administrative", "executive", "professional",
"entry-level", "senior", "mid-level", "assessment", "test"
]
# Look for these keywords in the query
found_keywords = []
query_lower = query.lower()
for keyword in skill_categories:
if keyword in query_lower:
found_keywords.append(keyword)
# Add any job titles found
job_titles = [
"manager", "director", "analyst", "developer", "engineer", "administrator",
"assistant", "coordinator", "specialist", "supervisor", "consultant",
"executive", "officer", "associate", "representative", "technician",
"accountant", "designer", "sales", "support", "professional"
]
for title in job_titles:
if title in query_lower:
found_keywords.append(title)
return found_keywords |