# retriever.py — semantic assessment retriever (SentenceTransformer-based).
import ast
import json

import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Module-level sentence-embedding model, loaded once at import time and
# shared by get_relevant_passages for both corpus and query encoding.
model = SentenceTransformer("all-MiniLM-L6-v2")
def format_test_type(test_types):
    """Return *test_types* as a comma-separated string for embedding.

    Accepts a list of type names, a string containing a Python list
    literal (e.g. ``"['Knowledge', 'Skills']"``), or any other value,
    which is simply ``str()``-converted.

    Args:
        test_types: list, list-literal string, or arbitrary scalar.

    Returns:
        A ``', '``-joined string of the types, or ``str(test_types)``
        when the input cannot be interpreted as a list.
    """
    if isinstance(test_types, list):
        # str() each element so non-string types don't crash the join.
        return ', '.join(str(t) for t in test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        # ast.literal_eval replaces eval(): it parses literals only, so a
        # malformed or hostile string cannot execute arbitrary code.
        try:
            return ', '.join(str(t) for t in ast.literal_eval(test_types))
        except (ValueError, SyntaxError):
            pass  # not a valid list literal — fall through to str()
    return str(test_types)
def get_relevant_passages(query, df, top_k=20):
    """Find the most relevant and diverse assessments for a query.

    Runs semantic search over a combined text representation of each
    assessment, then swaps the tail of the ranking for assessments that
    introduce test types not already covered, to improve diversity.

    Args:
        query: Free-text job description / search query.
        df: DataFrame of assessments. Expected columns include
            'description', 'test_type', 'adaptive_support',
            'remote_support' and 'duration'; missing ones are tolerated.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the matching rows with an added 'score' column
        (semantic-search cosine score), at most ``top_k`` rows.
    """
    # Work on a copy so the caller's dataframe is never mutated.
    df_copy = df.copy()
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy

    # Debug output for tracing what data actually arrived.
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

    # Normalise test_type to a list per row. ast.literal_eval replaces
    # eval(): it parses literals only, so malformed or hostile strings
    # cannot execute arbitrary code.
    def _as_type_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value.startswith('['):
            try:
                return ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass  # keep the raw value, wrapped below
        return [value]

    if 'test_type' in df_copy.columns:
        df_copy['test_type'] = df_copy['test_type'].apply(_as_type_list)

    # Expand the query with detected skill/title keywords for better recall.
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")
    expanded_query = f"{query} {' '.join(keywords)}"

    # Build one text blob per assessment. Description and test types are
    # deliberately repeated to give them more weight in the embedding.
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            description = row['description'] if pd.notna(row['description']) else ""
            test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
            adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
            remote = row['remote_support'] if 'remote_support' in row else "Unknown"
            duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
            text = (f"{description} {description} "
                    f"Test types: {test_types} {test_types}. "
                    f"Adaptive support: {adaptive}. "
                    f"Remote support: {remote}. "
                    f"Duration: {duration}.")
            corpus.append(text)
        except Exception as e:
            # Best effort: a broken row must not abort the whole search.
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")
    print(f"Created corpus with {len(corpus)} items")

    # Embed corpus and query, then rank by cosine similarity.
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings,
                                top_k=min(top_k, len(corpus)))[0]

    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")
    result['score'] = [hit['score'] for hit in hits]  # kept for debugging

    def _types_of(value):
        # Mirror the original handling: lists as-is, bare strings wrapped,
        # anything else (e.g. NaN) contributes no types.
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [value]
        return []

    # Diversity pass: keep the top 70% by relevance, then fill the
    # remaining slots with assessments introducing an unseen test type.
    # Guarded on the column existing (the original indexed it
    # unconditionally and could raise KeyError).
    if len(result) > top_k / 2 and 'test_type' in result.columns:
        cutoff = int(top_k * 0.7)
        primary_results = result.head(cutoff)

        test_types_seen = set()
        for _, row in primary_results.iterrows():
            test_types_seen.update(_types_of(row['test_type']))

        remaining = result.iloc[cutoff:]
        slots = top_k - len(primary_results)
        diverse_picks = []
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= slots:
                break
            new_types = [t for t in _types_of(row['test_type'])
                         if t not in test_types_seen]
            if new_types:
                test_types_seen.update(new_types)
                diverse_picks.append(row)

        if diverse_picks:
            diverse_df = pd.DataFrame(diverse_picks)
            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)

    return result
def extract_job_keywords(query):
    """Pull skill-category and job-title keywords out of a job description.

    Performs case-insensitive substring matching of the query against two
    fixed vocabularies (skill categories first, then job titles) and
    returns every hit in vocabulary order. A term present in both lists
    (e.g. "sales") can therefore appear twice in the result.
    """
    # Skill-category vocabulary commonly seen in job descriptions.
    skill_categories = [
        "competencies", "ability", "aptitude", "personality", "behavior",
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive", "professional",
        "entry-level", "senior", "mid-level", "assessment", "test"
    ]
    # Common job-title vocabulary.
    job_titles = [
        "manager", "director", "analyst", "developer", "engineer", "administrator",
        "assistant", "coordinator", "specialist", "supervisor", "consultant",
        "executive", "officer", "associate", "representative", "technician",
        "accountant", "designer", "sales", "support", "professional"
    ]

    haystack = query.lower()
    matched_skills = [term for term in skill_categories if term in haystack]
    matched_titles = [term for term in job_titles if term in haystack]
    return matched_skills + matched_titles