# NOTE(review): removed non-code page artifacts ("Spaces:" / "Sleeping")
# that were scraped into the top of this file; they are not Python.
import json

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Shared sentence-embedding model; loaded once at import time and reused by
# every call to get_relevant_passages below.
model = SentenceTransformer("all-MiniLM-L6-v2")
def format_test_type(test_types):
    """Return *test_types* as a comma-separated display string.

    Accepts a real list, a string that looks like a Python list literal
    (e.g. ``"['A', 'B']"``), or any other value, which is stringified
    unchanged.
    """
    if isinstance(test_types, list):
        return ', '.join(test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        # ast.literal_eval replaces the original eval(): it only parses
        # literals and cannot execute code hidden in the string.
        import ast
        try:
            # TypeError covers literals whose elements are not strings
            # (join would fail), matching the original bare-except fallback.
            return ', '.join(ast.literal_eval(test_types))
        except (ValueError, SyntaxError, TypeError):
            pass
    return str(test_types)
def get_relevant_passages(query, df, top_k=20):
    """Return up to *top_k* assessments from *df* most relevant to *query*.

    Ranks rows by semantic similarity between the (keyword-expanded) query
    and a text blob built from each row, then applies a diversity pass:
    the top ~70% by relevance are kept, and remaining slots are filled by
    lower-ranked rows that introduce a test type not yet seen.

    Parameters
    ----------
    query : str
        Free-text job description.
    df : pandas.DataFrame
        Assessment catalogue; expected columns include 'description',
        'test_type', 'adaptive_support', 'remote_support', 'duration'.
        TODO confirm schema against the caller — only 'test_type' and
        'duration' access is guarded here.
    top_k : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Matched rows with an added 'score' column (similarity score).
    """
    import ast

    # Work on a copy so the caller's frame is never mutated.
    df_copy = df.copy()
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy

    # Debug output: schema and one sample record.
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

    if 'test_type' in df_copy.columns:
        def _as_list(value):
            # Normalize test_type to a list. ast.literal_eval replaces the
            # original eval() call — it parses literals only and cannot run
            # code. A malformed literal falls back to a one-item list
            # instead of crashing the whole .apply().
            if isinstance(value, str) and value.startswith('['):
                try:
                    return ast.literal_eval(value)
                except (ValueError, SyntaxError):
                    return [value]
            return value if isinstance(value, list) else [value]

        df_copy['test_type'] = df_copy['test_type'].apply(_as_list)

    # Expand the query with detected skill/title keywords so the embedding
    # leans toward skills explicitly named in the job description.
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")
    expanded_query = f"{query} {' '.join(keywords)}"

    # Build one text blob per row; description and test types are repeated
    # deliberately to give them extra weight in the embedding.
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            description = row['description'] if pd.notna(row['description']) else ""
            test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
            adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
            remote = row['remote_support'] if 'remote_support' in row else "Unknown"
            duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
            text = (f"{description} {description} "
                    f"Test types: {test_types} {test_types}. "
                    f"Adaptive support: {adaptive}. "
                    f"Remote support: {remote}. "
                    f"Duration: {duration}.")
            corpus.append(text)
        except Exception as e:
            # Best-effort: a broken row gets a placeholder so corpus indices
            # stay aligned with df_copy row positions.
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")
    print(f"Created corpus with {len(corpus)} items")

    # Embed corpus and query, then rank by cosine similarity.
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings,
                                top_k=min(top_k, len(corpus)))[0]

    # corpus_id is a positional index into corpus, hence .iloc.
    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")
    result['score'] = [hit['score'] for hit in hits]

    # Diversity pass (only worthwhile when we got more than half of top_k).
    if len(result) > top_k / 2:
        primary_count = int(top_k * 0.7)  # keep top 70% purely by relevance
        primary_results = result.head(primary_count)

        # Collect every test type already represented in the primary set.
        test_types_seen = set()
        for _, row in primary_results.iterrows():
            if isinstance(row['test_type'], list):
                test_types_seen.update(row['test_type'])
            elif isinstance(row['test_type'], str):
                test_types_seen.add(row['test_type'])

        # Fill the remaining slots with rows that bring in an unseen type.
        remaining = result.iloc[primary_count:]
        diverse_picks = []
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= (top_k - len(primary_results)):
                break
            new_type_found = False
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    if t not in test_types_seen:
                        new_type_found = True
                        test_types_seen.add(t)
            elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
                new_type_found = True
                test_types_seen.add(row['test_type'])
            if new_type_found:
                diverse_picks.append(row)

        if diverse_picks:
            result = pd.concat([primary_results,
                                pd.DataFrame(diverse_picks)]).reset_index(drop=True)

    return result
def extract_job_keywords(query):
    """Extract relevant keywords from job description for better matching."""
    # Skill/assessment vocabulary commonly found in job descriptions.
    skill_categories = [
        "competencies", "ability", "aptitude", "personality", "behavior",
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive", "professional",
        "entry-level", "senior", "mid-level", "assessment", "test"
    ]
    # Common job titles, matched after the skill vocabulary.
    job_titles = [
        "manager", "director", "analyst", "developer", "engineer", "administrator",
        "assistant", "coordinator", "specialist", "supervisor", "consultant",
        "executive", "officer", "associate", "representative", "technician",
        "accountant", "designer", "sales", "support", "professional"
    ]
    # Case-insensitive substring matching; list order (skills first, then
    # titles) is preserved in the returned keywords.
    text = query.lower()
    matched_skills = [kw for kw in skill_categories if kw in text]
    matched_titles = [title for title in job_titles if title in text]
    return matched_skills + matched_titles