# retriever.py — semantic assessment retriever (SentenceTransformer-based).
import ast
import json

import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Module-level sentence-embedding model, loaded once at import time and
# shared by get_relevant_passages for both corpus and query encoding.
model = SentenceTransformer("all-MiniLM-L6-v2")
def format_test_type(test_types):
    """Return *test_types* as a comma-separated string for embedding.

    Accepts a list of type names, a string containing a Python list
    literal (e.g. ``"['Knowledge', 'Skills']"``), or any other value,
    which is simply ``str()``-converted.

    Args:
        test_types: list, list-literal string, or arbitrary scalar.

    Returns:
        A ``', '``-joined string of the types, or ``str(test_types)``
        when the input cannot be interpreted as a list.
    """
    if isinstance(test_types, list):
        # str() each element so non-string types don't crash the join.
        return ', '.join(str(t) for t in test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        # ast.literal_eval replaces eval(): it parses literals only, so a
        # malformed or hostile string cannot execute arbitrary code.
        try:
            return ', '.join(str(t) for t in ast.literal_eval(test_types))
        except (ValueError, SyntaxError):
            pass  # not a valid list literal — fall through to str()
    return str(test_types)
def get_relevant_passages(query, df, top_k=20):
    """Find the most relevant and diverse assessments for a query.

    Runs semantic search over a combined text representation of each
    assessment, then swaps the tail of the ranking for assessments that
    introduce test types not already covered, to improve diversity.

    Args:
        query: Free-text job description / search query.
        df: DataFrame of assessments. Expected columns include
            'description', 'test_type', 'adaptive_support',
            'remote_support' and 'duration'; missing ones are tolerated.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the matching rows with an added 'score' column
        (semantic-search cosine score), at most ``top_k`` rows.
    """
    # Work on a copy so the caller's dataframe is never mutated.
    df_copy = df.copy()
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy

    # Debug output for tracing what data actually arrived.
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

    # Normalise test_type to a list per row. ast.literal_eval replaces
    # eval(): it parses literals only, so malformed or hostile strings
    # cannot execute arbitrary code.
    def _as_type_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value.startswith('['):
            try:
                return ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass  # keep the raw value, wrapped below
        return [value]

    if 'test_type' in df_copy.columns:
        df_copy['test_type'] = df_copy['test_type'].apply(_as_type_list)

    # Expand the query with detected skill/title keywords for better recall.
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")
    expanded_query = f"{query} {' '.join(keywords)}"

    # Build one text blob per assessment. Description and test types are
    # deliberately repeated to give them more weight in the embedding.
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            description = row['description'] if pd.notna(row['description']) else ""
            test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
            adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
            remote = row['remote_support'] if 'remote_support' in row else "Unknown"
            duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
            text = (f"{description} {description} "
                    f"Test types: {test_types} {test_types}. "
                    f"Adaptive support: {adaptive}. "
                    f"Remote support: {remote}. "
                    f"Duration: {duration}.")
            corpus.append(text)
        except Exception as e:
            # Best effort: a broken row must not abort the whole search.
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")
    print(f"Created corpus with {len(corpus)} items")

    # Embed corpus and query, then rank by cosine similarity.
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings,
                                top_k=min(top_k, len(corpus)))[0]

    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")
    result['score'] = [hit['score'] for hit in hits]  # kept for debugging

    def _types_of(value):
        # Mirror the original handling: lists as-is, bare strings wrapped,
        # anything else (e.g. NaN) contributes no types.
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [value]
        return []

    # Diversity pass: keep the top 70% by relevance, then fill the
    # remaining slots with assessments introducing an unseen test type.
    # Guarded on the column existing (the original indexed it
    # unconditionally and could raise KeyError).
    if len(result) > top_k / 2 and 'test_type' in result.columns:
        cutoff = int(top_k * 0.7)
        primary_results = result.head(cutoff)

        test_types_seen = set()
        for _, row in primary_results.iterrows():
            test_types_seen.update(_types_of(row['test_type']))

        remaining = result.iloc[cutoff:]
        slots = top_k - len(primary_results)
        diverse_picks = []
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= slots:
                break
            new_types = [t for t in _types_of(row['test_type'])
                         if t not in test_types_seen]
            if new_types:
                test_types_seen.update(new_types)
                diverse_picks.append(row)

        if diverse_picks:
            diverse_df = pd.DataFrame(diverse_picks)
            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)

    return result
def extract_job_keywords(query):
    """Pull skill-category and job-title keywords out of a job description.

    Performs case-insensitive substring matching of the query against two
    fixed vocabularies (skill categories first, then job titles) and
    returns every hit in vocabulary order. A term present in both lists
    (e.g. "sales") can therefore appear twice in the result.
    """
    # Skill-category vocabulary commonly seen in job descriptions.
    skill_categories = [
        "competencies", "ability", "aptitude", "personality", "behavior",
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive", "professional",
        "entry-level", "senior", "mid-level", "assessment", "test"
    ]
    # Common job-title vocabulary.
    job_titles = [
        "manager", "director", "analyst", "developer", "engineer", "administrator",
        "assistant", "coordinator", "specialist", "supervisor", "consultant",
        "executive", "officer", "associate", "representative", "technician",
        "accountant", "designer", "sales", "support", "professional"
    ]

    haystack = query.lower()
    matched_skills = [term for term in skill_categories if term in haystack]
    matched_titles = [term for term in job_titles if term in haystack]
    return matched_skills + matched_titles