import ast
import json

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for semantic search over assessment text.
# Loaded once at import time so repeated queries reuse the same weights.
model = SentenceTransformer("all-MiniLM-L6-v2")


def format_test_type(test_types):
    """Format a test-type value into a comma-separated string for embedding.

    Accepts a list of type names, a string representation of a list
    (e.g. "['Cognitive', 'Personality']"), or any other scalar; always
    returns a plain string.
    """
    if isinstance(test_types, list):
        return ', '.join(test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        try:
            # literal_eval safely parses a stringified Python list without
            # the arbitrary-code-execution risk of eval().
            return ', '.join(ast.literal_eval(test_types))
        except (ValueError, SyntaxError):
            # Malformed list literal — fall through to str() below.
            pass
    return str(test_types)


def _normalize_test_type(value):
    """Coerce a test_type cell into a list of type names.

    Stringified lists are parsed; bare scalars are wrapped in a
    single-element list; existing lists pass through unchanged.
    """
    if isinstance(value, str) and value.startswith('['):
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return [value]
    if isinstance(value, list):
        return value
    return [value]


def _row_to_corpus_text(row):
    """Build the embedding text for one assessment row.

    Description and test types are repeated to give them more weight in
    the embedding; missing fields degrade to "Unknown" placeholders.
    """
    description = row['description'] if pd.notna(row['description']) else ""
    test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
    adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
    remote = row['remote_support'] if 'remote_support' in row else "Unknown"
    duration = (f"{row['duration']} minutes"
                if pd.notna(row.get('duration')) else "Unknown duration")
    return (f"{description} {description} "  # Repeat description for more weight
            f"Test types: {test_types} {test_types}. "  # Repeat test types for more weight
            f"Adaptive support: {adaptive}. "
            f"Remote support: {remote}. "
            f"Duration: {duration}.")


def get_relevant_passages(query, df, top_k=20):
    """Find most relevant and diverse assessments using semantic search.

    Embeds each row of *df* (description, test types, support flags,
    duration) and ranks rows against *query* expanded with extracted job
    keywords. The top ~70% of hits are kept by pure relevance; remaining
    slots are filled with rows introducing test types not yet seen, to
    diversify the result set.

    Args:
        query: Free-text job description / search query.
        df: Assessment catalogue; expected columns include 'description'
            and optionally 'test_type', 'adaptive_support',
            'remote_support', 'duration'.
        top_k: Maximum number of rows to return.

    Returns:
        A copy of the matching rows with an added 'score' column
        (semantic-search similarity). Returns *df* unchanged if empty.
    """
    # Work on a copy to avoid mutating the caller's dataframe.
    df_copy = df.copy()
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy

    # Display dataframe info for debugging
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

    # Normalize test_type cells to lists so downstream diversity logic
    # can iterate them uniformly.
    if 'test_type' in df_copy.columns:
        df_copy['test_type'] = df_copy['test_type'].apply(_normalize_test_type)

    # Extract keywords from query for better matching; this helps target
    # specific skills mentioned in the job description.
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")

    # Expand query with keywords for better semantic search.
    expanded_query = f"{query} {' '.join(keywords)}"

    # Concatenate all fields into a single string per row for embedding.
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            corpus.append(_row_to_corpus_text(row))
        except Exception as e:
            # Best-effort: a malformed row gets a placeholder so corpus
            # indices stay aligned with df_copy row positions.
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")

    print(f"Created corpus with {len(corpus)} items")

    # Generate embeddings and rank rows by similarity to the query.
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings,
                                top_k=min(top_k, len(corpus)))[0]

    # Get top matches (hit['corpus_id'] is the positional row index).
    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")

    # Add score for debugging
    result['score'] = [hit['score'] for hit in hits]

    # Apply diversity enhancement - get a mix of test types if possible.
    if len(result) > top_k / 2:
        # Keep top 70% based on relevance.
        primary_results = result.head(int(top_k * 0.7))

        # Collect the test types already covered by the primary picks.
        test_types_seen = set()
        for _, row in primary_results.iterrows():
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    test_types_seen.add(t)
            elif isinstance(row['test_type'], str):
                test_types_seen.add(row['test_type'])

        # Fill remaining slots with assessments introducing new test types.
        remaining = result.iloc[int(top_k * 0.7):]
        diverse_picks = []
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= (top_k - len(primary_results)):
                break
            new_type_found = False
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    if t not in test_types_seen:
                        new_type_found = True
                        test_types_seen.add(t)
            elif (isinstance(row['test_type'], str)
                  and row['test_type'] not in test_types_seen):
                new_type_found = True
                test_types_seen.add(row['test_type'])
            if new_type_found:
                diverse_picks.append(row)

        # Combine primary results with diverse picks. NOTE(review): when no
        # diverse picks are found, the full relevance-ordered result is
        # returned unchanged — preserved from the original behavior.
        if diverse_picks:
            diverse_df = pd.DataFrame(diverse_picks)
            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)

    return result


def extract_job_keywords(query):
    """Extract relevant keywords from job description for better matching.

    Scans *query* (case-insensitively) for known skill-category terms and
    common job titles; returns the matches in scan order. A term may be
    returned twice if it appears in both lists (e.g. "executive").
    """
    # Common job skill categories that might appear in descriptions.
    skill_categories = [
        "competencies", "ability", "aptitude", "personality", "behavior",
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive",
        "professional", "entry-level", "senior", "mid-level",
        "assessment", "test"
    ]

    # Look for these keywords in the query.
    found_keywords = []
    query_lower = query.lower()
    for keyword in skill_categories:
        if keyword in query_lower:
            found_keywords.append(keyword)

    # Add any job titles found.
    job_titles = [
        "manager", "director", "analyst", "developer", "engineer",
        "administrator", "assistant", "coordinator", "specialist",
        "supervisor", "consultant", "executive", "officer", "associate",
        "representative", "technician", "accountant", "designer",
        "sales", "support", "professional"
    ]
    for title in job_titles:
        if title in query_lower:
            found_keywords.append(title)

    return found_keywords