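"""Semantic retrieval of assessments with a lightweight diversity pass.

Embeds each catalog row with a SentenceTransformer model, ranks rows against
a keyword-expanded job-description query, then swaps in lower-ranked rows
that cover unseen test types. Expected columns (inferred from the code
below): description, test_type, adaptive_support, remote_support, duration.
"""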
import ast

import pandas as pd
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

def format_test_type(test_types):
    """Normalize a test_type value (list, stringified list, or scalar) to a comma-separated string."""
    if isinstance(test_types, list):
        return ', '.join(test_types)
    if isinstance(test_types, str) and test_types.startswith('['):
        try:
            # Parse string representations like "['Cognitive', 'Personality']"
            # safely, instead of eval()'ing arbitrary input.
            return ', '.join(ast.literal_eval(test_types))
        except (ValueError, SyntaxError):
            pass
    return str(test_types)
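
# A few illustrative inputs and what format_test_type returns for them
# (example values only, not drawn from any real catalog):
#   format_test_type(['Cognitive', 'Personality'])    -> 'Cognitive, Personality'
#   format_test_type("['Cognitive', 'Personality']")  -> 'Cognitive, Personality'
#   format_test_type('Cognitive')                     -> 'Cognitive'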

def get_relevant_passages(query, df, top_k=20):
    """Find most relevant and diverse assessments using semantic search with diversity enhancement."""
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()
    
    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy
    
    # Display dataframe info for debugging
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")
    
    # Ensure test_type is a list of strings
    if 'test_type' in df_copy.columns:
        # Parse stringified lists (e.g. "['Cognitive']") safely; wrap scalars
        # in a single-element list
        df_copy['test_type'] = df_copy['test_type'].apply(
            lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[')
            else (x if isinstance(x, list) else [x])
        )
    
    # Extract keywords from query for better matching
    # This helps target specific skills mentioned in the job description
    keywords = extract_job_keywords(query)
    print(f"Extracted keywords from query: {keywords}")
    
    # Expand query with keywords for better semantic search
    expanded_query = f"{query} {' '.join(keywords)}"
    
    # Concatenate all fields into a single string per row for embedding
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            description = row['description'] if pd.notna(row['description']) else ""
            test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
            adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
            remote = row['remote_support'] if 'remote_support' in row else "Unknown"
            duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
            
            # Enhanced representation with more weight on description and test types
            text = (f"{description} {description} "  # Repeat description for more weight
                   f"Test types: {test_types} {test_types}. "  # Repeat test types for more weight
                   f"Adaptive support: {adaptive}. "
                   f"Remote support: {remote}. "
                   f"Duration: {duration}.")
            corpus.append(text)
        except Exception as e:
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")
    
    print(f"Created corpus with {len(corpus)} items")
    
    # Generate embeddings
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
    
    # Find most similar
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
    
    # Get top matches
    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")
    
    # Add score for debugging
    result['score'] = [hit['score'] for hit in hits]
    
    # Apply diversity enhancement - get a mix of test types if possible
    if len(result) > top_k / 2:
        primary_results = result.head(int(top_k * 0.7))  # Keep top 70% based on relevance
        
        # Try to find diverse test types for remaining slots
        test_types_seen = set()
        for _, row in primary_results.iterrows():
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    test_types_seen.add(t)
            elif isinstance(row['test_type'], str):
                test_types_seen.add(row['test_type'])
        
        # Find assessments with different test types
        remaining = result.iloc[int(top_k * 0.7):]
        diverse_picks = []
        
        for _, row in remaining.iterrows():
            if len(diverse_picks) >= (top_k - len(primary_results)):
                break
                
            new_type_found = False
            if isinstance(row['test_type'], list):
                for t in row['test_type']:
                    if t not in test_types_seen:
                        new_type_found = True
                        test_types_seen.add(t)
            elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
                new_type_found = True
                test_types_seen.add(row['test_type'])
                
            if new_type_found:
                diverse_picks.append(row)
        
        # Combine primary results with diverse picks
        if diverse_picks:
            diverse_df = pd.DataFrame(diverse_picks)
            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)
    
    return result

def extract_job_keywords(query):
    """Extract relevant keywords from job description for better matching."""
    # Common job skill categories that might appear in descriptions
    skill_categories = [
        "competencies", "ability", "aptitude", "personality", "behavior", 
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive", "professional",
        "entry-level", "senior", "mid-level", "assessment", "test"
    ]
    
    # Look for these keywords in the query
    found_keywords = []
    query_lower = query.lower()
    
    for keyword in skill_categories:
        if keyword in query_lower:
            found_keywords.append(keyword)
    
    # Add any job titles found
    job_titles = [
        "manager", "director", "analyst", "developer", "engineer", "administrator",
        "assistant", "coordinator", "specialist", "supervisor", "consultant",
        "executive", "officer", "associate", "representative", "technician",
        "accountant", "designer", "sales", "support", "professional"
    ]
    
    for title in job_titles:
        if title in query_lower:
            found_keywords.append(title)
    
    return found_keywords
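

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The sample rows and
# query below are illustrative assumptions; only the column names mirror what
# get_relevant_passages expects.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = pd.DataFrame([
        {"description": "Timed numerical reasoning test for analysts.",
         "test_type": ["Numerical", "Cognitive"], "adaptive_support": "Yes",
         "remote_support": "Yes", "duration": 25},
        {"description": "Workplace personality questionnaire.",
         "test_type": ["Personality"], "adaptive_support": "No",
         "remote_support": "Yes", "duration": 30},
        {"description": "Coding simulation for software developers.",
         "test_type": ["Technical", "Coding"], "adaptive_support": "No",
         "remote_support": "Yes", "duration": 60},
    ])
    query = "Hiring a senior developer with strong analytical and coding skills"
    matches = get_relevant_passages(query, sample, top_k=3)
    print(matches[["description", "score"]])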