import pandas as pd
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")
def get_relevant_passages(query, df, top_k=20):
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()

    # Print shape for debugging
    print(f"DataFrame shape: {df_copy.shape}")
    print(f"DataFrame columns: {df_copy.columns.tolist()}")

    # Handle missing columns gracefully
    for col in ['description', 'test_type', 'adaptive_support', 'remote_support', 'duration']:
        if col not in df_copy.columns:
            df_copy[col] = 'N/A'

    # Ensure URL field is properly formatted
    if 'url' not in df_copy.columns:
        df_copy['url'] = 'https://www.shl.com/missing-url'
    else:
        # Clean up URLs if needed
        df_copy['url'] = df_copy['url'].astype(str)
        # Ensure URLs start with http or https
        mask = ~df_copy['url'].str.startswith(('http://', 'https://'))
        df_copy.loc[mask, 'url'] = 'https://www.shl.com/' + df_copy.loc[mask, 'url'].str.lstrip('/')

    # Format test_type for better representation
    def format_test_type(test_types):
        if isinstance(test_types, list):
            return ', '.join([str(t) for t in test_types if t])
        return str(test_types)

    # Concatenate all fields into a single string per row
    corpus = df_copy.apply(
        lambda row: f"{row.get('assessment_name', '')} {row.get('description', '')} "
                    f"Test types: {format_test_type(row['test_type'])}. "
                    f"Adaptive support: {row['adaptive_support']}. "
                    f"Remote support: {row['remote_support']}. "
                    f"Duration: {row['duration']} minutes.",
        axis=1
    ).tolist()

    # Embed the corpus and the query, then return the top_k most similar rows
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    return df_copy.iloc[[hit['corpus_id'] for hit in hits]]
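

# Example usage: a minimal sketch, not part of the original app. The DataFrame
# below is hypothetical sample data, assumed only to carry the columns the
# function reads ('assessment_name', 'description', 'test_type',
# 'adaptive_support', 'remote_support', 'duration', 'url'); the query string
# is likewise illustrative.
if __name__ == "__main__":
    sample_df = pd.DataFrame([
        {
            "assessment_name": "Java Developer Test",
            "description": "Assesses core Java programming skills.",
            "test_type": ["Knowledge & Skills"],
            "adaptive_support": "No",
            "remote_support": "Yes",
            "duration": 40,
            "url": "products/java-developer-test",  # relative URL; will be prefixed
        },
        {
            "assessment_name": "Numerical Reasoning",
            "description": "Measures numerical problem-solving ability.",
            "test_type": ["Ability & Aptitude"],
            "adaptive_support": "Yes",
            "remote_support": "Yes",
            "duration": 25,
            "url": "https://www.shl.com/products/numerical-reasoning",
        },
    ])
    results = get_relevant_passages("entry-level Java engineer", sample_df, top_k=2)
    print(results[["assessment_name", "url"]])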