Spaces:
Sleeping
Sleeping
import pandas as pd | |
from sentence_transformers import SentenceTransformer, util | |
# Sentence-embedding model, instantiated once at import time and reused by
# get_relevant_passages for both the corpus and the query encodings.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Prefix applied to relative URLs, and the placeholder used when a URL is absent.
_BASE_URL = 'https://www.shl.com/'
_MISSING_URL = 'https://www.shl.com/missing-url'

# Columns the corpus text is built from; any that are absent are filled with 'N/A'.
_OPTIONAL_COLUMNS = (
    'description',
    'test_type',
    'adaptive_support',
    'remote_support',
    'duration',
)


def _format_test_type(test_types) -> str:
    """Render a test-type value as text, joining truthy list entries with ', '."""
    if isinstance(test_types, list):
        return ', '.join(str(t) for t in test_types if t)
    return str(test_types)


def _row_to_text(row) -> str:
    """Concatenate one row's fields into the single string that gets embedded."""
    return (
        f"{row.get('assessment_name', '')} {row.get('description', '')} "
        f"Test types: {_format_test_type(row['test_type'])}. "
        f"Adaptive support: {row['adaptive_support']}. "
        f"Remote support: {row['remote_support']}. "
        f"Duration: {row['duration']} minutes."
    )


def _normalize_urls(url_column: pd.Series) -> pd.Series:
    """Return *url_column* as absolute URL strings.

    Missing values (NaN/None) become the placeholder URL, surrounding
    whitespace is stripped, and anything not starting with http:// or
    https:// is treated as a path relative to the SHL site.
    """
    # Record missing entries first: astype(str) would turn them into the
    # literal text 'nan'/'None', which would then be prefixed like a real path.
    is_missing = url_column.isna()
    urls = url_column.astype(str).str.strip()
    needs_prefix = ~urls.str.startswith(('http://', 'https://'))
    urls.loc[needs_prefix] = _BASE_URL + urls.loc[needs_prefix].str.lstrip('/')
    urls.loc[is_missing] = _MISSING_URL
    return urls


def get_relevant_passages(query: str, df: pd.DataFrame, top_k: int = 20) -> pd.DataFrame:
    """Return the rows of *df* that best match *query* by embedding similarity.

    Works on a copy of *df*, so the caller's frame is never modified. The copy
    is normalized first: missing optional columns are added with 'N/A', and the
    'url' column is created or cleaned so every value is an absolute URL. Each
    row is then flattened to one text string, embedded with the module-level
    ``model`` and ranked against the query with ``util.semantic_search``.

    Args:
        query: Free-text search query.
        df: Catalogue of assessments, one per row.
        top_k: Maximum number of rows to return.

    Returns:
        Rows of the normalized copy selected by the search hits, in hit order.
        A frame with no rows yields an empty frame that still carries the
        normalized columns.
    """
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()

    # Print shape for debugging
    print(f"DataFrame shape: {df_copy.shape}")
    print(f"DataFrame columns: {df_copy.columns.tolist()}")

    # Handle missing columns gracefully
    for col in _OPTIONAL_COLUMNS:
        if col not in df_copy.columns:
            df_copy[col] = 'N/A'

    # Ensure URL field is present and properly formatted
    if 'url' not in df_copy.columns:
        df_copy['url'] = _MISSING_URL
    else:
        df_copy['url'] = _normalize_urls(df_copy['url'])

    # Nothing to rank: on a zero-row frame, apply(axis=1) hands back a
    # DataFrame (which has no .tolist()), and there is nothing to embed.
    if len(df_copy.index) == 0:
        return df_copy

    # Concatenate all fields into a single string per row
    corpus = df_copy.apply(_row_to_text, axis=1).tolist()

    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # corpus_id is positional within `corpus`, hence iloc rather than loc.
    return df_copy.iloc[[hit['corpus_id'] for hit in hits]]