import pandas as pd
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")
def get_relevant_passages(query, df, top_k=20):
    # Create a copy to avoid modifying the original dataframe
    df_copy = df.copy()

    # Print shape for debugging
    print(f"DataFrame shape: {df_copy.shape}")
    print(f"DataFrame columns: {df_copy.columns.tolist()}")

    # Handle missing columns gracefully
    for col in ['description', 'test_type', 'adaptive_support', 'remote_support', 'duration']:
        if col not in df_copy.columns:
            df_copy[col] = 'N/A'

    # Ensure URL field is properly formatted
    if 'url' not in df_copy.columns:
        df_copy['url'] = 'https://www.shl.com/missing-url'
    else:
        # Clean up URLs if needed
        df_copy['url'] = df_copy['url'].astype(str)
        # Ensure URLs start with http or https
        mask = ~df_copy['url'].str.startswith(('http://', 'https://'))
        df_copy.loc[mask, 'url'] = 'https://www.shl.com/' + df_copy.loc[mask, 'url'].str.lstrip('/')

    # Format test_type for better representation
    def format_test_type(test_types):
        if isinstance(test_types, list):
            return ', '.join([str(t) for t in test_types if t])
        return str(test_types)

    # Concatenate all fields into a single string per row
    corpus = df_copy.apply(
        lambda row: f"{row.get('assessment_name', '')} {row.get('description', '')} "
                    f"Test types: {format_test_type(row['test_type'])}. "
                    f"Adaptive support: {row['adaptive_support']}. "
                    f"Remote support: {row['remote_support']}. "
                    f"Duration: {row['duration']} minutes.",
        axis=1
    ).tolist()

    # Embed the corpus and the query, then return the top_k most similar rows
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    return df_copy.iloc[[hit['corpus_id'] for hit in hits]]
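

# Example usage: a minimal sketch, not part of the original app. The DataFrame
# below is hypothetical sample data, assumed only to carry the columns the
# function reads ('assessment_name', 'description', 'test_type',
# 'adaptive_support', 'remote_support', 'duration', 'url'); the query string
# is likewise illustrative.
if __name__ == "__main__":
    sample_df = pd.DataFrame([
        {
            "assessment_name": "Java Developer Test",
            "description": "Assesses core Java programming skills.",
            "test_type": ["Knowledge & Skills"],
            "adaptive_support": "No",
            "remote_support": "Yes",
            "duration": 40,
            "url": "products/java-developer-test",  # relative URL; will be prefixed
        },
        {
            "assessment_name": "Numerical Reasoning",
            "description": "Measures numerical problem-solving ability.",
            "test_type": ["Ability & Aptitude"],
            "adaptive_support": "Yes",
            "remote_support": "Yes",
            "duration": 25,
            "url": "https://www.shl.com/products/numerical-reasoning",
        },
    ])
    results = get_relevant_passages("entry-level Java engineer", sample_df, top_k=2)
    print(results[["assessment_name", "url"]])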