File size: 1,616 Bytes
2133db4
 
 
 
 
 
397f5c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
759fd26
397f5c9
 
 
 
 
 
 
 
 
2133db4
 
 
397f5c9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Shared sentence-embedding model, loaded once at import time.
# all-MiniLM-L6-v2 is a small general-purpose model from sentence-transformers;
# NOTE(review): loading at module import blocks startup until weights are cached.
model = SentenceTransformer("all-MiniLM-L6-v2")

def _format_test_type(test_types):
    """Render a test_type cell as a display string (list -> comma-joined)."""
    if isinstance(test_types, list):
        return ', '.join(test_types)
    return str(test_types)


def _row_to_passage(row):
    """Concatenate a catalogue row's fields into one searchable passage."""
    duration = row['duration'] if pd.notna(row['duration']) else 'N/A'
    return (f"{row['description']} "
            f"Test types: {_format_test_type(row['test_type'])}. "
            f"Adaptive support: {row['adaptive_support']}. "
            f"Remote support: {row['remote_support']}. "
            f"Duration: {duration} minutes.")


def get_relevant_passages(query, df, top_k=20):
    """Return the top_k rows of df most semantically similar to query.

    Builds a text passage per row, embeds passages and query with the
    module-level sentence-transformer model, and ranks rows by cosine
    similarity via util.semantic_search.

    Args:
        query: Free-text search query string.
        df: pandas.DataFrame with columns 'description', 'test_type',
            'adaptive_support', 'remote_support', 'duration', and
            optionally 'url'.
        top_k: Maximum number of rows to return (default 20).

    Returns:
        pandas.DataFrame: up to top_k rows of a cleaned copy of df,
        ordered by descending similarity to the query.
    """
    # Work on a copy so the caller's dataframe is never mutated.
    df_copy = df.copy()

    # Normalize URLs: coerce to str and prefix relative paths with the
    # site root so every returned URL is absolute.
    if 'url' in df_copy.columns:
        df_copy['url'] = df_copy['url'].astype(str)
        mask = ~df_copy['url'].str.startswith(('http://', 'https://'))
        df_copy.loc[mask, 'url'] = 'https://www.shl.com/' + df_copy.loc[mask, 'url'].str.lstrip('/')

    # Guard: encoding an empty corpus would make semantic_search fail,
    # so return the (cleaned) empty frame directly.
    if df_copy.empty:
        return df_copy

    # One passage per row; semantic_search ranks these against the query.
    corpus = [_row_to_passage(row) for _, row in df_copy.iterrows()]

    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Positional indexing: corpus_id is the row's position in df_copy.
    return df_copy.iloc[[hit['corpus_id'] for hit in hits]]