File size: 3,122 Bytes
f25a758
 
 
 
 
2133db4
6e0aaf8
24f19c6
2133db4
6e0aaf8
 
24f19c6
 
 
 
 
 
 
 
 
 
 
6e0aaf8
831f81c
6e0aaf8
 
397f5c9
24f19c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
831f81c
 
24f19c6
 
 
 
 
 
 
04fa7f5
24f19c6
 
 
6e0aaf8
831f81c
397f5c9
24f19c6
 
 
 
 
 
 
 
 
 
831f81c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
# Point the Hugging Face / sentence-transformers cache locations at /tmp.
# These assignments sit deliberately *before* the sentence_transformers import
# below, so the values are already in the environment when that package loads.
# NOTE(review): presumably the default cache directory is not writable in the
# deployment environment — confirm before changing the path or the ordering.
os.environ["TRANSFORMERS_CACHE"] = "/tmp"
os.environ["HF_HOME"] = "/tmp"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp"

import pandas as pd
from sentence_transformers import SentenceTransformer, util
import json

# Module-level embedding model, created once at import time and reused by
# get_relevant_passages() for both the corpus and the query embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

def format_test_type(test_types):
    """Return a comma-separated, human-readable string of test types.

    Args:
        test_types: Either a list of test types, a string holding the Python
            literal form of a list (for example ``"['A', 'B']"``), or any
            other single value.

    Returns:
        The list items joined with ``", "`` when a list (or a parsable list
        literal) is given; otherwise ``str(test_types)``.
    """
    # Local import keeps this function self-contained.
    import ast

    if isinstance(test_types, str) and test_types.startswith('['):
        # SECURITY: this value comes from the data set, so it is parsed with
        # ast.literal_eval (literals only) rather than eval(), which would
        # execute arbitrary expressions embedded in the cell.
        try:
            parsed = ast.literal_eval(test_types)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid literal: fall through and return the raw text.
            parsed = None
        if isinstance(parsed, list):
            test_types = parsed

    if isinstance(test_types, list):
        # str() each item so lists holding non-string values do not make
        # str.join raise TypeError.
        return ', '.join(str(item) for item in test_types)

    return str(test_types)

def _parse_test_type(value):
    """Return *value* as a list, parsing stringified lists without ``eval``."""
    import ast

    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.startswith('['):
        # SECURITY: cell contents are data, not code; literal_eval only
        # accepts Python literals, unlike eval().
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Malformed literal: keep the raw text rather than abort the search.
            return [value]
        return parsed if isinstance(parsed, list) else [parsed]
    return [value]


def _is_missing(value):
    """Return True when *value* is a scalar missing marker (None/NaN/NaT)."""
    if isinstance(value, (list, tuple, set, dict)):
        return False
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-like values have no single truth value; treat as present.
        return False


def _describe_assessment(row):
    """Build the single text string that is embedded for one dataframe row."""
    def field(name, default):
        value = row.get(name)
        return default if _is_missing(value) else value

    if 'test_type' in row:
        raw_types = row['test_type']
        items = raw_types if isinstance(raw_types, list) else [raw_types]
        # Drop missing entries and stringify the rest so the join inside
        # format_test_type cannot fail on NaN or numeric items.
        test_types = format_test_type(
            [str(item) for item in items if not _is_missing(item)]
        )
    else:
        test_types = ""

    description = field('description', "")
    adaptive = field('adaptive_support', "Unknown")
    remote = field('remote_support', "Unknown")
    minutes = row.get('duration')
    duration = "Unknown duration" if _is_missing(minutes) else f"{minutes} minutes"

    return (f"{description} "
            f"Test types: {test_types}. "
            f"Adaptive support: {adaptive}. "
            f"Remote support: {remote}. "
            f"Duration: {duration}.")


def get_relevant_passages(query, df, top_k=20):
    """Find the assessments most relevant to *query* using semantic search.

    Each row of *df* is turned into one text string (description, test types,
    adaptive support, remote support, duration), embedded with the
    module-level ``model``, and compared against the embedded query via
    ``util.semantic_search``.

    Args:
        query: Text to search for; passed to ``model.encode``.
        df: DataFrame of assessments. The columns ``description``,
            ``test_type``, ``adaptive_support``, ``remote_support`` and
            ``duration`` are used when present; absent or missing values fall
            back to placeholder text. *df* itself is never modified.
        top_k: Maximum number of rows to return (capped at the row count).

    Returns:
        A new DataFrame holding the best-matching rows in the order returned
        by ``util.semantic_search``, with ``test_type`` normalised to a list
        and an added ``score`` column. When *df* is empty, an empty copy is
        returned without a ``score`` column.
    """
    # Work on a copy so the caller's dataframe is left untouched.
    df_copy = df.copy()

    if df_copy.empty:
        print("Warning: Empty dataframe passed to get_relevant_passages")
        return df_copy

    # Display dataframe info for debugging
    print(f"Dataframe columns: {df_copy.columns}")
    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

    # Normalise test_type so every cell holds a list.
    if 'test_type' in df_copy.columns:
        df_copy['test_type'] = df_copy['test_type'].apply(_parse_test_type)

    # One text per row; a row that cannot be processed gets a placeholder so
    # corpus positions stay aligned with dataframe positions.
    corpus = []
    for _, row in df_copy.iterrows():
        try:
            corpus.append(_describe_assessment(row))
        except Exception as e:
            print(f"Error processing row: {e}")
            corpus.append("Error processing assessment")

    print(f"Created corpus with {len(corpus)} items")

    # Generate embeddings
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Only one query is searched, hence the [0].
    hits = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus))
    )[0]

    # corpus_id is the positional index into corpus, and therefore into df_copy.
    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
    print(f"Found {len(result)} relevant passages")

    # Add score for debugging
    result['score'] = [hit['score'] for hit in hits]

    return result