Spaces:

AnshulS
/

endpointwebappshl

Sleeping

App Files Community

AnshulS committed on May 9

Commit

7e0bee0

verified ·

1 Parent(s): 385103f

Update retriever.py

Browse files

Files changed (1) hide show

retriever.py +5 -16

retriever.py CHANGED Viewed

@@ -7,19 +7,8 @@ def get_relevant_passages(query, df, top_k=20):
     # Create a copy to avoid modifying the original dataframe
     df_copy = df.copy()
-    # Print shape for debugging
-    print(f"DataFrame shape: {df_copy.shape}")
-    print(f"DataFrame columns: {df_copy.columns.tolist()}")
-    # Handle missing columns gracefully
-    for col in ['description', 'test_type', 'adaptive_support', 'remote_support', 'duration']:
-        if col not in df_copy.columns:
-            df_copy[col] = 'N/A'
     # Ensure URL field is properly formatted
-    if 'url' not in df_copy.columns:
-        df_copy['url'] = 'https://www.shl.com/missing-url'
-    else:
         # Clean up URLs if needed
         df_copy['url'] = df_copy['url'].astype(str)
         # Ensure URLs start with http or https
@@ -29,16 +18,16 @@ def get_relevant_passages(query, df, top_k=20):
     # Format test_type for better representation
     def format_test_type(test_types):
         if isinstance(test_types, list):
-            return ', '.join([str(t) for t in test_types if t])
         return str(test_types)
     # Concatenate all fields into a single string per row
     corpus = df_copy.apply(
-        lambda row: f"{row.get('assessment_name', '')} {row.get('description', '')} "
                    f"Test types: {format_test_type(row['test_type'])}. "
                    f"Adaptive support: {row['adaptive_support']}. "
                    f"Remote support: {row['remote_support']}. "
-                   f"Duration: {row['duration']} minutes.",
         axis=1
     ).tolist()
@@ -46,4 +35,4 @@ def get_relevant_passages(query, df, top_k=20):
     query_embedding = model.encode(query, convert_to_tensor=True)
     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
-    return df_copy.iloc[[hit['corpus_id'] for hit in hits]]

     # Create a copy to avoid modifying the original dataframe
     df_copy = df.copy()
     # Ensure URL field is properly formatted
+    if 'url' in df_copy.columns:
         # Clean up URLs if needed
         df_copy['url'] = df_copy['url'].astype(str)
         # Ensure URLs start with http or https
     # Format test_type for better representation
     def format_test_type(test_types):
         if isinstance(test_types, list):
+            return ', '.join(test_types)
         return str(test_types)
     # Concatenate all fields into a single string per row
     corpus = df_copy.apply(
+        lambda row: f"{row['description']} "
                    f"Test types: {format_test_type(row['test_type'])}. "
                    f"Adaptive support: {row['adaptive_support']}. "
                    f"Remote support: {row['remote_support']}. "
+                   f"Duration: {row['duration'] if pd.notna(row['duration']) else 'N/A'} minutes.",
         axis=1
     ).tolist()
     query_embedding = model.encode(query, convert_to_tensor=True)
     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
+    return df_copy.iloc[[hit['corpus_id'] for hit in hits]]