Spaces:

AnshulS
/

endpointwebappshl

Sleeping

App Files Files Community

endpointwebappshl / retriever.py

AnshulS

Update retriever.py

f25a758 verified about 1 month ago

raw

history blame contribute delete

3.12 kB

	# Point the Hugging Face / sentence-transformers cache locations at /tmp.
	# These assignments run before the sentence_transformers import below, so
	# the variables are already in the environment when that library loads.
	# NOTE(review): presumably /tmp is the only writable location in the
	# deployment environment -- confirm.
	import os
	os.environ["TRANSFORMERS_CACHE"] = "/tmp"
	os.environ["HF_HOME"] = "/tmp"
	os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/tmp"

	import pandas as pd
	from sentence_transformers import SentenceTransformer, util
	import json

	# Module-level embedding model, instantiated once when this module is
	# imported and reused by get_relevant_passages for corpus and query encoding.
	model = SentenceTransformer("all-MiniLM-L6-v2")

	def format_test_type(test_types):
	    """Render a test-type value as a comma-separated string for embedding.

	    Args:
	        test_types: A list of test types, a string containing a Python list
	            literal (for example ``"['A', 'B']"``), or any other value.

	    Returns:
	        The items joined with ``", "`` when ``test_types`` is a list or a
	        string that parses as a list literal; otherwise ``str(test_types)``.
	    """
	    # Local import so this function does not depend on a file-level import.
	    import ast

	    if isinstance(test_types, list):
	        # str() each item so non-string entries do not break the join.
	        return ', '.join(map(str, test_types))

	    if isinstance(test_types, str) and test_types.startswith('['):
	        # SECURITY: this previously used eval(), which executes arbitrary
	        # code embedded in the data. literal_eval only accepts literals.
	        try:
	            parsed = ast.literal_eval(test_types)
	        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	            # Not a valid literal: fall through to the plain-string result.
	            parsed = None
	        if isinstance(parsed, (list, tuple)):
	            return ', '.join(map(str, parsed))

	    return str(test_types)

	def _parse_test_type(value):
	    """Coerce one ``test_type`` cell to a list without executing any code."""
	    # Local import so this helper does not depend on a file-level import.
	    import ast

	    if isinstance(value, list):
	        return value
	    if isinstance(value, str) and value.startswith('['):
	        # SECURITY: this previously used eval(), which runs arbitrary code
	        # found in the data. literal_eval only accepts Python literals.
	        try:
	            parsed = ast.literal_eval(value)
	        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	            # A malformed cell is kept as a single raw item instead of
	            # aborting the whole search.
	            return [value]
	        return parsed if isinstance(parsed, list) else [value]
	    return [value]


	def _build_passage(row):
	    """Flatten one assessment row into the text that gets embedded."""
	    description = row['description'] if pd.notna(row['description']) else ""
	    test_types = format_test_type(row['test_type']) if 'test_type' in row else ""
	    adaptive = row['adaptive_support'] if 'adaptive_support' in row else "Unknown"
	    remote = row['remote_support'] if 'remote_support' in row else "Unknown"
	    duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"

	    return (f"{description} "
	            f"Test types: {test_types}. "
	            f"Adaptive support: {adaptive}. "
	            f"Remote support: {remote}. "
	            f"Duration: {duration}.")


	def get_relevant_passages(query, df, top_k=20):
	    """Find the rows of ``df`` most relevant to ``query`` via semantic search.

	    Each row is flattened into one text passage (description, test types,
	    adaptive/remote support, duration), embedded with the module-level
	    ``model``, and compared against the embedded query.

	    Args:
	        query: Query text passed to ``model.encode``.
	        df: Dataframe of assessments. It is copied, never modified. The
	            columns read are ``description``, ``test_type``,
	            ``adaptive_support``, ``remote_support`` and ``duration``.
	        top_k: Maximum number of rows to return; capped at the row count.

	    Returns:
	        A new dataframe holding the matched rows in the order returned by
	        ``util.semantic_search``, with an added ``score`` column. If ``df``
	        is empty, an empty copy is returned and no model call is made.
	    """
	    # Work on a copy so the caller's dataframe is never mutated.
	    df_copy = df.copy()

	    if df_copy.empty:
	        print("Warning: Empty dataframe passed to get_relevant_passages")
	        return df_copy

	    # Debug output describing the incoming data.
	    print(f"Dataframe columns: {df_copy.columns}")
	    print(f"Dataframe sample: {df_copy.head(1).to_dict('records')}")

	    # Normalise test_type so every cell holds a list.
	    if 'test_type' in df_copy.columns:
	        df_copy['test_type'] = df_copy['test_type'].apply(_parse_test_type)

	    # Build exactly one passage per row. The broad except is deliberate:
	    # corpus positions must line up with row positions for the iloc lookup
	    # below, so a failing row gets a placeholder instead of being dropped.
	    corpus = []
	    for _, row in df_copy.iterrows():
	        try:
	            corpus.append(_build_passage(row))
	        except Exception as e:
	            print(f"Error processing row: {e}")
	            corpus.append("Error processing assessment")

	    print(f"Created corpus with {len(corpus)} items")

	    # Embed the passages and the query.
	    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
	    query_embedding = model.encode(query, convert_to_tensor=True)

	    # Only one query is searched, so take the first (only) hit list.
	    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]

	    # corpus_id is a position in corpus, which equals the row position.
	    result = df_copy.iloc[[hit['corpus_id'] for hit in hits]].copy()
	    print(f"Found {len(result)} relevant passages")

	    # Attach the similarity score for debugging.
	    result['score'] = [hit['score'] for hit in hits]

	    return result