# Source page: endpointwebappshl / reranker.py
# Last commit: "Update reranker.py" by AnshulS (9d9d3fa, verified) - 4.24 kB
import os
import re
import json
import google.generativeai as genai
# Configure the Gemini client at import time. The API key is read from the
# GEMINI_API_KEY environment variable; when the variable is unset an empty
# string is passed instead.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", ""))
# Module-level model handle; rerank() calls model.generate_content() on it.
model = genai.GenerativeModel("models/gemini-2.0-flash")
# Fields copied from each candidate into the prompt, in prompt order.
_RERANK_FIELDS = (
    "url",
    "adaptive_support",
    "remote_support",
    "description",
    "duration",
    "test_type",
)


def _clean_candidate(candidate):
    """Return a copy of *candidate* limited to the prompt fields, filling defaults."""
    cleaned = {}
    for field in _RERANK_FIELDS:
        if field in candidate:
            cleaned[field] = candidate[field]
        elif field == "test_type":
            # Fresh list per candidate so defaults are never shared.
            cleaned[field] = ["Unknown"]
        elif field == "duration":
            cleaned[field] = None
        else:
            cleaned[field] = "Unknown"
    return cleaned


def _build_prompt(query, cleaned_candidates):
    """Build the Gemini prompt embedding the query and the JSON-serialized candidates."""
    # The prompt text is deliberately left-aligned so that no indentation
    # from the source file leaks into the string sent to the model.
    return f"""
Given a job description, rank the most relevant SHL assessments based on how well they match the job requirements.
Job description: "{query}"
Candidate SHL assessments: {json.dumps(cleaned_candidates, indent=2)}
Rank the most relevant assessments and return a JSON list in this format:
{{
"recommended_assessments": [
{{
"url": "...",
"adaptive_support": "Yes/No",
"remote_support": "Yes/No",
"description": "...",
"duration": integer or null,
"test_type": ["type1", "type2", ...]
}}
]
}}
CRITICAL INSTRUCTIONS:
1. Return ONLY valid JSON without any markdown code blocks or extra text
2. Preserve the exact URL values from the input - do not modify them
3. Include all fields from the original assessment data
4. Limit to the top 5 most relevant assessments
5. Ensure the JSON is properly formatted with all fields
6. Keep all test_type values as arrays/lists, even if there's only one type
"""


def _parse_response(response_text):
    """Parse the model's reply into the result dict, patching missing fields."""
    # The model may wrap its JSON in a markdown code fence despite the
    # instructions; unwrap the first fenced block if one is present.
    fenced = re.search(r'```(?:json)?\s*(.*?)```', response_text, re.DOTALL)
    if fenced:
        response_text = fenced.group(1).strip()

    result = json.loads(response_text)

    # Require a JSON object: `in` on a str would be a substring test and
    # would let a malformed reply slip past this validation.
    if not isinstance(result, dict) or "recommended_assessments" not in result:
        return {"error": "Invalid response format: missing recommended_assessments key"}

    for assessment in result["recommended_assessments"]:
        if "url" not in assessment:
            assessment["url"] = "https://www.shl.com/missing-url"
        if "test_type" not in assessment:
            assessment["test_type"] = ["Unknown"]
        elif not isinstance(assessment["test_type"], list):
            # Normalize a scalar test_type into a one-element list.
            assessment["test_type"] = [assessment["test_type"]]
    return result


def rerank(query, candidates):
    """
    Rerank the candidate assessments using Gemini.

    Each candidate is reduced to the fields in ``_RERANK_FIELDS`` (missing
    fields get placeholder defaults), embedded in a prompt together with the
    query, and sent to the module-level ``model``. The reply is parsed as
    JSON, optionally unwrapped from a markdown code fence.

    Args:
        query: The job description
        candidates: List of assessment dictionaries

    Returns:
        Dictionary containing the recommended assessments under the
        ``"recommended_assessments"`` key, or a dictionary with a single
        ``"error"`` key when there are no candidates, the reply is malformed,
        or any step (cleaning, serialization, the API call, JSON parsing)
        raises.
    """
    # Ensure we have candidates
    if not candidates:
        return {"error": "No candidate assessments to rerank"}

    # Everything below runs inside one try so that bad candidate data
    # (non-dict entries, values json cannot serialize) is reported through
    # the same {"error": ...} channel as API and parsing failures instead of
    # escaping as an uncaught exception.
    try:
        # Print debugging info. default=str keeps the debug dump from
        # failing on fields that are never sent to the model.
        print(f"Reranking {len(candidates)} candidates")
        print(f"Sample candidate: {json.dumps(candidates[0], indent=2, default=str)}")

        cleaned_candidates = [_clean_candidate(candidate) for candidate in candidates]
        prompt = _build_prompt(query, cleaned_candidates)

        response = model.generate_content(prompt)
        return _parse_response(response.text)
    except Exception as e:
        error_msg = f"Error in reranking: {str(e)}"
        print(error_msg)
        return {"error": error_msg}