Update retriever.py

retriever.py CHANGED (+5, -87)
@@ -16,7 +16,7 @@ def format_test_type(test_types):
     return str(test_types)
 
 def get_relevant_passages(query, df, top_k=20):
-    """Find most relevant
+    """Find most relevant assessments using semantic search."""
     # Create a copy to avoid modifying the original dataframe
     df_copy = df.copy()
 
@@ -36,14 +36,6 @@ def get_relevant_passages(query, df, top_k=20):
         ([x] if not isinstance(x, list) else x)
     )
 
-    # Extract keywords from query for better matching
-    # This helps target specific skills mentioned in the job description
-    keywords = extract_job_keywords(query)
-    print(f"Extracted keywords from query: {keywords}")
-
-    # Expand query with keywords for better semantic search
-    expanded_query = f"{query} {' '.join(keywords)}"
-
     # Concatenate all fields into a single string per row for embedding
     corpus = []
     for _, row in df_copy.iterrows():
@@ -54,9 +46,8 @@ def get_relevant_passages(query, df, top_k=20):
         remote = row['remote_support'] if 'remote_support' in row else "Unknown"
         duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
 
-
-
-                f"Test types: {test_types} {test_types}. "  # Repeat test types for more weight
+        text = (f"{description} "
+                f"Test types: {test_types}. "
                 f"Adaptive support: {adaptive}. "
                 f"Remote support: {remote}. "
                 f"Duration: {duration}.")
@@ -69,7 +60,7 @@ def get_relevant_passages(query, df, top_k=20):
 
     # Generate embeddings
     corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
-    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
+    query_embedding = model.encode(query, convert_to_tensor=True)
 
     # Find most similar
     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
@@ -81,77 +72,4 @@ def get_relevant_passages(query, df, top_k=20):
     # Add score for debugging
     result['score'] = [hit['score'] for hit in hits]
 
-
-    if len(result) > top_k / 2:
-        primary_results = result.head(int(top_k * 0.7))  # Keep top 70% based on relevance
-
-        # Try to find diverse test types for remaining slots
-        test_types_seen = set()
-        for _, row in primary_results.iterrows():
-            if isinstance(row['test_type'], list):
-                for t in row['test_type']:
-                    test_types_seen.add(t)
-            elif isinstance(row['test_type'], str):
-                test_types_seen.add(row['test_type'])
-
-        # Find assessments with different test types
-        remaining = result.iloc[int(top_k * 0.7):]
-        diverse_picks = []
-
-        for _, row in remaining.iterrows():
-            if len(diverse_picks) >= (top_k - len(primary_results)):
-                break
-
-            new_type_found = False
-            if isinstance(row['test_type'], list):
-                for t in row['test_type']:
-                    if t not in test_types_seen:
-                        new_type_found = True
-                        test_types_seen.add(t)
-            elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
-                new_type_found = True
-                test_types_seen.add(row['test_type'])
-
-            if new_type_found:
-                diverse_picks.append(row)
-
-        # Combine primary results with diverse picks
-        if diverse_picks:
-            diverse_df = pd.DataFrame(diverse_picks)
-            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)
-
-    return result
-
-def extract_job_keywords(query):
-    """Extract relevant keywords from job description for better matching."""
-    # Common job skill categories that might appear in descriptions
-    skill_categories = [
-        "competencies", "ability", "aptitude", "personality", "behavior",
-        "leadership", "management", "technical", "analytical", "problem-solving",
-        "communication", "teamwork", "situational", "judgment", "cognitive",
-        "verbal", "numerical", "programming", "coding", "development",
-        "sales", "customer service", "administrative", "executive", "professional",
-        "entry-level", "senior", "mid-level", "assessment", "test"
-    ]
-
-    # Look for these keywords in the query
-    found_keywords = []
-    query_lower = query.lower()
-
-    for keyword in skill_categories:
-        if keyword in query_lower:
-            found_keywords.append(keyword)
-
-    # Add any job titles found
-    job_titles = [
-        "manager", "director", "analyst", "developer", "engineer", "administrator",
-        "assistant", "coordinator", "specialist", "supervisor", "consultant",
-        "executive", "officer", "associate", "representative", "technician",
-        "accountant", "designer", "sales", "support", "professional"
-    ]
-
-    for title in job_titles:
-        if title in query_lower:
-            found_keywords.append(title)
-
-    return found_keywords
+    return result
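After this change, get_relevant_passages is a plain bi-encoder retrieval step: it embeds each catalog row and the raw query with the module's SentenceTransformer and ranks rows with util.semantic_search, without the keyword expansion or the test-type diversity re-ranking that this commit removes. Below is a minimal usage sketch, not part of the commit: the import path and the 'description' / 'adaptive_support' column names are assumptions ('test_type', 'remote_support', 'duration' and the 'score' output column are shown in the diff).

# Hypothetical usage sketch; assumes retriever.py is importable and defines
# its own SentenceTransformer `model` at module level.
import pandas as pd

from retriever import get_relevant_passages  # assumed import path

catalog = pd.DataFrame([
    {"description": "Graduate numerical reasoning test",     # column name assumed
     "test_type": ["Ability & Aptitude"], "adaptive_support": "Yes",
     "remote_support": "Yes", "duration": 30},
    {"description": "Java developer coding simulation",
     "test_type": ["Knowledge & Skills"], "adaptive_support": "No",
     "remote_support": "Yes", "duration": 45},
])

results = get_relevant_passages("Hiring a Java developer; need a short coding assessment",
                                catalog, top_k=2)
print(results[["test_type", "score"]])  # 'score' is attached by the function for debugging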