AnshulS committed on
Commit
e5d19f1
·
verified ·
1 Parent(s): 2a13208

Update retriever.py

Browse files
Files changed (1) hide show
  1. retriever.py +87 -5
retriever.py CHANGED
@@ -16,7 +16,7 @@ def format_test_type(test_types):
16
  return str(test_types)
17
 
18
  def get_relevant_passages(query, df, top_k=20):
19
- """Find most relevant assessments using semantic search."""
20
  # Create a copy to avoid modifying the original dataframe
21
  df_copy = df.copy()
22
 
@@ -36,6 +36,14 @@ def get_relevant_passages(query, df, top_k=20):
36
  ([x] if not isinstance(x, list) else x)
37
  )
38
 
 
 
 
 
 
 
 
 
39
  # Concatenate all fields into a single string per row for embedding
40
  corpus = []
41
  for _, row in df_copy.iterrows():
@@ -46,8 +54,9 @@ def get_relevant_passages(query, df, top_k=20):
46
  remote = row['remote_support'] if 'remote_support' in row else "Unknown"
47
  duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
48
 
49
- text = (f"{description} "
50
- f"Test types: {test_types}. "
 
51
  f"Adaptive support: {adaptive}. "
52
  f"Remote support: {remote}. "
53
  f"Duration: {duration}.")
@@ -60,7 +69,7 @@ def get_relevant_passages(query, df, top_k=20):
60
 
61
  # Generate embeddings
62
  corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
63
- query_embedding = model.encode(query, convert_to_tensor=True)
64
 
65
  # Find most similar
66
  hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
@@ -72,4 +81,77 @@ def get_relevant_passages(query, df, top_k=20):
72
  # Add score for debugging
73
  result['score'] = [hit['score'] for hit in hits]
74
 
75
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  return str(test_types)
17
 
18
  def get_relevant_passages(query, df, top_k=20):
19
+ """Find most relevant and diverse assessments using semantic search with diversity enhancement."""
20
  # Create a copy to avoid modifying the original dataframe
21
  df_copy = df.copy()
22
 
 
36
  ([x] if not isinstance(x, list) else x)
37
  )
38
 
39
+ # Extract keywords from query for better matching
40
+ # This helps target specific skills mentioned in the job description
41
+ keywords = extract_job_keywords(query)
42
+ print(f"Extracted keywords from query: {keywords}")
43
+
44
+ # Expand query with keywords for better semantic search
45
+ expanded_query = f"{query} {' '.join(keywords)}"
46
+
47
  # Concatenate all fields into a single string per row for embedding
48
  corpus = []
49
  for _, row in df_copy.iterrows():
 
54
  remote = row['remote_support'] if 'remote_support' in row else "Unknown"
55
  duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
56
 
57
+ # Enhanced representation with more weight on description and test types
58
+ text = (f"{description} {description} " # Repeat description for more weight
59
+ f"Test types: {test_types} {test_types}. " # Repeat test types for more weight
60
  f"Adaptive support: {adaptive}. "
61
  f"Remote support: {remote}. "
62
  f"Duration: {duration}.")
 
69
 
70
  # Generate embeddings
71
  corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
72
+ query_embedding = model.encode(expanded_query, convert_to_tensor=True)
73
 
74
  # Find most similar
75
  hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
 
81
  # Add score for debugging
82
  result['score'] = [hit['score'] for hit in hits]
83
 
84
+ # Apply diversity enhancement - get a mix of test types if possible
85
+ if len(result) > top_k / 2:
86
+ primary_results = result.head(int(top_k * 0.7)) # Keep top 70% based on relevance
87
+
88
+ # Try to find diverse test types for remaining slots
89
+ test_types_seen = set()
90
+ for _, row in primary_results.iterrows():
91
+ if isinstance(row['test_type'], list):
92
+ for t in row['test_type']:
93
+ test_types_seen.add(t)
94
+ elif isinstance(row['test_type'], str):
95
+ test_types_seen.add(row['test_type'])
96
+
97
+ # Find assessments with different test types
98
+ remaining = result.iloc[int(top_k * 0.7):]
99
+ diverse_picks = []
100
+
101
+ for _, row in remaining.iterrows():
102
+ if len(diverse_picks) >= (top_k - len(primary_results)):
103
+ break
104
+
105
+ new_type_found = False
106
+ if isinstance(row['test_type'], list):
107
+ for t in row['test_type']:
108
+ if t not in test_types_seen:
109
+ new_type_found = True
110
+ test_types_seen.add(t)
111
+ elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
112
+ new_type_found = True
113
+ test_types_seen.add(row['test_type'])
114
+
115
+ if new_type_found:
116
+ diverse_picks.append(row)
117
+
118
+ # Combine primary results with diverse picks
119
+ if diverse_picks:
120
+ diverse_df = pd.DataFrame(diverse_picks)
121
+ result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)
122
+
123
+ return result
124
+
125
def extract_job_keywords(query):
    """Return the known skill and job-title keywords that occur in *query*.

    Matching is a case-insensitive substring test against two fixed
    vocabularies (skill categories, then job titles), so "manager" also
    matches "managers". Each keyword is reported at most once, even when it
    is listed in both vocabularies (e.g. "sales"), in vocabulary order.

    Args:
        query: Free-text job description or search query. ``None`` or an
            empty string yields an empty result.

    Returns:
        A list of matched keyword strings, without duplicates.
    """
    # Common job skill categories that might appear in descriptions
    skill_categories = (
        "competencies", "ability", "aptitude", "personality", "behavior",
        "leadership", "management", "technical", "analytical", "problem-solving",
        "communication", "teamwork", "situational", "judgment", "cognitive",
        "verbal", "numerical", "programming", "coding", "development",
        "sales", "customer service", "administrative", "executive", "professional",
        "entry-level", "senior", "mid-level", "assessment", "test",
    )

    # Job titles that might appear in descriptions
    job_titles = (
        "manager", "director", "analyst", "developer", "engineer", "administrator",
        "assistant", "coordinator", "specialist", "supervisor", "consultant",
        "executive", "officer", "associate", "representative", "technician",
        "accountant", "designer", "sales", "support", "professional",
    )

    # Nothing to search in; also avoids AttributeError on None.
    if not query:
        return []

    query_lower = query.lower()

    # dict.fromkeys keeps first-seen order while dropping terms listed in both
    # vocabularies, so the same keyword is never appended twice.
    vocabulary = dict.fromkeys(skill_categories + job_titles)
    return [term for term in vocabulary if term in query_lower]