AnshulS committed
Commit 831f81c · verified · 1 Parent(s): 08dabce

Update retriever.py

Files changed (1)
  1. retriever.py +5 -87
retriever.py CHANGED
@@ -16,7 +16,7 @@ def format_test_type(test_types):
     return str(test_types)
 
 def get_relevant_passages(query, df, top_k=20):
-    """Find most relevant and diverse assessments using semantic search with diversity enhancement."""
+    """Find most relevant assessments using semantic search."""
     # Create a copy to avoid modifying the original dataframe
     df_copy = df.copy()
 
@@ -36,14 +36,6 @@ def get_relevant_passages(query, df, top_k=20):
         ([x] if not isinstance(x, list) else x)
     )
 
-    # Extract keywords from query for better matching
-    # This helps target specific skills mentioned in the job description
-    keywords = extract_job_keywords(query)
-    print(f"Extracted keywords from query: {keywords}")
-
-    # Expand query with keywords for better semantic search
-    expanded_query = f"{query} {' '.join(keywords)}"
-
     # Concatenate all fields into a single string per row for embedding
     corpus = []
     for _, row in df_copy.iterrows():
@@ -54,9 +46,8 @@ def get_relevant_passages(query, df, top_k=20):
         remote = row['remote_support'] if 'remote_support' in row else "Unknown"
         duration = f"{row['duration']} minutes" if pd.notna(row.get('duration')) else "Unknown duration"
 
-        # Enhanced representation with more weight on description and test types
-        text = (f"{description} {description} "  # Repeat description for more weight
-                f"Test types: {test_types} {test_types}. "  # Repeat test types for more weight
+        text = (f"{description} "
+                f"Test types: {test_types}. "
                 f"Adaptive support: {adaptive}. "
                 f"Remote support: {remote}. "
                 f"Duration: {duration}.")
@@ -69,7 +60,7 @@ def get_relevant_passages(query, df, top_k=20):
 
     # Generate embeddings
     corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
-    query_embedding = model.encode(expanded_query, convert_to_tensor=True)
+    query_embedding = model.encode(query, convert_to_tensor=True)
 
     # Find most similar
     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=min(top_k, len(corpus)))[0]
@@ -81,77 +72,4 @@ def get_relevant_passages(query, df, top_k=20):
     # Add score for debugging
     result['score'] = [hit['score'] for hit in hits]
 
-    # Apply diversity enhancement - get a mix of test types if possible
-    if len(result) > top_k / 2:
-        primary_results = result.head(int(top_k * 0.7))  # Keep top 70% based on relevance
-
-        # Try to find diverse test types for remaining slots
-        test_types_seen = set()
-        for _, row in primary_results.iterrows():
-            if isinstance(row['test_type'], list):
-                for t in row['test_type']:
-                    test_types_seen.add(t)
-            elif isinstance(row['test_type'], str):
-                test_types_seen.add(row['test_type'])
-
-        # Find assessments with different test types
-        remaining = result.iloc[int(top_k * 0.7):]
-        diverse_picks = []
-
-        for _, row in remaining.iterrows():
-            if len(diverse_picks) >= (top_k - len(primary_results)):
-                break
-
-            new_type_found = False
-            if isinstance(row['test_type'], list):
-                for t in row['test_type']:
-                    if t not in test_types_seen:
-                        new_type_found = True
-                        test_types_seen.add(t)
-            elif isinstance(row['test_type'], str) and row['test_type'] not in test_types_seen:
-                new_type_found = True
-                test_types_seen.add(row['test_type'])
-
-            if new_type_found:
-                diverse_picks.append(row)
-
-        # Combine primary results with diverse picks
-        if diverse_picks:
-            diverse_df = pd.DataFrame(diverse_picks)
-            result = pd.concat([primary_results, diverse_df]).reset_index(drop=True)
-
-    return result
-
-def extract_job_keywords(query):
-    """Extract relevant keywords from job description for better matching."""
-    # Common job skill categories that might appear in descriptions
-    skill_categories = [
-        "competencies", "ability", "aptitude", "personality", "behavior",
-        "leadership", "management", "technical", "analytical", "problem-solving",
-        "communication", "teamwork", "situational", "judgment", "cognitive",
-        "verbal", "numerical", "programming", "coding", "development",
-        "sales", "customer service", "administrative", "executive", "professional",
-        "entry-level", "senior", "mid-level", "assessment", "test"
-    ]
-
-    # Look for these keywords in the query
-    found_keywords = []
-    query_lower = query.lower()
-
-    for keyword in skill_categories:
-        if keyword in query_lower:
-            found_keywords.append(keyword)
-
-    # Add any job titles found
-    job_titles = [
-        "manager", "director", "analyst", "developer", "engineer", "administrator",
-        "assistant", "coordinator", "specialist", "supervisor", "consultant",
-        "executive", "officer", "associate", "representative", "technician",
-        "accountant", "designer", "sales", "support", "professional"
-    ]
-
-    for title in job_titles:
-        if title in query_lower:
-            found_keywords.append(title)
-
-    return found_keywords
+    return result
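
For reference, the retrieval core that remains after this commit is plain sentence-transformers semantic search over one concatenated text string per assessment, with no keyword-based query expansion and no diversity re-ranking. The sketch below is a minimal, self-contained illustration of that flow under stated assumptions: the model name and the sample rows are invented for the example, and retriever.py loads its own model and data elsewhere.

    # Minimal sketch of the simplified retrieval path (assumed model and sample data)
    import pandas as pd
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model, not from the repo

    df = pd.DataFrame([
        {"description": "Timed numerical reasoning test", "test_type": "Cognitive",
         "adaptive_support": "Yes", "remote_support": "Yes", "duration": 30},
        {"description": "Situational judgement questionnaire", "test_type": "Behavioral",
         "adaptive_support": "No", "remote_support": "Yes", "duration": 45},
    ])

    # One string per row, mirroring the un-weighted representation after this change
    corpus = [
        f"{row['description']} Test types: {row['test_type']}. "
        f"Adaptive support: {row['adaptive_support']}. "
        f"Remote support: {row['remote_support']}. "
        f"Duration: {row['duration']} minutes."
        for _, row in df.iterrows()
    ]

    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode("numerical reasoning test for analysts", convert_to_tensor=True)

    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
    result = df.iloc[[hit["corpus_id"] for hit in hits]].copy()
    result["score"] = [hit["score"] for hit in hits]
    print(result[["description", "score"]])

With the keyword expansion and the diversity pass removed, ranking is driven entirely by embedding similarity between the raw query and each assessment's concatenated fields.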