root committed · Commit 1ced284 · 1 Parent(s): d5266d0

Commit message: ss

Browse files:
- app.py +569 -392
- requirements.txt +7 -6
app.py CHANGED

@@ -15,7 +15,7 @@ from docx import Document
 import csv
 from datasets import load_dataset
 import gc
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import time
 import faiss
 import re
@@ -34,117 +34,107 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 
-#
-… (≈34 removed lines not preserved in this view)
-else:
-    error_msg = f"Failed to load Embedding Model: {error_str}"
-    print(f"❌ [Global Init] {error_msg}")
-    st.session_state.embedding_model_error = error_msg
-
-#
-if … (line truncated in this view)
-… (≈25 removed lines not preserved in this view)
+# Sidebar configuration
+with st.sidebar:
+    st.title("⚙️ Configuration")
+
+    # Ranking weights
+    st.subheader("Ranking Weights")
+    semantic_weight = st.slider("Semantic Similarity Weight", 0.0, 1.0, 0.7, 0.1)
+    keyword_weight = 1.0 - semantic_weight
+    st.write(f"Keyword Weight: {keyword_weight:.1f}")
+
+    # Advanced options
+    st.subheader("Advanced Options")
+    top_k = st.selectbox("Number of results to display", options=[1, 2, 3, 4, 5], index=4)
+
+    # LLM Settings
+    st.subheader("LLM Settings")
+    use_llm_explanations = st.checkbox("Generate AI Explanations", value=True)
+    if use_llm_explanations:
+        hf_token = st.text_input("Hugging Face Token (optional)", type="password",
+                                 help="Enter your HF token for better rate limits")
+
+    st.markdown("---")
+    st.markdown("### 🤖 Advanced Pipeline")
+    st.markdown("- **Stage 1**: FAISS Recall (Top 50)")
+    st.markdown("- **Stage 2**: Cross-Encoder Re-ranking (Top 20)")
+    st.markdown("- **Stage 3**: BM25 Keyword Matching")
+    st.markdown("- **Stage 4**: LLM Intent Analysis")
+    st.markdown("- **Final**: Combined Scoring (Top 5)")
+    st.markdown("### 🧠 Models Used")
+    st.markdown("- **Embedding**: BAAI/bge-large-en-v1.5")
+    st.markdown("- **Cross-Encoder**: ms-marco-MiniLM-L6-v2")
+    st.markdown("- **LLM Explanations**: Qwen/Qwen3-14B")
+    st.markdown("- **Intent Analysis**: Qwen/Qwen3-1.7B")
+    st.markdown("### 📊 Scoring Formula")
+    st.markdown("**Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)**")
 
+# Initialize session state
+if 'embedding_model' not in st.session_state:
+    st.session_state.embedding_model = None
+if 'cross_encoder' not in st.session_state:
+    st.session_state.cross_encoder = None
+if 'results' not in st.session_state:
+    st.session_state.results = []
+if 'resume_texts' not in st.session_state:
+    st.session_state.resume_texts = []
+if 'file_names' not in st.session_state:
+    st.session_state.file_names = []
+if 'explanations_generated' not in st.session_state:
+    st.session_state.explanations_generated = False
+if 'current_job_description' not in st.session_state:
+    st.session_state.current_job_description = ""
+if 'qwen3_tokenizer' not in st.session_state:
+    st.session_state.qwen3_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")
+if 'qwen3_model' not in st.session_state:
+    st.session_state.qwen3_model = AutoModelForCausalLM.from_pretrained(
+        "Qwen/Qwen3-14B",
+        torch_dtype="auto",
+        device_map="auto"
+    )
+# Separate smaller model for intent analysis
+if 'qwen3_intent_tokenizer' not in st.session_state:
+    st.session_state.qwen3_intent_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
+if 'qwen3_intent_model' not in st.session_state:
+    st.session_state.qwen3_intent_model = AutoModelForCausalLM.from_pretrained(
+        "Qwen/Qwen3-1.7B",
+        torch_dtype="auto",
+        device_map="auto"
+    )
 
+@st.cache_resource
+def load_embedding_model():
+    """Load and cache the BGE embedding model"""
     try:
-        st.… (line truncated in this view)
-        … (removed line not preserved in this view)
+        with st.spinner("🔄 Loading BAAI/bge-large-en-v1.5 model..."):
+            model = SentenceTransformer('BAAI/bge-large-en-v1.5')
+        st.success("✅ Embedding model loaded successfully!")
+        return model
     except Exception as e:
-        … (2 removed lines not preserved in this view)
-        st.session_state.qwen3_4b_tokenizer_error = error_msg
+        st.error(f"❌ Error loading embedding model: {str(e)}")
+        return None
 
+@st.cache_resource
+def load_cross_encoder():
+    """Load and cache the Cross-Encoder model"""
     try:
-        … (≈5 removed lines not preserved in this view)
-        )
-        st.session_state.qwen3_4b_model = AutoModelForCausalLM.from_pretrained(
-            "Qwen/Qwen3-4B",
-            quantization_config=quantization_config,
-            device_map="auto",
-            trust_remote_code=True,
-            torch_dtype=torch.float16,
-            use_cache=True
-        )
-        print("[Global Init] Qwen3-4B Model Loaded with 4-bit quantization.")
+        with st.spinner("🔄 Loading Cross-Encoder ms-marco-MiniLM-L6-v2..."):
+            from sentence_transformers import CrossEncoder
+            model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
+        st.success("✅ Cross-Encoder model loaded successfully!")
+        return model
     except Exception as e:
-        … (2 removed lines not preserved in this view)
-        st.session_state.qwen3_4b_model_error = error_msg
-
-# --- End of Global Model Loading Section ---
-
-# --- Class Definitions and Helper Functions ---
+        st.error(f"❌ Error loading Cross-Encoder model: {str(e)}")
+        return None
 
 def generate_qwen3_response(prompt, tokenizer, model, max_new_tokens=200):
-    # ... (implementation of generate_qwen3_response)
     messages = [{"role": "user", "content": prompt}]
     text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True,
-        enable_thinking=True
+        enable_thinking=True
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
     generated_ids = model.generate(
@@ -155,32 +145,14 @@ def generate_qwen3_response(prompt, tokenizer, model, max_new_tokens=200):
     response = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
     return response
 
-class ResumeScreener:
+class ResumeScreener:
     def __init__(self):
-        #
-        self.embedding_model = st.session_state.get('embedding_model')
-        self.cross_encoder = st.session_state.get('cross_encoder')
-
-        if self.embedding_model:
-            print("[ResumeScreener] Embedding model reference set.")
-        else:
-            print("[ResumeScreener] Embedding model not available (check loading errors).")
-
-        if self.cross_encoder:
-            print("[ResumeScreener] Cross-encoder model reference set.")
-        else:
-            print("[ResumeScreener] Cross-encoder model not available (check loading errors).")
-
-        print("[ResumeScreener] Initialization complete.")
-
-    # ... (all other methods of ResumeScreener: extract_text_from_file, get_embedding,
-    # calculate_bm25_scores, advanced_pipeline_ranking, faiss_recall, cross_encoder_rerank,
-    # add_bm25_scores, add_intent_scores, analyze_intent, calculate_final_scores, extract_skills)
-    # Make sure all methods are correctly indented within the class
+        # Load models
+        self.embedding_model = load_embedding_model()
+        self.cross_encoder = load_cross_encoder()
 
     def extract_text_from_file(self, file_path, file_type):
+        """Extract text from various file types"""
         try:
             if file_type == "pdf":
                 with open(file_path, 'rb') as file:
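A note on the hunk above: the old `analyze_intent` (removed further down in this diff) stripped Qwen3's `<think>...</think>` block before parsing the verdict, while the new code keeps `enable_thinking=True` in `generate_qwen3_response` but parses the raw response. A minimal sketch of that stripping step, assuming Qwen3's thinking markup; `strip_think_block` is an illustrative name, not part of this commit:

```python
import re

def strip_think_block(response: str) -> str:
    # Drop a leading <think>...</think> section, if present, and return the rest.
    return re.sub(r"<think>.*?</think>", "", response, count=1, flags=re.DOTALL).strip()

answer = strip_think_block("<think>weighing the resume...</think>\nIntent: Yes")
print(answer)  # -> "Intent: Yes"
```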
@@ -188,165 +160,249 @@ class ResumeScreener: # Ensure this class definition is BEFORE it's instantiated
                     text = ""
                     for page in pdf.pages:
                         text += page.extract_text() or ""
+
                 if not text.strip():
+                    # Fallback to PyPDF2
                     file.seek(0)
                     reader = PyPDF2.PdfReader(file)
                     text = ""
-                    for … (line truncated in this view)
-                    text += … (line truncated in this view)
+                    for page in reader.pages:
+                        text += page.extract_text() or ""
                 return text
+
             elif file_type == "docx":
                 doc = Document(file_path)
                 return " ".join([paragraph.text for paragraph in doc.paragraphs])
+
             elif file_type == "txt":
                 with open(file_path, 'r', encoding='utf-8') as file:
                     return file.read()
+
             elif file_type == "csv":
                 with open(file_path, 'r', encoding='utf-8') as file:
                     csv_reader = csv.reader(file)
                     return " ".join([" ".join(row) for row in csv_reader])
+
         except Exception as e:
             st.error(f"Error extracting text from {file_path}: {str(e)}")
             return ""
 
     def get_embedding(self, text):
+        """Generate embedding for text using BGE model"""
         if self.embedding_model is None:
-            st.error("… (message truncated in this view)
-            return np.zeros(1024)
+            st.error("No embedding model loaded!")
+            return np.zeros(1024)  # BGE-large dimension
+
         try:
-            … (removed line not preserved in this view)
+            # BGE models recommend adding instruction for retrieval
+            # For queries (job description)
+            if len(text) < 500:  # Assuming shorter texts are queries
                 text = "Represent this sentence for searching relevant passages: " + text
+
+            # Truncate text to avoid memory issues
             text = text[:8192] if text else ""
-            … (removed line not preserved in this view)
+
+            # Generate embedding
+            embedding = self.embedding_model.encode(text,
+                                                    convert_to_numpy=True,
+                                                    normalize_embeddings=True)
             return embedding
+
         except Exception as e:
             st.error(f"Error generating embedding: {str(e)}")
-            return np.zeros(1024)
+            return np.zeros(1024)  # BGE-large dimension
 
     def calculate_bm25_scores(self, resume_texts, job_description):
+        """Calculate BM25 scores for keyword matching"""
         try:
             job_tokens = word_tokenize(job_description.lower())
             corpus = [word_tokenize(text.lower()) for text in resume_texts if text and text.strip()]
+
             if not corpus:
                 return [0.0] * len(resume_texts)
+
             bm25 = BM25Okapi(corpus)
             scores = bm25.get_scores(job_tokens)
             return scores.tolist()
+
         except Exception as e:
             st.error(f"Error calculating BM25 scores: {str(e)}")
             return [0.0] * len(resume_texts)
 
-    def advanced_pipeline_ranking(self, resume_texts, job_description):
-        … (removed line not preserved in this view)
+    def advanced_pipeline_ranking(self, resume_texts, job_description, final_top_k=5):
+        """Advanced pipeline: FAISS recall -> Cross-encoder -> BM25 -> LLM intent -> Final ranking"""
         if not resume_texts:
             return []
+
+        # Stage 1: FAISS Recall (Top 50)
+        st.write("🔍 **Stage 1**: FAISS Recall - Finding top 50 candidates...")
         top_50_indices = self.faiss_recall(resume_texts, job_description, top_k=50)
+
+        # Stage 2: Cross-Encoder Re-ranking (Top 20)
+        st.write("🎯 **Stage 2**: Cross-Encoder Re-ranking - Selecting top 20...")
         top_20_results = self.cross_encoder_rerank(resume_texts, job_description, top_50_indices, top_k=20)
+
+        # Stage 3: BM25 Keyword Matching
+        st.write("🤖 **Stage 3**: BM25 Keyword Matching...")
         top_20_with_bm25 = self.add_bm25_scores(resume_texts, job_description, top_20_results)
+
+        # Stage 4: LLM Intent Analysis (using Qwen3-1.7B)
+        st.write("🤖 **Stage 4**: LLM Intent Analysis...")
         top_20_with_intent = self.add_intent_scores(resume_texts, job_description, top_20_with_bm25)
+
+        # Stage 5: Final Combined Ranking
+        st.write(f"🏆 **Stage 5**: Final Combined Ranking - Selecting top {final_top_k}...")
         final_results = self.calculate_final_scores(top_20_with_intent)
-
-        return final_results[:… (line truncated in this view)
-
+
+        return final_results[:final_top_k]  # Return top K as selected by user
+
     def faiss_recall(self, resume_texts, job_description, top_k=50):
-        … (≈28 removed lines not preserved in this view)
+        """Stage 1: Use FAISS for initial recall to find top 50 resumes"""
+        try:
+            # Get job embedding
+            job_embedding = self.get_embedding(job_description)
+
+            # Get resume embeddings
+            resume_embeddings = []
+            progress_bar = st.progress(0)
+
+            for i, text in enumerate(resume_texts):
+                if text:
+                    embedding = self.embedding_model.encode(text[:8192],
+                                                            convert_to_numpy=True,
+                                                            normalize_embeddings=True)
+                    resume_embeddings.append(embedding)
+                else:
+                    resume_embeddings.append(np.zeros(1024))
+                progress_bar.progress((i + 1) / len(resume_texts))
+
+            progress_bar.empty()
+
+            # Create FAISS index
+            resume_embeddings = np.array(resume_embeddings).astype('float32')
+            dimension = resume_embeddings.shape[1]
+            index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
+            index.add(resume_embeddings)
+
+            # Search for top K
+            job_embedding = job_embedding.reshape(1, -1).astype('float32')
+            scores, indices = index.search(job_embedding, min(top_k, len(resume_texts)))
+
+            return indices[0].tolist()
+
+        except Exception as e:
+            st.error(f"Error in FAISS recall: {str(e)}")
+            # Fallback: return all indices
+            return list(range(min(top_k, len(resume_texts))))
+
     def cross_encoder_rerank(self, resume_texts, job_description, top_50_indices, top_k=20):
-        … (≈13 removed lines not preserved in this view)
+        """Stage 2: Use Cross-Encoder to re-rank top 50 and select top 20"""
+        try:
+            if not self.cross_encoder:
+                st.error("Cross-encoder not loaded!")
+                return [(idx, 0.0) for idx in top_50_indices[:top_k]]
+
+            # Prepare pairs for cross-encoder
+            pairs = []
+            valid_indices = []
+
+            for idx in top_50_indices:
+                if idx < len(resume_texts) and resume_texts[idx]:
+                    # Truncate texts for cross-encoder
+                    job_snippet = job_description[:512]
+                    resume_snippet = resume_texts[idx][:512]
+                    pairs.append([job_snippet, resume_snippet])
+                    valid_indices.append(idx)
+
+            if not pairs:
+                return [(idx, 0.0) for idx in top_50_indices[:top_k]]
+
+            # Get cross-encoder scores
+            progress_bar = st.progress(0)
+            scores = []
+
+            # Process in batches to avoid memory issues
+            batch_size = 8
+            for i in range(0, len(pairs), batch_size):
+                batch = pairs[i:i+batch_size]
+                batch_scores = self.cross_encoder.predict(batch)
+                scores.extend(batch_scores)
+                progress_bar.progress(min(1.0, (i + batch_size) / len(pairs)))
+
+            progress_bar.empty()
+
+            # Combine indices with scores and sort
+            indexed_scores = list(zip(valid_indices, scores))
+            indexed_scores.sort(key=lambda x: x[1], reverse=True)
+
+            return indexed_scores[:top_k]
+
+        except Exception as e:
+            st.error(f"Error in cross-encoder re-ranking: {str(e)}")
             return [(idx, 0.0) for idx in top_50_indices[:top_k]]
-        scores = []
-        batch_size = 8
-        progress_bar = st.progress(0)
-        for i in range(0, len(pairs), batch_size):
-            batch = pairs[i:i+batch_size]
-            batch_scores = self.cross_encoder.predict(batch)
-            scores.extend(batch_scores)
-            progress_bar.progress(min(1.0, (i + batch_size) / len(pairs)))
-        progress_bar.empty()
-        indexed_scores = list(zip(valid_indices, scores))
-        indexed_scores.sort(key=lambda x: x[1], reverse=True)
-        return indexed_scores[:top_k]
-
+
     def add_bm25_scores(self, resume_texts, job_description, top_20_results):
-        … (≈6 removed lines not preserved in this view)
+        """Stage 3: Add BM25 scores to top 20 resumes"""
+        try:
+            # Get texts for top 20
+            top_20_texts = [resume_texts[idx] for idx, _ in top_20_results]
+
+            # Calculate BM25 scores
+            bm25_scores = self.calculate_bm25_scores(top_20_texts, job_description)
+
+            # Normalize BM25 scores to 0.1-0.2 range
+            if bm25_scores and max(bm25_scores) > 0:
+                max_bm25 = max(bm25_scores)
+                min_bm25 = min(bm25_scores)
+                if max_bm25 > min_bm25:
+                    normalized_bm25 = [
+                        0.1 + 0.1 * (score - min_bm25) / (max_bm25 - min_bm25)
+                        for score in bm25_scores
+                    ]
+                else:
+                    normalized_bm25 = [0.15] * len(bm25_scores)
             else:
-                normalized_bm25 = [0.15] * len(… (line truncated in this view)
-            … (≈5 removed lines not preserved in this view)
+                normalized_bm25 = [0.15] * len(top_20_results)
+
+            # Combine with existing results
+            results_with_bm25 = []
+            for i, (idx, cross_score) in enumerate(top_20_results):
+                bm25_score = normalized_bm25[i] if i < len(normalized_bm25) else 0.15
+                results_with_bm25.append((idx, cross_score, bm25_score))
+
+            return results_with_bm25
+
+        except Exception as e:
+            st.error(f"Error adding BM25 scores: {str(e)}")
+            return [(idx, cross_score, 0.15) for idx, cross_score in top_20_results]
+
     def add_intent_scores(self, resume_texts, job_description, top_20_with_bm25):
-        … (≈10 removed lines not preserved in this view)
+        """Stage 4: Add LLM intent analysis scores"""
+        try:
+            results_with_intent = []
+            progress_bar = st.progress(0)
+
+            for i, (idx, cross_score, bm25_score) in enumerate(top_20_with_bm25):
+                intent_score = self.analyze_intent(resume_texts[idx], job_description)
+                results_with_intent.append((idx, cross_score, bm25_score, intent_score))
+                progress_bar.progress((i + 1) / len(top_20_with_bm25))
+
+            progress_bar.empty()
+            return results_with_intent
+
+        except Exception as e:
+            st.error(f"Error adding intent scores: {str(e)}")
+            return [(idx, cross_score, bm25_score, 0.1) for idx, cross_score, bm25_score in top_20_with_bm25]
+
     def analyze_intent(self, resume_text, job_description):
-        … (removed line not preserved in this view)
-        st.text("LLM Intent: Analyzing intent (Qwen3-4B)...")
+        """Analyze candidate's intent using LLM"""
         try:
-            … (2 removed lines not preserved in this view)
+            # Truncate texts
+            resume_snippet = resume_text[:1500] if len(resume_text) > 1500 else resume_text
+            job_snippet = job_description[:800] if len(job_description) > 800 else job_description
 
-            prompt = f"""You are given a job description and a candidate's resume.
+            prompt = f"""You are given a job description and a candidate's resume.
+Clearly answer: "Is the candidate likely seeking this job? Respond with 'Yes', 'Maybe', or 'No' and give a brief justification."
 
 Job Description:
 {job_snippet}
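Aside on the recall stage above: `faiss_recall` pairs `IndexFlatIP` with `normalize_embeddings=True`, because for unit-length vectors the inner product equals cosine similarity. A self-contained sketch of that equivalence, using toy unit vectors rather than the app's data:

```python
import numpy as np
import faiss

# Two toy "resume" embeddings and one "job" embedding, already L2-normalized,
# so inner-product search returns cosine similarities.
resumes = np.array([[1.0, 0.0], [0.6, 0.8]], dtype="float32")
job = np.array([[0.8, 0.6]], dtype="float32")

index = faiss.IndexFlatIP(resumes.shape[1])  # exact inner-product index
index.add(resumes)

scores, indices = index.search(job, 2)
print(indices[0], scores[0])  # resume [0.6, 0.8] ranks first (cosine 0.96 vs 0.80)
```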
@@ -354,145 +410,204 @@ Job Description:
 
 Candidate Resume:
 {resume_snippet}
 
-… (≈4 removed prompt lines not preserved in this view)
-3. Do their skills match what's needed?
-4. Would this role be appealing given their background?
-
-Think through your analysis step by step, then provide your final assessment.
-
-Respond with exactly one of these formats:
-- Intent: Yes (if they would likely seek this job)
-- Intent: Maybe (if it's uncertain or partially aligned)
-- Intent: No (if they would likely not seek this job)"""
-
-            # Check if models are available
-            if not st.session_state.get('qwen3_4b_tokenizer') or not st.session_state.get('qwen3_4b_model'):
-                st.warning("Qwen3-4B model not available, using fallback intent score.")
-                return 0.1
-
-            response_text = generate_qwen3_response(
+Response format:
+Intent: [Yes/Maybe/No]
+Reason: [Brief justification]"""
+
+            response = generate_qwen3_response(
                 prompt,
-                st.session_state.qwen3_4b_tokenizer,
-                st.session_state.qwen3_4b_model,
-                max_new_tokens=… (value truncated in this view)
+                st.session_state.qwen3_intent_tokenizer,
+                st.session_state.qwen3_intent_model,
+                max_new_tokens=100
             )
 
-            # Parse … (comment truncated in this view)
-            intent_decision_part = response_text
-            think_start_tag = "<think>"
-            think_end_tag = "</think>"
-            start_index = response_text.find(think_start_tag)
-            end_index = response_text.rfind(think_end_tag)
-            if start_index != -1 and end_index != -1 and start_index < end_index:
-                thinking_content = response_text[start_index + len(think_start_tag):end_index].strip()
-                intent_decision_part = response_text[end_index + len(think_end_tag):].strip()
-
-            response_lower = intent_decision_part.lower()
-            intent_score = 0.1  # Default "Maybe" score
-
+            # Parse response
+            response_lower = response.lower()
             if 'intent: yes' in response_lower or 'intent:yes' in response_lower:
-                … (removed line not preserved in this view)
-            elif 'intent: no' in response_lower or 'intent:no' in response_lower:
-                intent_score = 0.0
+                return 0.3
             elif 'intent: maybe' in response_lower or 'intent:maybe' in response_lower:
-                … (2 removed lines not preserved in this view)
+                return 0.1
+            else:
+                return 0.0
+
         except Exception as e:
-            st.warning(f"Error analyzing intent… (line truncated in this view)
-            return 0.1
-
+            st.warning(f"Error analyzing intent: {str(e)}")
+            return 0.1  # Default to "Maybe"
+
     def calculate_final_scores(self, results_with_all_scores):
-        … (≈12 removed lines not preserved in this view)
+        """Stage 5: Calculate final combined scores"""
+        try:
+            final_results = []
+
+            for idx, cross_score, bm25_score, intent_score in results_with_all_scores:
+                # Normalize cross-encoder score to 0-1 range
+                normalized_cross = max(0, min(1, cross_score))
+
+                # Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)
+                final_score = normalized_cross + bm25_score + intent_score
+
+                final_results.append({
+                    'index': idx,
+                    'cross_encoder_score': normalized_cross,
+                    'bm25_score': bm25_score,
+                    'intent_score': intent_score,
+                    'final_score': final_score
+                })
+
+            # Sort by final score
+            final_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+            return final_results
+
+        except Exception as e:
+            st.error(f"Error calculating final scores: {str(e)}")
+            return []
+
     def extract_skills(self, text, job_description):
-        … (removed line not preserved in this view)
-        if not text:
-            … (removed line not preserved in this view)
+        """Extract skills from resume based on job description"""
+        if not text:
+            return []
+
+        # Common tech skills
+        common_skills = [
+            "python", "java", "javascript", "react", "angular", "vue", "node.js",
+            "express", "django", "flask", "spring", "sql", "nosql", "html", "css",
+            "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "git", "github",
+            "agile", "scrum", "jira", "ci/cd", "devops", "microservices", "rest", "api",
+            "machine learning", "deep learning", "data science", "artificial intelligence",
+            "tensorflow", "pytorch", "keras", "scikit-learn", "pandas", "numpy",
+            "matplotlib", "seaborn", "jupyter", "r", "sas", "spss", "tableau", "powerbi",
+            "excel", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
+            "kafka", "rabbitmq", "spark", "hadoop", "hive", "airflow", "linux", "unix"
+        ]
+
+        # Extract potential skills from job description
         job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
+
+        # Find matching skills
         found_skills = []
         text_lower = text.lower()
+
+        # Check common skills that appear in both resume and job description
         for skill in common_skills:
             if skill in text_lower and any(skill in job_word for job_word in job_words):
                 found_skills.append(skill)
+
+        # Check for skills mentioned in job description
         for word in job_words:
-            if len(word) > 3 and word in text_lower and word not in found_skills… (line truncated in this view)
-            … (removed lines not preserved in this view)
+            if len(word) > 3 and word in text_lower and word not in found_skills:
+                # Basic filter to avoid common words
+                if word not in ['with', 'have', 'that', 'this', 'from', 'what', 'when', 'where']:
+                    found_skills.append(word)
+
+        return list(set(found_skills))[:15]  # Return top 15 unique skills
+
+    def generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
+        """Generate simple explanation for the match (fallback)"""
+        if score > 0.8:
+            quality = "excellent"
+        elif score > 0.6:
+            quality = "strong"
+        elif score > 0.4:
+            quality = "moderate"
+        else:
+            quality = "limited"
+
+        explanation = f"This candidate shows {quality} alignment with the position (score: {score:.2f}). "
+
+        if semantic_score > bm25_score:
+            explanation += f"The resume demonstrates strong conceptual relevance ({semantic_score:.2f}) suggesting good experience fit. "
+        else:
+            explanation += f"The resume has high keyword match ({bm25_score:.2f}) indicating direct skill alignment. "
+
+        if skills:
+            explanation += f"Key matching competencies include: {', '.join(skills[:5])}."
+
+        return explanation
+
+    def generate_llm_explanation(self, resume_text, job_description, score, skills, max_retries=3):
+        """Generate detailed explanation using Qwen3-14B"""
+        if not st.session_state.qwen3_model:
+            return self.generate_simple_explanation(score, score, score, skills)
+
+        # Truncate texts to manage token limits
+        resume_snippet = resume_text[:2000] if len(resume_text) > 2000 else resume_text
+        job_snippet = job_description[:1000] if len(job_description) > 1000 else job_description
+
+        prompt = f"""You are an expert HR analyst. Analyze this individual candidate's resume against the job requirements and write EXACTLY 150 words explaining why this specific candidate is suitable for the position.
+
+Structure your 150-word analysis as follows:
+1. Experience alignment (40-50 words)
+2. Key strengths and skills match (40-50 words)
+3. Unique value proposition (40-50 words)
+4. Overall recommendation (10-20 words)
+
+Job Requirements:
+{job_snippet}
+
+Candidate's Resume:
+{resume_snippet}
+
+Identified Matching Skills: {', '.join(skills[:10])}
+Compatibility Score: {score:.1%}
+
+Write a professional, detailed 150-word analysis for THIS INDIVIDUAL CANDIDATE:"""
+
+        for attempt in range(max_retries):
+            try:
+                response = generate_qwen3_response(
+                    prompt,
+                    st.session_state.qwen3_tokenizer,
+                    st.session_state.qwen3_model,
+                    max_new_tokens=200
+                )
+
+                # Extract the response and ensure it's about 150 words
+                explanation = response.strip()
+                word_count = len(explanation.split())
+
+                # If response is close to 150 words (130-170), accept it
+                if 130 <= word_count <= 170:
+                    return explanation
+
+                # If response is too short or too long, try again with adjusted prompt
+                if word_count < 130:
+                    # Response too short, try again
+                    continue
+                elif word_count > 170:
+                    # Response too long, truncate to approximately 150 words
+                    words = explanation.split()
+                    truncated = ' '.join(words[:150])
+                    # Add proper ending if truncated
+                    if not truncated.endswith('.'):
+                        truncated += '.'
+                    return truncated
+
+                return explanation
+
+            except Exception as e:
+                if attempt < max_retries - 1:
+                    time.sleep(2)  # Wait before retry
+                    continue
+                else:
+                    # Fallback to simple explanation
+                    return self.generate_simple_explanation(score, score, score, skills)
+
+        # If all retries failed, use simple explanation
+        return self.generate_simple_explanation(score, score, score, skills)
 
 def create_download_link(df, filename="resume_screening_results.csv"):
-    … (removed line not preserved in this view)
+    """Create download link for results"""
    csv = df.to_csv(index=False)
     b64 = base64.b64encode(csv.encode()).decode()
     return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-btn">📥 Download Results CSV</a>'
 
-#
-st.markdown("---")
-st.markdown("### 🤖 Advanced Pipeline")
-st.markdown("- **Stage 1**: FAISS Recall (Top 50)")
-st.markdown("- **Stage 2**: Cross-Encoder Re-ranking (Top 20)")
-st.markdown("- **Stage 3**: BM25 Keyword Matching")
-st.markdown("- **Stage 4**: LLM Intent Analysis (Qwen3-4B)")
-st.markdown("- **Final**: Combined Scoring") # Updated this line
-st.markdown("### 🧠 Models Used")
-st.markdown("- **Embedding**: BAAI/bge-large-en-v1.5")
-st.markdown("- **Cross-Encoder**: ms-marco-MiniLM-L6-v2")
-st.markdown("- **LLM**: Qwen/Qwen3-4B (4-bit quantized)")
-st.markdown("### 📊 Scoring Formula")
-st.markdown("**Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)**")
-
-# --- Main App Interface (Must be after global model loading and class defs) ---
+# Main App Interface
 st.title("🎯 AI-Powered Resume Screener")
-
-# ...
-st.markdown("*Find the perfect candidates using BAAI/bge-large-en-v1.5 embeddings and Qwen3-4B for intent analysis*")
-
-st.subheader("🤖 Model Loading Status")
-col1, col2 = st.columns(2)
-with col1:
-    if st.session_state.get('embedding_model_error'):
-        st.error(f"Embedding Model: {st.session_state.embedding_model_error}")
-    elif st.session_state.get('embedding_model'):
-        st.success("✅ Embedding Model (BAAI/bge-large-en-v1.5) loaded.")
-    else:
-        st.warning("⏳ Embedding Model loading or not found (check console).")
-    if st.session_state.get('cross_encoder_error'):
-        st.error(f"Cross-Encoder Model: {st.session_state.cross_encoder_error}")
-    elif st.session_state.get('cross_encoder'):
-        st.success("✅ Cross-Encoder Model (ms-marco-MiniLM-L6-v2) loaded.")
-    else:
-        st.warning("⏳ Cross-Encoder Model loading or not found (check console).")
-with col2:
-    if st.session_state.get('qwen3_4b_tokenizer_error'):
-        st.error(f"Qwen3-4B Tokenizer: {st.session_state.qwen3_4b_tokenizer_error}")
-    elif st.session_state.get('qwen3_4b_tokenizer'):
-        st.success("✅ Qwen3-4B Tokenizer loaded.")
-    else:
-        st.warning("⏳ Qwen3-4B Tokenizer loading or not found (check console).")
-    if st.session_state.get('qwen3_4b_model_error'):
-        st.error(f"Qwen3-4B Model: {st.session_state.qwen3_4b_model_error}")
-    elif st.session_state.get('qwen3_4b_model'):
-        st.success("✅ Qwen3-4B Model loaded (4-bit quantized).")
-    else:
-        st.warning("⏳ Qwen3-4B Model loading or not found (check console).")
+st.markdown("*Find the perfect candidates using BAAI/bge-large-en-v1.5 embeddings and Qwen3-14B explanations*")
 st.markdown("---")
 
-# Initialize screener
+# Initialize screener
 screener = ResumeScreener()
 
 # Job Description Input
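For orientation, the combined score in `calculate_final_scores` above is a plain sum, so the displayed total ranges from 0.1 to 1.5. A worked example with made-up stage outputs (illustrative numbers only, not from this commit):

```python
# Hypothetical stage outputs for one candidate:
cross = 0.82    # cross-encoder relevance, clamped to [0, 1]
bm25 = 0.17     # keyword score, normalized into [0.1, 0.2]
intent = 0.3    # LLM verdict: Yes -> 0.3, Maybe -> 0.1, No -> 0.0

final_score = max(0, min(1, cross)) + bm25 + intent
print(f"{final_score:.2f}")  # 1.29 out of a possible 1.5
```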
@@ -516,6 +631,7 @@ if st.session_state.resume_texts:
         st.session_state.resume_texts = []
         st.session_state.file_names = []
         st.session_state.results = []
+        st.session_state.explanations_generated = False
         st.session_state.current_job_description = ""
         st.rerun()
@@ -683,60 +799,115 @@ col1, col2 = st.columns([1, 1])
 
 with col1:
     if st.button("🚀 Advanced Pipeline Analysis",
-                 disabled=not (job_description and st.session_state.resume_texts and
-                               st.session_state.get('embedding_model') and
-                               st.session_state.get('cross_encoder') and
-                               st.session_state.get('qwen3_4b_model') and
-                               st.session_state.get('qwen3_4b_tokenizer')),
+                 disabled=not (job_description and st.session_state.resume_texts),
                  type="primary",
                  help="Run the complete 5-stage advanced pipeline"):
-        … (≈47 removed lines not preserved in this view)
+        if len(st.session_state.resume_texts) == 0:
+            st.error("❌ Please upload resumes first!")
+        elif not job_description.strip():
+            st.error("❌ Please enter a job description!")
+        else:
+            with st.spinner("🚀 Running Advanced Pipeline Analysis..."):
+                try:
+                    # Run the advanced pipeline
+                    pipeline_results = screener.advanced_pipeline_ranking(
+                        st.session_state.resume_texts, job_description, final_top_k=top_k
+                    )
+
+                    # Prepare results for display
+                    results = []
+
+                    for rank, result_data in enumerate(pipeline_results, 1):
+                        idx = result_data['index']
+                        name = st.session_state.file_names[idx]
+                        text = st.session_state.resume_texts[idx]
+
+                        # Extract skills
+                        skills = screener.extract_skills(text, job_description)
+
+                        results.append({
+                            'rank': rank,
+                            'name': name,
+                            'final_score': result_data['final_score'],
+                            'cross_encoder_score': result_data['cross_encoder_score'],
+                            'bm25_score': result_data['bm25_score'],
+                            'intent_score': result_data['intent_score'],
+                            'skills': skills,
+                            'text': text,
+                            'text_preview': text[:500] + "..." if len(text) > 500 else text,
+                            'explanation': None  # No detailed explanation yet
+                        })
+
+                    # Add simple explanations for now
+                    for result in results:
+                        result['explanation'] = screener.generate_simple_explanation(
+                            result['final_score'],
+                            result['cross_encoder_score'],
+                            result['bm25_score'],
+                            result['skills']
+                        )
+
+                    # Store in session state
+                    st.session_state.results = results
+                    st.session_state.explanations_generated = False
+                    st.session_state.current_job_description = job_description
+
+                    st.success(f"🎉 Advanced pipeline complete! Found top {len(st.session_state.results)} candidates.")
+
+                except Exception as e:
+                    st.error(f"❌ Error during analysis: {str(e)}")
+
+# Second button: Generate AI explanations (slower, optional)
+with col2:
+    # Show this button only if we have results and LLM is enabled
+    show_explanation_button = (
+        st.session_state.results and
+        use_llm_explanations and
+        st.session_state.qwen3_model and
+        not st.session_state.explanations_generated
+    )
+
+    if show_explanation_button:
+        if st.button("🤖 Generate AI Explanations",
+                     type="secondary",
+                     help="Generate detailed 150-word explanations using Qwen3-14B (takes longer)"):
+            with st.spinner("🤖 Generating detailed AI explanations..."):
+                try:
+                    explanation_progress = st.progress(0)
+                    explanation_text = st.empty()
+
+                    for i, result in enumerate(st.session_state.results):
+                        explanation_text.text(f"🤖 Generating AI explanation for candidate {i+1}/{len(st.session_state.results)}...")
+
+                        llm_explanation = screener.generate_llm_explanation(
+                            result['text'],
+                            st.session_state.current_job_description,
+                            result['final_score'],
+                            result['skills']
+                        )
+                        result['explanation'] = llm_explanation
+
+                        explanation_progress.progress((i + 1) / len(st.session_state.results))
+
+                    explanation_progress.empty()
+                    explanation_text.empty()
+
+                    # Mark explanations as generated
+                    st.session_state.explanations_generated = True
+
+                    st.success(f"🤖 AI explanations generated for all {len(st.session_state.results)} candidates!")
+
+                except Exception as e:
+                    st.error(f"❌ Error generating explanations: {str(e)}")
+
+    elif st.session_state.results and st.session_state.explanations_generated:
+        st.info("✅ AI explanations already generated!")
+
+    elif st.session_state.results and not use_llm_explanations:
+        st.info("💡 Enable 'Generate AI Explanations' in sidebar to use this feature")
+
+    elif st.session_state.results and not st.session_state.qwen3_model:
+        st.warning("⚠️ LLM model not available. Check your Hugging Face token.")
 
 # Display Results
 if st.session_state.results:
@@ -762,7 +933,7 @@ if st.session_state.results:
             "Top Skills": ", ".join(result['skills'][:5])
         })
 
-    … (removed line not preserved in this view)
+    summary_df = pd.DataFrame(summary_data)
 
     # Style the dataframe
     def color_scores(val):
@@ -805,6 +976,7 @@ if st.session_state.results:
             "Intent_Score": result['intent_score'],
             "Intent_Analysis": intent_text,
             "Skills": "; ".join(result['skills']),
+            "AI_Explanation": result['explanation'],
             "Resume_Preview": result['text_preview']
         })
@@ -835,6 +1007,9 @@ if st.session_state.results:
                 st.write(f"• {skill}")
 
         with col2:
+            st.write("**💡 AI-Generated Match Analysis:**")
+            st.info(result['explanation'])
+
             st.write("**📄 Resume Preview:**")
             st.text_area("", result['text_preview'], height=200, disabled=True, key=f"preview_{result['rank']}")
@@ -894,6 +1069,7 @@ with col1:
     st.session_state.resume_texts = []
     st.session_state.file_names = []
     st.session_state.results = []
+    st.session_state.explanations_generated = False
     st.session_state.current_job_description = ""
     st.success("✅ Resumes cleared!")
     st.rerun()
@@ -903,6 +1079,7 @@ with col2:
     st.session_state.resume_texts = []
    st.session_state.file_names = []
     st.session_state.results = []
+    st.session_state.explanations_generated = False
     st.session_state.current_job_description = ""
 
     if torch.cuda.is_available():
@@ -916,7 +1093,7 @@ st.markdown("---")
 st.markdown(
     """
     <div style='text-align: center; color: #666;'>
-        🚀 Powered by BAAI/bge-large-en-v1.5 & Qwen3-… (line truncated in this view)
+        🚀 Powered by BAAI/bge-large-en-v1.5 & Qwen3-14B | Built with Streamlit
     </div>
     """,
    unsafe_allow_html=True
requirements.txt CHANGED

@@ -1,6 +1,6 @@
 streamlit==1.31.0
-transformers>=4.… (version truncated in this view)
-torch
+transformers>=4.51.0
+torch==2.1.2
 pdfplumber==0.10.1
 PyPDF2==3.0.1
 python-docx==1.0.1
@@ -10,9 +10,10 @@ rank-bm25==0.2.2
 pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
-huggingface-hub
-bitsandbytes
-accelerate
+huggingface-hub==0.30.0
+bitsandbytes==0.44.1
+accelerate==0.27.2
 datasets==2.18.0
 sentence-transformers==2.7.0
-… (removed line not preserved in this view)
+plotly==5.18.0
+einops