root committed on
Commit
1ced284
·
1 Parent(s): d5266d0
Files changed (2)
  1. app.py +569 -392
  2. requirements.txt +7 -6
app.py CHANGED
@@ -15,7 +15,7 @@ from docx import Document
15
  import csv
16
  from datasets import load_dataset
17
  import gc
18
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
19
  import time
20
  import faiss
21
  import re
@@ -34,117 +34,107 @@ st.set_page_config(
34
  initial_sidebar_state="expanded"
35
  )
36
 
37
- # --- Global Device and Model Loading Section ---
38
-
39
- # Initialize session state keys for all models, their loading status/errors, and app data
40
- keys_to_initialize = {
41
- 'embedding_model': None, 'embedding_model_error': None,
42
- 'cross_encoder': None, 'cross_encoder_error': None,
43
- 'qwen3_4b_tokenizer': None, 'qwen3_4b_tokenizer_error': None,
44
- 'qwen3_4b_model': None, 'qwen3_4b_model_error': None,
45
- 'results': [], 'resume_texts': [], 'file_names': [], 'current_job_description': ""
46
- # Add any other app-specific session state keys here if needed
47
- }
48
- for key, default_value in keys_to_initialize.items():
49
- if key not in st.session_state:
50
- st.session_state[key] = default_value
51
-
52
- # Load Embedding Model (BAAI/bge-large-en-v1.5)
53
- if st.session_state.embedding_model is None and st.session_state.embedding_model_error is None:
54
- print("[Global Init] Attempting to load Embedding Model (BAAI/bge-large-en-v1.5) with device_map='auto'...")
55
- try:
56
- st.session_state.embedding_model = SentenceTransformer(
57
- 'BAAI/bge-large-en-v1.5',
58
- device_map="auto"
59
- )
60
- print(f"[Global Init] Embedding Model (BAAI/bge-large-en-v1.5) LOADED with device_map='auto'.")
61
- except Exception as e:
62
- error_str = str(e) if e else ""
63
- if "device_map" in error_str.lower() and "unexpected keyword argument" in error_str.lower():
64
- print("⚠️ [Global Init] device_map='auto' not supported for SentenceTransformer. Falling back to default device handling.")
65
- try:
66
- st.session_state.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
67
- print(f"[Global Init] Embedding Model (BAAI/bge-large-en-v1.5) LOADED (fallback device handling).")
68
- except Exception as e_fallback:
69
- error_msg = f"Failed to load Embedding Model (fallback): {str(e_fallback) if e_fallback else 'Unknown error'}"
70
- print(f"❌ [Global Init] {error_msg}")
71
- st.session_state.embedding_model_error = error_msg
72
- else:
73
- error_msg = f"Failed to load Embedding Model: {error_str}"
74
- print(f"❌ [Global Init] {error_msg}")
75
- st.session_state.embedding_model_error = error_msg
76
 
77
- # Load Cross-Encoder Model (ms-marco-MiniLM-L6-v2)
78
- if st.session_state.cross_encoder is None and st.session_state.cross_encoder_error is None:
79
- print("[Global Init] Attempting to load Cross-Encoder Model (ms-marco-MiniLM-L6-v2) with device_map='auto'...")
80
- try:
81
- st.session_state.cross_encoder = CrossEncoder(
82
- 'cross-encoder/ms-marco-MiniLM-L6-v2',
83
- device_map="auto"
84
- )
85
- print(f"[Global Init] Cross-Encoder Model (ms-marco-MiniLM-L6-v2) LOADED with device_map='auto'.")
86
- except Exception as e:
87
- error_str = str(e) if e else ""
88
- if "device_map" in error_str.lower() and "unexpected keyword argument" in error_str.lower():
89
- print("⚠️ [Global Init] device_map='auto' not supported for CrossEncoder. Falling back to default device handling.")
90
- try:
91
- st.session_state.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
92
- print(f"[Global Init] Cross-Encoder Model (ms-marco-MiniLM-L6-v2) LOADED (fallback device handling).")
93
- except Exception as e_fallback:
94
- error_msg = f"Failed to load Cross-Encoder Model (fallback): {str(e_fallback) if e_fallback else 'Unknown error'}"
95
- print(f"❌ [Global Init] {error_msg}")
96
- st.session_state.cross_encoder_error = error_msg
97
- else:
98
- error_msg = f"Failed to load Cross-Encoder Model: {error_str}"
99
- print(f"❌ [Global Init] {error_msg}")
100
- st.session_state.cross_encoder_error = error_msg
 
 
101
 
102
- # Load Qwen3-4B Tokenizer and Model with 4-bit quantization (QwQ-32B style)
103
- if st.session_state.qwen3_4b_tokenizer is None and st.session_state.qwen3_4b_tokenizer_error is None:
104
- print("[Global Init] Loading Qwen3-4B Tokenizer...")
105
  try:
106
- st.session_state.qwen3_4b_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
107
- print("[Global Init] Qwen3-4B Tokenizer Loaded.")
 
 
108
  except Exception as e:
109
- error_msg = f"Failed to load Qwen3-4B Tokenizer: {str(e) if e else 'Unknown error'}"
110
- print(f"❌ [Global Init] {error_msg}")
111
- st.session_state.qwen3_4b_tokenizer_error = error_msg
112
 
113
- if st.session_state.qwen3_4b_model is None and st.session_state.qwen3_4b_model_error is None:
114
- print("[Global Init] Loading Qwen3-4B Model with 4-bit quantization...")
 
115
  try:
116
- quantization_config = BitsAndBytesConfig(
117
- load_in_4bit=True,
118
- bnb_4bit_quant_type="nf4",
119
- bnb_4bit_compute_dtype=torch.float16,
120
- bnb_4bit_use_double_quant=True
121
- )
122
- st.session_state.qwen3_4b_model = AutoModelForCausalLM.from_pretrained(
123
- "Qwen/Qwen3-4B",
124
- quantization_config=quantization_config,
125
- device_map="auto",
126
- trust_remote_code=True,
127
- torch_dtype=torch.float16,
128
- use_cache=True
129
- )
130
- print("[Global Init] Qwen3-4B Model Loaded with 4-bit quantization.")
131
  except Exception as e:
132
- error_msg = f"Failed to load Qwen3-4B Model: {str(e) if e else 'Unknown error'}"
133
- print(f"❌ [Global Init] {error_msg}")
134
- st.session_state.qwen3_4b_model_error = error_msg
135
-
136
- # --- End of Global Model Loading Section ---
137
-
138
- # --- Class Definitions and Helper Functions ---
139
 
140
  def generate_qwen3_response(prompt, tokenizer, model, max_new_tokens=200):
141
- # ... (implementation of generate_qwen3_response)
142
  messages = [{"role": "user", "content": prompt}]
143
  text = tokenizer.apply_chat_template(
144
  messages,
145
  tokenize=False,
146
  add_generation_prompt=True,
147
- enable_thinking=True # As per Qwen3-1.7B docs for thinking mode
148
  )
149
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
150
  generated_ids = model.generate(
@@ -155,32 +145,14 @@ def generate_qwen3_response(prompt, tokenizer, model, max_new_tokens=200):
155
  response = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
156
  return response
157
 
158
- class ResumeScreener: # Ensure this class definition is BEFORE it's instantiated
159
  def __init__(self):
160
- # ... (init logic as before, referencing st.session_state for models)
161
- print("[ResumeScreener] Initializing with references to globally loaded models...")
162
- self.embedding_model = st.session_state.get('embedding_model')
163
- self.cross_encoder = st.session_state.get('cross_encoder')
164
-
165
- if self.embedding_model:
166
- print("[ResumeScreener] Embedding model reference set.")
167
- else:
168
- print("[ResumeScreener] Embedding model not available (check loading errors).")
169
-
170
- if self.cross_encoder:
171
- print("[ResumeScreener] Cross-encoder model reference set.")
172
- else:
173
- print("[ResumeScreener] Cross-encoder model not available (check loading errors).")
174
-
175
- print("[ResumeScreener] Initialization complete.")
176
-
177
- # ... (all other methods of ResumeScreener: extract_text_from_file, get_embedding,
178
- # calculate_bm25_scores, advanced_pipeline_ranking, faiss_recall, cross_encoder_rerank,
179
- # add_bm25_scores, add_intent_scores, analyze_intent, calculate_final_scores, extract_skills)
180
- # Make sure all methods are correctly indented within the class
181
 
182
  def extract_text_from_file(self, file_path, file_type):
183
- # ... (implementation)
184
  try:
185
  if file_type == "pdf":
186
  with open(file_path, 'rb') as file:
@@ -188,165 +160,249 @@ class ResumeScreener: # Ensure this class definition is BEFORE it's instantiated
188
  text = ""
189
  for page in pdf.pages:
190
  text += page.extract_text() or ""
 
191
  if not text.strip():
 
192
  file.seek(0)
193
  reader = PyPDF2.PdfReader(file)
194
  text = ""
195
- for page_num in range(len(reader.pages)):
196
- text += reader.pages[page_num].extract_text() or ""
197
  return text
 
198
  elif file_type == "docx":
199
  doc = Document(file_path)
200
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
 
201
  elif file_type == "txt":
202
  with open(file_path, 'r', encoding='utf-8') as file:
203
  return file.read()
 
204
  elif file_type == "csv":
205
  with open(file_path, 'r', encoding='utf-8') as file:
206
  csv_reader = csv.reader(file)
207
  return " ".join([" ".join(row) for row in csv_reader])
 
208
  except Exception as e:
209
  st.error(f"Error extracting text from {file_path}: {str(e)}")
210
  return ""
211
 
212
  def get_embedding(self, text):
 
213
  if self.embedding_model is None:
214
- st.error("Embedding model is not available!")
215
- return np.zeros(1024)
 
216
  try:
217
- if len(text) < 500:
 
 
218
  text = "Represent this sentence for searching relevant passages: " + text
 
 
219
  text = text[:8192] if text else ""
220
- embedding = self.embedding_model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
 
 
 
 
221
  return embedding
 
222
  except Exception as e:
223
  st.error(f"Error generating embedding: {str(e)}")
224
- return np.zeros(1024)
225
 
226
  def calculate_bm25_scores(self, resume_texts, job_description):
 
227
  try:
228
  job_tokens = word_tokenize(job_description.lower())
229
  corpus = [word_tokenize(text.lower()) for text in resume_texts if text and text.strip()]
 
230
  if not corpus:
231
  return [0.0] * len(resume_texts)
 
232
  bm25 = BM25Okapi(corpus)
233
  scores = bm25.get_scores(job_tokens)
234
  return scores.tolist()
 
235
  except Exception as e:
236
  st.error(f"Error calculating BM25 scores: {str(e)}")
237
  return [0.0] * len(resume_texts)
238
 
239
- def advanced_pipeline_ranking(self, resume_texts, job_description):
240
- print("[Pipeline] Advanced Pipeline Ranking started.")
241
  if not resume_texts:
242
  return []
243
- st.info("πŸ” Stage 1: FAISS Recall - Finding top candidates...")
 
 
244
  top_50_indices = self.faiss_recall(resume_texts, job_description, top_k=50)
245
- st.info("🎯 Stage 2: Cross-Encoder Re-ranking - Selecting top candidates...")
 
 
246
  top_20_results = self.cross_encoder_rerank(resume_texts, job_description, top_50_indices, top_k=20)
247
- st.info("πŸ”€ Stage 3: BM25 Keyword Matching...")
 
 
248
  top_20_with_bm25 = self.add_bm25_scores(resume_texts, job_description, top_20_results)
249
- st.info("πŸ€– Stage 4: LLM Intent Analysis (Qwen3-4B)...")
 
 
250
  top_20_with_intent = self.add_intent_scores(resume_texts, job_description, top_20_with_bm25)
251
- st.info("πŸ† Stage 5: Final Combined Ranking...")
 
 
252
  final_results = self.calculate_final_scores(top_20_with_intent)
253
- print("[Pipeline] Advanced Pipeline Ranking finished.")
254
- return final_results[:st.session_state.get('top_k', 5)]
255
-
256
  def faiss_recall(self, resume_texts, job_description, top_k=50):
257
- print("[faiss_recall] Method started.")
258
- st.text("FAISS Recall: Embedding job description...")
259
- job_embedding = self.get_embedding(job_description)
260
- st.text(f"FAISS Recall: Embedding {len(resume_texts)} resumes...")
261
- resume_embeddings = []
262
- progress_bar = st.progress(0)
263
- for i, text in enumerate(resume_texts):
264
- if text:
265
- embedding = self.embedding_model.encode(text[:8192], convert_to_numpy=True, normalize_embeddings=True)
266
- resume_embeddings.append(embedding)
267
- else:
268
- resume_embeddings.append(np.zeros(1024))
269
- progress_bar.progress((i + 1) / len(resume_texts))
270
- progress_bar.empty()
271
- resume_embeddings_np = np.array(resume_embeddings).astype('float32') # Renamed variable
272
- if resume_embeddings_np.ndim == 1: # Handle case of single resume
273
- resume_embeddings_np = resume_embeddings_np.reshape(1, -1)
274
- if resume_embeddings_np.size == 0:
275
- print("[faiss_recall] No resume embeddings to add to FAISS index.")
276
- return [] # Or handle error appropriately
277
-
278
- dimension = resume_embeddings_np.shape[1]
279
- index = faiss.IndexFlatIP(dimension)
280
- index.add(resume_embeddings_np)
281
- job_embedding_np = job_embedding.reshape(1, -1).astype('float32') # Renamed variable
282
- scores, indices = index.search(job_embedding_np, min(top_k, len(resume_texts)))
283
- return indices[0].tolist()
284
-
 
 
285
  def cross_encoder_rerank(self, resume_texts, job_description, top_50_indices, top_k=20):
286
- print("[cross_encoder_rerank] Method started.")
287
- if not self.cross_encoder:
288
- st.error("Cross-encoder model is not available!")
289
- return [(idx, 0.0) for idx in top_50_indices[:top_k]]
290
- pairs = []
291
- valid_indices = []
292
- for idx in top_50_indices:
293
- if idx < len(resume_texts) and resume_texts[idx]:
294
- job_snippet = job_description[:512]
295
- resume_snippet = resume_texts[idx][:512]
296
- pairs.append([job_snippet, resume_snippet])
297
- valid_indices.append(idx)
298
- if not pairs:
 
 
299
  return [(idx, 0.0) for idx in top_50_indices[:top_k]]
300
- st.text(f"Cross-Encoder: Preparing {len(pairs)} pairs for re-ranking...")
301
- scores = []
302
- batch_size = 8
303
- progress_bar = st.progress(0)
304
- for i in range(0, len(pairs), batch_size):
305
- batch = pairs[i:i+batch_size]
306
- batch_scores = self.cross_encoder.predict(batch)
307
- scores.extend(batch_scores)
308
- progress_bar.progress(min(1.0, (i + batch_size) / len(pairs)))
309
- progress_bar.empty()
310
- indexed_scores = list(zip(valid_indices, scores))
311
- indexed_scores.sort(key=lambda x: x[1], reverse=True)
312
- return indexed_scores[:top_k]
313
-
314
  def add_bm25_scores(self, resume_texts, job_description, top_20_results):
315
- st.text("BM25: Calculating keyword scores...")
316
- top_20_texts = [resume_texts[idx] for idx, _ in top_20_results]
317
- bm25_scores_raw = self.calculate_bm25_scores(top_20_texts, job_description)
318
- if bm25_scores_raw and max(bm25_scores_raw) > 0:
319
- max_bm25, min_bm25 = max(bm25_scores_raw), min(bm25_scores_raw)
320
- if max_bm25 > min_bm25:
321
- normalized_bm25 = [0.1 + 0.1 * (s - min_bm25) / (max_bm25 - min_bm25) for s in bm25_scores_raw]
 
 
322
  else:
323
- normalized_bm25 = [0.15] * len(bm25_scores_raw)
324
- else:
325
- normalized_bm25 = [0.15] * len(top_20_results)
326
- results_with_bm25 = []
327
- for i, (idx, cross_score) in enumerate(top_20_results):
328
- results_with_bm25.append((idx, cross_score, normalized_bm25[i] if i < len(normalized_bm25) else 0.15))
329
- return results_with_bm25
330
-
 
 
331
  def add_intent_scores(self, resume_texts, job_description, top_20_with_bm25):
332
- st.text(f"LLM Intent: Analyzing intent for {len(top_20_with_bm25)} candidates (Qwen3-4B)...")
333
- results_with_intent = []
334
- progress_bar = st.progress(0)
335
- for i, (idx, cross_score, bm25_score) in enumerate(top_20_with_bm25):
336
- intent_score = self.analyze_intent(resume_texts[idx], job_description)
337
- results_with_intent.append((idx, cross_score, bm25_score, intent_score))
338
- progress_bar.progress((i + 1) / len(top_20_with_bm25))
339
- progress_bar.empty()
340
- return results_with_intent
341
-
 
 
342
  def analyze_intent(self, resume_text, job_description):
343
- print(f"[analyze_intent] Analyzing intent for one resume (Qwen3-4B)...")
344
- st.text("LLM Intent: Analyzing intent (Qwen3-4B)...")
345
  try:
346
- resume_snippet = resume_text[:15000]
347
- job_snippet = job_description[:5000]
 
348
 
349
- prompt = f"""You are given a job description and a candidate's resume. Your task is to analyze whether the candidate is likely seeking this specific type of job.
 
350
 
351
  Job Description:
352
  {job_snippet}
@@ -354,145 +410,204 @@ Job Description:
354
  Candidate Resume:
355
  {resume_snippet}
356
 
357
- Please analyze the candidate's background, skills, experience, and career trajectory to determine if they would be genuinely interested in and likely to apply for this position.
358
-
359
- Consider:
360
- 1. Does their experience align with the job requirements?
361
- 2. Is this a logical career progression for them?
362
- 3. Do their skills match what's needed?
363
- 4. Would this role be appealing given their background?
364
-
365
- Think through your analysis step by step, then provide your final assessment.
366
-
367
- Respond with exactly one of these formats:
368
- - Intent: Yes (if they would likely seek this job)
369
- - Intent: Maybe (if it's uncertain or partially aligned)
370
- - Intent: No (if they would likely not seek this job)"""
371
-
372
- # Check if models are available
373
- if not st.session_state.get('qwen3_4b_tokenizer') or not st.session_state.get('qwen3_4b_model'):
374
- st.warning("Qwen3-4B model not available, using fallback intent score.")
375
- return 0.1
376
-
377
- response_text = generate_qwen3_response(
378
  prompt,
379
- st.session_state.qwen3_4b_tokenizer,
380
- st.session_state.qwen3_4b_model,
381
- max_new_tokens=20000
382
  )
383
 
384
- # Parse thinking content and intent decision
385
- thinking_content = "No detailed thought process extracted."
386
- intent_decision_part = response_text
387
- think_start_tag = "<think>"
388
- think_end_tag = "</think>"
389
- start_index = response_text.find(think_start_tag)
390
- end_index = response_text.rfind(think_end_tag)
391
- if start_index != -1 and end_index != -1 and start_index < end_index:
392
- thinking_content = response_text[start_index + len(think_start_tag):end_index].strip()
393
- intent_decision_part = response_text[end_index + len(think_end_tag):].strip()
394
-
395
- response_lower = intent_decision_part.lower()
396
- intent_score = 0.1 # Default "Maybe" score
397
-
398
  if 'intent: yes' in response_lower or 'intent:yes' in response_lower:
399
- intent_score = 0.3
400
- elif 'intent: no' in response_lower or 'intent:no' in response_lower:
401
- intent_score = 0.0
402
  elif 'intent: maybe' in response_lower or 'intent:maybe' in response_lower:
403
- intent_score = 0.1
404
-
405
- return intent_score
 
406
  except Exception as e:
407
- st.warning(f"Error analyzing intent with Qwen3-4B: {str(e)}")
408
- return 0.1
409
-
410
  def calculate_final_scores(self, results_with_all_scores):
411
- final_results = []
412
- for idx, cross_score, bm25_score, intent_score in results_with_all_scores:
413
- normalized_cross = max(0, min(1, cross_score))
414
- final_score = normalized_cross + bm25_score + intent_score
415
- final_results.append({
416
- 'index': idx, 'cross_encoder_score': normalized_cross,
417
- 'bm25_score': bm25_score, 'intent_score': intent_score,
418
- 'final_score': final_score
419
- })
420
- final_results.sort(key=lambda x: x['final_score'], reverse=True)
421
- return final_results
422
-
 
 
423
  def extract_skills(self, text, job_description):
424
- # ... (implementation)
425
- if not text: return []
426
- common_skills = ["python", "java", "javascript", "react", "angular", "vue", "node.js", "express", "django", "flask", "spring", "sql", "nosql", "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "git", "github", "agile", "scrum", "jira", "ci/cd", "devops", "microservices", "rest", "api", "machine learning", "deep learning", "data science", "artificial intelligence", "tensorflow", "pytorch", "keras", "scikit-learn", "pandas", "numpy", "matplotlib", "seaborn", "jupyter", "r", "sas", "spss", "tableau", "powerbi", "excel", "mysql", "postgresql", "mongodb", "redis", "elasticsearch", "kafka", "rabbitmq", "spark", "hadoop", "hive", "airflow", "linux", "unix"]
 
 
427
  job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
 
 
428
  found_skills = []
429
  text_lower = text.lower()
 
 
430
  for skill in common_skills:
431
  if skill in text_lower and any(skill in job_word for job_word in job_words):
432
  found_skills.append(skill)
 
 
433
  for word in job_words:
434
- if len(word) > 3 and word in text_lower and word not in found_skills and word not in ['with', 'have', 'that', 'this', 'from', 'what', 'when', 'where']:
435
- found_skills.append(word)
436
- return list(set(found_skills))[:15]
 
 
437
 
438
  def create_download_link(df, filename="resume_screening_results.csv"):
439
- # ... (implementation)
440
  csv = df.to_csv(index=False)
441
  b64 = base64.b64encode(csv.encode()).decode()
442
  return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-btn">πŸ“₯ Download Results CSV</a>'
443
 
444
- # --- Sidebar Configuration (Must be after global model loading and class defs if it uses them) ---
445
- st.markdown("---")
446
- st.markdown("### πŸ€– Advanced Pipeline")
447
- st.markdown("- **Stage 1**: FAISS Recall (Top 50)")
448
- st.markdown("- **Stage 2**: Cross-Encoder Re-ranking (Top 20)")
449
- st.markdown("- **Stage 3**: BM25 Keyword Matching")
450
- st.markdown("- **Stage 4**: LLM Intent Analysis (Qwen3-4B)")
451
- st.markdown("- **Final**: Combined Scoring") # Updated this line
452
- st.markdown("### πŸ“Š Models Used")
453
- st.markdown("- **Embedding**: BAAI/bge-large-en-v1.5")
454
- st.markdown("- **Cross-Encoder**: ms-marco-MiniLM-L6-v2")
455
- st.markdown("- **LLM**: Qwen/Qwen3-4B (4-bit quantized)")
456
- st.markdown("### πŸ“ˆ Scoring Formula")
457
- st.markdown("**Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)**")
458
-
459
- # --- Main App Interface (Must be after global model loading and class defs) ---
460
  st.title("🎯 AI-Powered Resume Screener")
461
- # ... (Model Loading Status display as before)
462
- # ...
463
- st.markdown("*Find the perfect candidates using BAAI/bge-large-en-v1.5 embeddings and Qwen3-4B for intent analysis*")
464
-
465
- st.subheader("πŸ€– Model Loading Status")
466
- col1, col2 = st.columns(2)
467
- with col1:
468
- if st.session_state.get('embedding_model_error'):
469
- st.error(f"Embedding Model: {st.session_state.embedding_model_error}")
470
- elif st.session_state.get('embedding_model'):
471
- st.success("βœ… Embedding Model (BAAI/bge-large-en-v1.5) loaded.")
472
- else:
473
- st.warning("⏳ Embedding Model loading or not found (check console).")
474
- if st.session_state.get('cross_encoder_error'):
475
- st.error(f"Cross-Encoder Model: {st.session_state.cross_encoder_error}")
476
- elif st.session_state.get('cross_encoder'):
477
- st.success("βœ… Cross-Encoder Model (ms-marco-MiniLM-L6-v2) loaded.")
478
- else:
479
- st.warning("⏳ Cross-Encoder Model loading or not found (check console).")
480
- with col2:
481
- if st.session_state.get('qwen3_4b_tokenizer_error'):
482
- st.error(f"Qwen3-4B Tokenizer: {st.session_state.qwen3_4b_tokenizer_error}")
483
- elif st.session_state.get('qwen3_4b_tokenizer'):
484
- st.success("βœ… Qwen3-4B Tokenizer loaded.")
485
- else:
486
- st.warning("⏳ Qwen3-4B Tokenizer loading or not found (check console).")
487
- if st.session_state.get('qwen3_4b_model_error'):
488
- st.error(f"Qwen3-4B Model: {st.session_state.qwen3_4b_model_error}")
489
- elif st.session_state.get('qwen3_4b_model'):
490
- st.success("βœ… Qwen3-4B Model loaded (4-bit quantized).")
491
- else:
492
- st.warning("⏳ Qwen3-4B Model loading or not found (check console).")
493
  st.markdown("---")
494
 
495
- # Initialize screener (This line was causing NameError, ensure class is defined above)
496
  screener = ResumeScreener()
497
 
498
  # Job Description Input
@@ -516,6 +631,7 @@ if st.session_state.resume_texts:
516
  st.session_state.resume_texts = []
517
  st.session_state.file_names = []
518
  st.session_state.results = []
 
519
  st.session_state.current_job_description = ""
520
  st.rerun()
521
 
@@ -683,60 +799,115 @@ col1, col2 = st.columns([1, 1])
683
 
684
  with col1:
685
  if st.button("πŸš€ Advanced Pipeline Analysis",
686
- disabled=not (job_description and st.session_state.resume_texts and
687
- st.session_state.get('embedding_model') and
688
- st.session_state.get('cross_encoder') and
689
- st.session_state.get('qwen3_4b_model') and
690
- st.session_state.get('qwen3_4b_tokenizer')),
691
  type="primary",
692
  help="Run the complete 5-stage advanced pipeline"):
693
- print("--- Advanced Pipeline Analysis Button Clicked ---")
694
- if len(st.session_state.resume_texts) == 0:
695
- st.error("❌ Please upload resumes first!")
696
- elif not job_description.strip():
697
- st.error("❌ Please enter a job description!")
698
- else:
699
- print("[UI Button] Pre-checks passed. Starting spinner and pipeline.")
700
- with st.spinner("πŸš€ Running Advanced Pipeline Analysis..."):
701
- st.text("Pipeline Initiated: Starting advanced analysis...")
702
- try:
703
- # Run the advanced pipeline
704
- pipeline_results = screener.advanced_pipeline_ranking(
705
- st.session_state.resume_texts, job_description
706
- )
707
-
708
- # Prepare results for display
709
- results = []
710
-
711
- for rank, result_data in enumerate(pipeline_results, 1):
712
- idx = result_data['index']
713
- name = st.session_state.file_names[idx]
714
- text = st.session_state.resume_texts[idx]
715
-
716
- # Extract skills
717
- skills = screener.extract_skills(text, job_description)
718
-
719
- results.append({
720
- 'rank': rank,
721
- 'name': name,
722
- 'final_score': result_data['final_score'],
723
- 'cross_encoder_score': result_data['cross_encoder_score'],
724
- 'bm25_score': result_data['bm25_score'],
725
- 'intent_score': result_data['intent_score'],
726
- 'skills': skills,
727
- 'text': text,
728
- 'text_preview': text[:500] + "..." if len(text) > 500 else text
729
- })
730
-
731
- # Store in session state
732
- st.session_state.results = results
733
- st.session_state.current_job_description = job_description
734
-
735
- st.success(f"πŸš€ Advanced pipeline complete! Found top {len(st.session_state.results)} candidates.")
736
- st.text("Displaying Top Candidates...")
737
-
738
- except Exception as e:
739
- st.error(f"❌ Error during analysis: {str(e)}")
 
 
740
 
741
  # Display Results
742
  if st.session_state.results:
@@ -762,7 +933,7 @@ if st.session_state.results:
762
  "Top Skills": ", ".join(result['skills'][:5])
763
  })
764
 
765
- summary_df = pd.DataFrame(summary_data)
766
 
767
  # Style the dataframe
768
  def color_scores(val):
@@ -805,6 +976,7 @@ if st.session_state.results:
805
  "Intent_Score": result['intent_score'],
806
  "Intent_Analysis": intent_text,
807
  "Skills": "; ".join(result['skills']),
 
808
  "Resume_Preview": result['text_preview']
809
  })
810
 
@@ -835,6 +1007,9 @@ if st.session_state.results:
835
  st.write(f"β€’ {skill}")
836
 
837
  with col2:
 
 
 
838
  st.write("**πŸ“„ Resume Preview:**")
839
  st.text_area("", result['text_preview'], height=200, disabled=True, key=f"preview_{result['rank']}")
840
 
@@ -894,6 +1069,7 @@ with col1:
894
  st.session_state.resume_texts = []
895
  st.session_state.file_names = []
896
  st.session_state.results = []
 
897
  st.session_state.current_job_description = ""
898
  st.success("βœ… Resumes cleared!")
899
  st.rerun()
@@ -903,6 +1079,7 @@ with col2:
903
  st.session_state.resume_texts = []
904
  st.session_state.file_names = []
905
  st.session_state.results = []
 
906
  st.session_state.current_job_description = ""
907
 
908
  if torch.cuda.is_available():
@@ -916,7 +1093,7 @@ st.markdown("---")
916
  st.markdown(
917
  """
918
  <div style='text-align: center; color: #666;'>
919
- 🚀 Powered by BAAI/bge-large-en-v1.5 & Qwen3-4B (4-bit) | Built with Streamlit
920
  </div>
921
  """,
922
  unsafe_allow_html=True
 
15
  import csv
16
  from datasets import load_dataset
17
  import gc
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer
19
  import time
20
  import faiss
21
  import re
 
34
  initial_sidebar_state="expanded"
35
  )
36
 
37
+ # Sidebar configuration
38
+ with st.sidebar:
39
+ st.title("βš™οΈ Configuration")
40
+
41
+ # Ranking weights
42
+ st.subheader("Ranking Weights")
43
+ semantic_weight = st.slider("Semantic Similarity Weight", 0.0, 1.0, 0.7, 0.1)
44
+ keyword_weight = 1.0 - semantic_weight
45
+ st.write(f"Keyword Weight: {keyword_weight:.1f}")
46
+
47
+ # Advanced options
48
+ st.subheader("Advanced Options")
49
+ top_k = st.selectbox("Number of results to display", options=[1, 2, 3, 4, 5], index=4)
50
+
51
+ # LLM Settings
52
+ st.subheader("LLM Settings")
53
+ use_llm_explanations = st.checkbox("Generate AI Explanations", value=True)
54
+ if use_llm_explanations:
55
+ hf_token = st.text_input("Hugging Face Token (optional)", type="password",
56
+ help="Enter your HF token for better rate limits")
57
+
58
+ st.markdown("---")
59
+ st.markdown("### πŸ€– Advanced Pipeline")
60
+ st.markdown("- **Stage 1**: FAISS Recall (Top 50)")
61
+ st.markdown("- **Stage 2**: Cross-Encoder Re-ranking (Top 20)")
62
+ st.markdown("- **Stage 3**: BM25 Keyword Matching")
63
+ st.markdown("- **Stage 4**: LLM Intent Analysis")
64
+ st.markdown("- **Final**: Combined Scoring (Top 5)")
65
+ st.markdown("### πŸ“Š Models Used")
66
+ st.markdown("- **Embedding**: BAAI/bge-large-en-v1.5")
67
+ st.markdown("- **Cross-Encoder**: ms-marco-MiniLM-L6-v2")
68
+ st.markdown("- **LLM Explanations**: Qwen/Qwen3-14B")
69
+ st.markdown("- **Intent Analysis**: Qwen/Qwen3-1.7B")
70
+ st.markdown("### πŸ“ˆ Scoring Formula")
71
+ st.markdown("**Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)**")
 
 
 
 
72
 
73
+ # Initialize session state
74
+ if 'embedding_model' not in st.session_state:
75
+ st.session_state.embedding_model = None
76
+ if 'cross_encoder' not in st.session_state:
77
+ st.session_state.cross_encoder = None
78
+ if 'results' not in st.session_state:
79
+ st.session_state.results = []
80
+ if 'resume_texts' not in st.session_state:
81
+ st.session_state.resume_texts = []
82
+ if 'file_names' not in st.session_state:
83
+ st.session_state.file_names = []
84
+ if 'explanations_generated' not in st.session_state:
85
+ st.session_state.explanations_generated = False
86
+ if 'current_job_description' not in st.session_state:
87
+ st.session_state.current_job_description = ""
88
+ if 'qwen3_tokenizer' not in st.session_state:
89
+ st.session_state.qwen3_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")
90
+ if 'qwen3_model' not in st.session_state:
91
+ st.session_state.qwen3_model = AutoModelForCausalLM.from_pretrained(
92
+ "Qwen/Qwen3-14B",
93
+ torch_dtype="auto",
94
+ device_map="auto"
95
+ )
96
+ # Separate smaller model for intent analysis
97
+ if 'qwen3_intent_tokenizer' not in st.session_state:
98
+ st.session_state.qwen3_intent_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
99
+ if 'qwen3_intent_model' not in st.session_state:
100
+ st.session_state.qwen3_intent_model = AutoModelForCausalLM.from_pretrained(
101
+ "Qwen/Qwen3-1.7B",
102
+ torch_dtype="auto",
103
+ device_map="auto"
104
+ )
105
 
106
+ @st.cache_resource
107
+ def load_embedding_model():
108
+ """Load and cache the BGE embedding model"""
109
  try:
110
+ with st.spinner("πŸ”„ Loading BAAI/bge-large-en-v1.5 model..."):
111
+ model = SentenceTransformer('BAAI/bge-large-en-v1.5')
112
+ st.success("βœ… Embedding model loaded successfully!")
113
+ return model
114
  except Exception as e:
115
+ st.error(f"❌ Error loading embedding model: {str(e)}")
116
+ return None
 
117
 
118
+ @st.cache_resource
119
+ def load_cross_encoder():
120
+ """Load and cache the Cross-Encoder model"""
121
  try:
122
+ with st.spinner("πŸ”„ Loading Cross-Encoder ms-marco-MiniLM-L6-v2..."):
123
+ from sentence_transformers import CrossEncoder
124
+ model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
125
+ st.success("βœ… Cross-Encoder model loaded successfully!")
126
+ return model
 
 
127
  except Exception as e:
128
+ st.error(f"❌ Error loading Cross-Encoder model: {str(e)}")
129
+ return None
 
 
130
 
131
  def generate_qwen3_response(prompt, tokenizer, model, max_new_tokens=200):
 
132
  messages = [{"role": "user", "content": prompt}]
133
  text = tokenizer.apply_chat_template(
134
  messages,
135
  tokenize=False,
136
  add_generation_prompt=True,
137
+ enable_thinking=True
138
  )
139
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
140
  generated_ids = model.generate(
 
145
  response = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
146
  return response
147
 
148
+ class ResumeScreener:
149
  def __init__(self):
150
+ # Load models
151
+ self.embedding_model = load_embedding_model()
152
+ self.cross_encoder = load_cross_encoder()
 
 
153
 
154
  def extract_text_from_file(self, file_path, file_type):
155
+ """Extract text from various file types"""
156
  try:
157
  if file_type == "pdf":
158
  with open(file_path, 'rb') as file:
 
160
  text = ""
161
  for page in pdf.pages:
162
  text += page.extract_text() or ""
163
+
164
  if not text.strip():
165
+ # Fallback to PyPDF2
166
  file.seek(0)
167
  reader = PyPDF2.PdfReader(file)
168
  text = ""
169
+ for page in reader.pages:
170
+ text += page.extract_text() or ""
171
  return text
172
+
173
  elif file_type == "docx":
174
  doc = Document(file_path)
175
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
176
+
177
  elif file_type == "txt":
178
  with open(file_path, 'r', encoding='utf-8') as file:
179
  return file.read()
180
+
181
  elif file_type == "csv":
182
  with open(file_path, 'r', encoding='utf-8') as file:
183
  csv_reader = csv.reader(file)
184
  return " ".join([" ".join(row) for row in csv_reader])
185
+
186
  except Exception as e:
187
  st.error(f"Error extracting text from {file_path}: {str(e)}")
188
  return ""
189
 
190
  def get_embedding(self, text):
191
+ """Generate embedding for text using BGE model"""
192
  if self.embedding_model is None:
193
+ st.error("No embedding model loaded!")
194
+ return np.zeros(1024) # BGE-large dimension
195
+
196
  try:
197
+ # BGE models recommend adding instruction for retrieval
198
+ # For queries (job description)
199
+ if len(text) < 500: # Assuming shorter texts are queries
200
  text = "Represent this sentence for searching relevant passages: " + text
201
+
202
+ # Truncate text to avoid memory issues
203
  text = text[:8192] if text else ""
204
+
205
+ # Generate embedding
206
+ embedding = self.embedding_model.encode(text,
207
+ convert_to_numpy=True,
208
+ normalize_embeddings=True)
209
  return embedding
210
+
211
  except Exception as e:
212
  st.error(f"Error generating embedding: {str(e)}")
213
+ return np.zeros(1024) # BGE-large dimension
214
 
215
  def calculate_bm25_scores(self, resume_texts, job_description):
216
+ """Calculate BM25 scores for keyword matching"""
217
  try:
218
  job_tokens = word_tokenize(job_description.lower())
219
  corpus = [word_tokenize(text.lower()) for text in resume_texts if text and text.strip()]
220
+
221
  if not corpus:
222
  return [0.0] * len(resume_texts)
223
+
224
  bm25 = BM25Okapi(corpus)
225
  scores = bm25.get_scores(job_tokens)
226
  return scores.tolist()
227
+
228
  except Exception as e:
229
  st.error(f"Error calculating BM25 scores: {str(e)}")
230
  return [0.0] * len(resume_texts)
231
 
232
+ def advanced_pipeline_ranking(self, resume_texts, job_description, final_top_k=5):
233
+ """Advanced pipeline: FAISS recall -> Cross-encoder -> BM25 -> LLM intent -> Final ranking"""
234
  if not resume_texts:
235
  return []
236
+
237
+ # Stage 1: FAISS Recall (Top 50)
238
+ st.write("πŸ” **Stage 1**: FAISS Recall - Finding top 50 candidates...")
239
  top_50_indices = self.faiss_recall(resume_texts, job_description, top_k=50)
240
+
241
+ # Stage 2: Cross-Encoder Re-ranking (Top 20)
242
+ st.write("🎯 **Stage 2**: Cross-Encoder Re-ranking - Selecting top 20...")
243
  top_20_results = self.cross_encoder_rerank(resume_texts, job_description, top_50_indices, top_k=20)
244
+
245
+ # Stage 3: BM25 Keyword Matching
246
+ st.write("πŸ”€ **Stage 3**: BM25 Keyword Matching...")
247
  top_20_with_bm25 = self.add_bm25_scores(resume_texts, job_description, top_20_results)
248
+
249
+ # Stage 4: LLM Intent Analysis (using Qwen3-1.7B)
250
+ st.write("πŸ€– **Stage 4**: LLM Intent Analysis...")
251
  top_20_with_intent = self.add_intent_scores(resume_texts, job_description, top_20_with_bm25)
252
+
253
+ # Stage 5: Final Combined Ranking
254
+ st.write(f"πŸ† **Stage 5**: Final Combined Ranking - Selecting top {final_top_k}...")
255
  final_results = self.calculate_final_scores(top_20_with_intent)
256
+
257
+ return final_results[:final_top_k] # Return top K as selected by user
258
+
259
  def faiss_recall(self, resume_texts, job_description, top_k=50):
260
+ """Stage 1: Use FAISS for initial recall to find top 50 resumes"""
261
+ try:
262
+ # Get job embedding
263
+ job_embedding = self.get_embedding(job_description)
264
+
265
+ # Get resume embeddings
266
+ resume_embeddings = []
267
+ progress_bar = st.progress(0)
268
+
269
+ for i, text in enumerate(resume_texts):
270
+ if text:
271
+ embedding = self.embedding_model.encode(text[:8192],
272
+ convert_to_numpy=True,
273
+ normalize_embeddings=True)
274
+ resume_embeddings.append(embedding)
275
+ else:
276
+ resume_embeddings.append(np.zeros(1024))
277
+ progress_bar.progress((i + 1) / len(resume_texts))
278
+
279
+ progress_bar.empty()
280
+
281
+ # Create FAISS index
282
+ resume_embeddings = np.array(resume_embeddings).astype('float32')
283
+ dimension = resume_embeddings.shape[1]
284
+ index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity
285
+ index.add(resume_embeddings)
286
+
287
+ # Search for top K
288
+ job_embedding = job_embedding.reshape(1, -1).astype('float32')
289
+ scores, indices = index.search(job_embedding, min(top_k, len(resume_texts)))
290
+
291
+ return indices[0].tolist()
292
+
293
+ except Exception as e:
294
+ st.error(f"Error in FAISS recall: {str(e)}")
295
+ # Fallback: return all indices
296
+ return list(range(min(top_k, len(resume_texts))))
297
+
298
  def cross_encoder_rerank(self, resume_texts, job_description, top_50_indices, top_k=20):
299
+ """Stage 2: Use Cross-Encoder to re-rank top 50 and select top 20"""
300
+ try:
301
+ if not self.cross_encoder:
302
+ st.error("Cross-encoder not loaded!")
303
+ return [(idx, 0.0) for idx in top_50_indices[:top_k]]
304
+
305
+ # Prepare pairs for cross-encoder
306
+ pairs = []
307
+ valid_indices = []
308
+
309
+ for idx in top_50_indices:
310
+ if idx < len(resume_texts) and resume_texts[idx]:
311
+ # Truncate texts for cross-encoder
312
+ job_snippet = job_description[:512]
313
+ resume_snippet = resume_texts[idx][:512]
314
+ pairs.append([job_snippet, resume_snippet])
315
+ valid_indices.append(idx)
316
+
317
+ if not pairs:
318
+ return [(idx, 0.0) for idx in top_50_indices[:top_k]]
319
+
320
+ # Get cross-encoder scores
321
+ progress_bar = st.progress(0)
322
+ scores = []
323
+
324
+ # Process in batches to avoid memory issues
325
+ batch_size = 8
326
+ for i in range(0, len(pairs), batch_size):
327
+ batch = pairs[i:i+batch_size]
328
+ batch_scores = self.cross_encoder.predict(batch)
329
+ scores.extend(batch_scores)
330
+ progress_bar.progress(min(1.0, (i + batch_size) / len(pairs)))
331
+
332
+ progress_bar.empty()
333
+
334
+ # Combine indices with scores and sort
335
+ indexed_scores = list(zip(valid_indices, scores))
336
+ indexed_scores.sort(key=lambda x: x[1], reverse=True)
337
+
338
+ return indexed_scores[:top_k]
339
+
340
+ except Exception as e:
341
+ st.error(f"Error in cross-encoder re-ranking: {str(e)}")
342
  return [(idx, 0.0) for idx in top_50_indices[:top_k]]
343
+
 
 
344
  def add_bm25_scores(self, resume_texts, job_description, top_20_results):
345
+ """Stage 3: Add BM25 scores to top 20 resumes"""
346
+ try:
347
+ # Get texts for top 20
348
+ top_20_texts = [resume_texts[idx] for idx, _ in top_20_results]
349
+
350
+ # Calculate BM25 scores
351
+ bm25_scores = self.calculate_bm25_scores(top_20_texts, job_description)
352
+
353
+ # Normalize BM25 scores to 0.1-0.2 range
354
+ if bm25_scores and max(bm25_scores) > 0:
355
+ max_bm25 = max(bm25_scores)
356
+ min_bm25 = min(bm25_scores)
357
+ if max_bm25 > min_bm25:
358
+ normalized_bm25 = [
359
+ 0.1 + 0.1 * (score - min_bm25) / (max_bm25 - min_bm25)
360
+ for score in bm25_scores
361
+ ]
362
+ else:
363
+ normalized_bm25 = [0.15] * len(bm25_scores)
364
  else:
365
+ normalized_bm25 = [0.15] * len(top_20_results)
366
+
367
+ # Combine with existing results
368
+ results_with_bm25 = []
369
+ for i, (idx, cross_score) in enumerate(top_20_results):
370
+ bm25_score = normalized_bm25[i] if i < len(normalized_bm25) else 0.15
371
+ results_with_bm25.append((idx, cross_score, bm25_score))
372
+
373
+ return results_with_bm25
374
+
375
+ except Exception as e:
376
+ st.error(f"Error adding BM25 scores: {str(e)}")
377
+ return [(idx, cross_score, 0.15) for idx, cross_score in top_20_results]
378
+
379
  def add_intent_scores(self, resume_texts, job_description, top_20_with_bm25):
380
+ """Stage 4: Add LLM intent analysis scores"""
381
+ try:
382
+ results_with_intent = []
383
+ progress_bar = st.progress(0)
384
+
385
+ for i, (idx, cross_score, bm25_score) in enumerate(top_20_with_bm25):
386
+ intent_score = self.analyze_intent(resume_texts[idx], job_description)
387
+ results_with_intent.append((idx, cross_score, bm25_score, intent_score))
388
+ progress_bar.progress((i + 1) / len(top_20_with_bm25))
389
+
390
+ progress_bar.empty()
391
+ return results_with_intent
392
+
393
+ except Exception as e:
394
+ st.error(f"Error adding intent scores: {str(e)}")
395
+ return [(idx, cross_score, bm25_score, 0.1) for idx, cross_score, bm25_score in top_20_with_bm25]
396
+
397
  def analyze_intent(self, resume_text, job_description):
398
+ """Analyze candidate's intent using LLM"""
 
399
  try:
400
+ # Truncate texts
401
+ resume_snippet = resume_text[:1500] if len(resume_text) > 1500 else resume_text
402
+ job_snippet = job_description[:800] if len(job_description) > 800 else job_description
403
 
404
+ prompt = f"""You are given a job description and a candidate's resume.
405
+ Clearly answer: "Is the candidate likely seeking this job? Respond with 'Yes', 'Maybe', or 'No' and give a brief justification."
406
 
407
  Job Description:
408
  {job_snippet}
 
410
  Candidate Resume:
411
  {resume_snippet}
412
 
413
+ Response format:
414
+ Intent: [Yes/Maybe/No]
415
+ Reason: [Brief justification]"""
416
+
417
+ response = generate_qwen3_response(
 
 
418
  prompt,
419
+ st.session_state.qwen3_intent_tokenizer,
420
+ st.session_state.qwen3_intent_model,
421
+ max_new_tokens=100
422
  )
423
 
424
+ # Parse response
425
+ response_lower = response.lower()
 
 
426
  if 'intent: yes' in response_lower or 'intent:yes' in response_lower:
427
+ return 0.3
 
 
428
  elif 'intent: maybe' in response_lower or 'intent:maybe' in response_lower:
429
+ return 0.1
430
+ else:
431
+ return 0.0
432
+
433
  except Exception as e:
434
+ st.warning(f"Error analyzing intent: {str(e)}")
435
+ return 0.1 # Default to "Maybe"
436
+
437
  def calculate_final_scores(self, results_with_all_scores):
438
+ """Stage 5: Calculate final combined scores"""
439
+ try:
440
+ final_results = []
441
+
442
+ for idx, cross_score, bm25_score, intent_score in results_with_all_scores:
443
+ # Normalize cross-encoder score to 0-1 range
444
+ normalized_cross = max(0, min(1, cross_score))
445
+
446
+ # Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)
447
+ final_score = normalized_cross + bm25_score + intent_score
448
+
449
+ final_results.append({
450
+ 'index': idx,
451
+ 'cross_encoder_score': normalized_cross,
452
+ 'bm25_score': bm25_score,
453
+ 'intent_score': intent_score,
454
+ 'final_score': final_score
455
+ })
456
+
457
+ # Sort by final score
458
+ final_results.sort(key=lambda x: x['final_score'], reverse=True)
459
+
460
+ return final_results
461
+
462
+ except Exception as e:
463
+ st.error(f"Error calculating final scores: {str(e)}")
464
+ return []
465
+
466
  def extract_skills(self, text, job_description):
467
+ """Extract skills from resume based on job description"""
468
+ if not text:
469
+ return []
470
+
471
+ # Common tech skills
472
+ common_skills = [
473
+ "python", "java", "javascript", "react", "angular", "vue", "node.js",
474
+ "express", "django", "flask", "spring", "sql", "nosql", "html", "css",
475
+ "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "git", "github",
476
+ "agile", "scrum", "jira", "ci/cd", "devops", "microservices", "rest", "api",
477
+ "machine learning", "deep learning", "data science", "artificial intelligence",
478
+ "tensorflow", "pytorch", "keras", "scikit-learn", "pandas", "numpy",
479
+ "matplotlib", "seaborn", "jupyter", "r", "sas", "spss", "tableau", "powerbi",
480
+ "excel", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
481
+ "kafka", "rabbitmq", "spark", "hadoop", "hive", "airflow", "linux", "unix"
482
+ ]
483
+
484
+ # Extract potential skills from job description
485
  job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
486
+
487
+ # Find matching skills
488
  found_skills = []
489
  text_lower = text.lower()
490
+
491
+ # Check common skills that appear in both resume and job description
492
  for skill in common_skills:
493
  if skill in text_lower and any(skill in job_word for job_word in job_words):
494
  found_skills.append(skill)
495
+
496
+ # Check for skills mentioned in job description
497
  for word in job_words:
498
+ if len(word) > 3 and word in text_lower and word not in found_skills:
499
+ # Basic filter to avoid common words
500
+ if word not in ['with', 'have', 'that', 'this', 'from', 'what', 'when', 'where']:
501
+ found_skills.append(word)
502
+
503
+ return list(set(found_skills))[:15] # Return top 15 unique skills
504
+
505
+ def generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
506
+ """Generate simple explanation for the match (fallback)"""
507
+ if score > 0.8:
508
+ quality = "excellent"
509
+ elif score > 0.6:
510
+ quality = "strong"
511
+ elif score > 0.4:
512
+ quality = "moderate"
513
+ else:
514
+ quality = "limited"
515
+
516
+ explanation = f"This candidate shows {quality} alignment with the position (score: {score:.2f}). "
517
+
518
+ if semantic_score > bm25_score:
519
+ explanation += f"The resume demonstrates strong conceptual relevance ({semantic_score:.2f}) suggesting good experience fit. "
520
+ else:
521
+ explanation += f"The resume has high keyword match ({bm25_score:.2f}) indicating direct skill alignment. "
522
+
523
+ if skills:
524
+ explanation += f"Key matching competencies include: {', '.join(skills[:5])}."
525
+
526
+ return explanation
527
+
528
+ def generate_llm_explanation(self, resume_text, job_description, score, skills, max_retries=3):
529
+ """Generate detailed explanation using Qwen3-14B"""
530
+ if not st.session_state.qwen3_model:
531
+ return self.generate_simple_explanation(score, score, score, skills)
532
+
533
+ # Truncate texts to manage token limits
534
+ resume_snippet = resume_text[:2000] if len(resume_text) > 2000 else resume_text
535
+ job_snippet = job_description[:1000] if len(job_description) > 1000 else job_description
536
+
537
+ prompt = f"""You are an expert HR analyst. Analyze this individual candidate's resume against the job requirements and write EXACTLY 150 words explaining why this specific candidate is suitable for the position.
538
+
539
+ Structure your 150-word analysis as follows:
540
+ 1. Experience alignment (40-50 words)
541
+ 2. Key strengths and skills match (40-50 words)
542
+ 3. Unique value proposition (40-50 words)
543
+ 4. Overall recommendation (10-20 words)
544
+
545
+ Job Requirements:
546
+ {job_snippet}
547
+
548
+ Candidate's Resume:
549
+ {resume_snippet}
550
+
551
+ Identified Matching Skills: {', '.join(skills[:10])}
552
+ Compatibility Score: {score:.1%}
553
+
554
+ Write a professional, detailed 150-word analysis for THIS INDIVIDUAL CANDIDATE:"""
555
+
556
+ for attempt in range(max_retries):
557
+ try:
558
+ response = generate_qwen3_response(
559
+ prompt,
560
+ st.session_state.qwen3_tokenizer,
561
+ st.session_state.qwen3_model,
562
+ max_new_tokens=200
563
+ )
564
+
565
+ # Extract the response and ensure it's about 150 words
566
+ explanation = response.strip()
567
+ word_count = len(explanation.split())
568
+
569
+ # If response is close to 150 words (130-170), accept it
570
+ if 130 <= word_count <= 170:
571
+ return explanation
572
+
573
+ # If response is too short or too long, try again with adjusted prompt
574
+ if word_count < 130:
575
+ # Response too short, try again
576
+ continue
577
+ elif word_count > 170:
578
+ # Response too long, truncate to approximately 150 words
579
+ words = explanation.split()
580
+ truncated = ' '.join(words[:150])
581
+ # Add proper ending if truncated
582
+ if not truncated.endswith('.'):
583
+ truncated += '.'
584
+ return truncated
585
+
586
+ return explanation
587
+
588
+ except Exception as e:
589
+ if attempt < max_retries - 1:
590
+ time.sleep(2) # Wait before retry
591
+ continue
592
+ else:
593
+ # Fallback to simple explanation
594
+ return self.generate_simple_explanation(score, score, score, skills)
595
+
596
+ # If all retries failed, use simple explanation
597
+ return self.generate_simple_explanation(score, score, score, skills)
598
 
599
  def create_download_link(df, filename="resume_screening_results.csv"):
600
+ """Create download link for results"""
601
  csv = df.to_csv(index=False)
602
  b64 = base64.b64encode(csv.encode()).decode()
603
  return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-btn">πŸ“₯ Download Results CSV</a>'
604
 
605
+ # Main App Interface
 
606
  st.title("🎯 AI-Powered Resume Screener")
607
+ st.markdown("*Find the perfect candidates using BAAI/bge-large-en-v1.5 embeddings and Qwen3-14B explanations*")
 
 
608
  st.markdown("---")
609
 
610
+ # Initialize screener
611
  screener = ResumeScreener()
612
 
613
  # Job Description Input
 
631
  st.session_state.resume_texts = []
632
  st.session_state.file_names = []
633
  st.session_state.results = []
634
+ st.session_state.explanations_generated = False
635
  st.session_state.current_job_description = ""
636
  st.rerun()
637
 
 
799
 
800
  with col1:
801
  if st.button("πŸš€ Advanced Pipeline Analysis",
802
+ disabled=not (job_description and st.session_state.resume_texts),
 
 
 
 
803
  type="primary",
804
  help="Run the complete 5-stage advanced pipeline"):
805
+ if len(st.session_state.resume_texts) == 0:
806
+ st.error("❌ Please upload resumes first!")
807
+ elif not job_description.strip():
808
+ st.error("❌ Please enter a job description!")
809
+ else:
810
+ with st.spinner("πŸš€ Running Advanced Pipeline Analysis..."):
811
+ try:
812
+ # Run the advanced pipeline
813
+ pipeline_results = screener.advanced_pipeline_ranking(
814
+ st.session_state.resume_texts, job_description, final_top_k=top_k
815
+ )
816
+
817
+ # Prepare results for display
818
+ results = []
819
+
820
+ for rank, result_data in enumerate(pipeline_results, 1):
821
+ idx = result_data['index']
822
+ name = st.session_state.file_names[idx]
823
+ text = st.session_state.resume_texts[idx]
824
+
825
+ # Extract skills
826
+ skills = screener.extract_skills(text, job_description)
827
+
828
+ results.append({
829
+ 'rank': rank,
830
+ 'name': name,
831
+ 'final_score': result_data['final_score'],
832
+ 'cross_encoder_score': result_data['cross_encoder_score'],
833
+ 'bm25_score': result_data['bm25_score'],
834
+ 'intent_score': result_data['intent_score'],
835
+ 'skills': skills,
836
+ 'text': text,
837
+ 'text_preview': text[:500] + "..." if len(text) > 500 else text,
838
+ 'explanation': None # No detailed explanation yet
839
+ })
840
+
841
+ # Add simple explanations for now
842
+ for result in results:
843
+ result['explanation'] = screener.generate_simple_explanation(
844
+ result['final_score'],
845
+ result['cross_encoder_score'],
846
+ result['bm25_score'],
847
+ result['skills']
848
+ )
849
+
850
+ # Store in session state
851
+ st.session_state.results = results
852
+ st.session_state.explanations_generated = False
853
+ st.session_state.current_job_description = job_description
854
+
855
+ st.success(f"πŸš€ Advanced pipeline complete! Found top {len(st.session_state.results)} candidates.")
856
+
857
+ except Exception as e:
858
+ st.error(f"❌ Error during analysis: {str(e)}")
859
+
860
+ # Second button: Generate AI explanations (slower, optional)
861
+ with col2:
862
+ # Show this button only if we have results and LLM is enabled
863
+ show_explanation_button = (
864
+ st.session_state.results and
865
+ use_llm_explanations and
866
+ st.session_state.qwen3_model and
867
+ not st.session_state.explanations_generated
868
+ )
869
+
870
+ if show_explanation_button:
871
+ if st.button("πŸ€– Generate AI Explanations",
872
+ type="secondary",
873
+ help="Generate detailed 150-word explanations using Qwen3-14B (takes longer)"):
874
+ with st.spinner("πŸ€– Generating detailed AI explanations..."):
875
+ try:
876
+ explanation_progress = st.progress(0)
877
+ explanation_text = st.empty()
878
+
879
+ for i, result in enumerate(st.session_state.results):
880
+ explanation_text.text(f"πŸ€– Generating AI explanation for candidate {i+1}/{len(st.session_state.results)}...")
881
+
882
+ llm_explanation = screener.generate_llm_explanation(
883
+ result['text'],
884
+ st.session_state.current_job_description,
885
+ result['final_score'],
886
+ result['skills']
887
+ )
888
+ result['explanation'] = llm_explanation
889
+
890
+ explanation_progress.progress((i + 1) / len(st.session_state.results))
891
+
892
+ explanation_progress.empty()
893
+ explanation_text.empty()
894
+
895
+ # Mark explanations as generated
896
+ st.session_state.explanations_generated = True
897
+
898
+ st.success(f"πŸ€– AI explanations generated for all {len(st.session_state.results)} candidates!")
899
+
900
+ except Exception as e:
901
+ st.error(f"❌ Error generating explanations: {str(e)}")
902
+
903
+ elif st.session_state.results and st.session_state.explanations_generated:
904
+ st.info("βœ… AI explanations already generated!")
905
+
906
+ elif st.session_state.results and not use_llm_explanations:
907
+ st.info("πŸ’‘ Enable 'Generate AI Explanations' in sidebar to use this feature")
908
+
909
+ elif st.session_state.results and not st.session_state.qwen3_model:
910
+ st.warning("⚠️ LLM model not available. Check your Hugging Face token.")
911
 
912
  # Display Results
913
  if st.session_state.results:
 
933
  "Top Skills": ", ".join(result['skills'][:5])
934
  })
935
 
936
+ summary_df = pd.DataFrame(summary_data)
937
 
938
  # Style the dataframe
939
  def color_scores(val):
 
976
  "Intent_Score": result['intent_score'],
977
  "Intent_Analysis": intent_text,
978
  "Skills": "; ".join(result['skills']),
979
+ "AI_Explanation": result['explanation'],
980
  "Resume_Preview": result['text_preview']
981
  })
982
 
 
1007
  st.write(f"β€’ {skill}")
1008
 
1009
  with col2:
1010
+ st.write("**πŸ’‘ AI-Generated Match Analysis:**")
1011
+ st.info(result['explanation'])
1012
+
1013
  st.write("**πŸ“„ Resume Preview:**")
1014
  st.text_area("", result['text_preview'], height=200, disabled=True, key=f"preview_{result['rank']}")
1015
 
 
1069
  st.session_state.resume_texts = []
1070
  st.session_state.file_names = []
1071
  st.session_state.results = []
1072
+ st.session_state.explanations_generated = False
1073
  st.session_state.current_job_description = ""
1074
  st.success("βœ… Resumes cleared!")
1075
  st.rerun()
 
1079
  st.session_state.resume_texts = []
1080
  st.session_state.file_names = []
1081
  st.session_state.results = []
1082
+ st.session_state.explanations_generated = False
1083
  st.session_state.current_job_description = ""
1084
 
1085
  if torch.cuda.is_available():
 
1093
  st.markdown(
1094
  """
1095
  <div style='text-align: center; color: #666;'>
1096
+ 🚀 Powered by BAAI/bge-large-en-v1.5 & Qwen3-14B | Built with Streamlit
1097
  </div>
1098
  """,
1099
  unsafe_allow_html=True
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  streamlit==1.31.0
2
- transformers>=4.55.0
3
- torch>=2.3.0
4
  pdfplumber==0.10.1
5
  PyPDF2==3.0.1
6
  python-docx==1.0.1
@@ -10,9 +10,10 @@ rank-bm25==0.2.2
10
  pandas==2.1.3
11
  numpy==1.24.3
12
  tqdm==4.66.1
13
- huggingface-hub>=0.27.0
14
- bitsandbytes>=0.44.1
15
- accelerate>=0.27.2
16
  datasets==2.18.0
17
  sentence-transformers==2.7.0
18
- einops
 
 
1
  streamlit==1.31.0
2
+ transformers>=4.51.0
3
+ torch==2.1.2
4
  pdfplumber==0.10.1
5
  PyPDF2==3.0.1
6
  python-docx==1.0.1
 
10
  pandas==2.1.3
11
  numpy==1.24.3
12
  tqdm==4.66.1
13
+ huggingface-hub==0.30.0
14
+ bitsandbytes==0.44.1
15
+ accelerate==0.27.2
16
  datasets==2.18.0
17
  sentence-transformers==2.7.0
18
+ plotly==5.18.0
19
+ einops
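As a quick sanity check of the pins above, a minimal smoke test could be run after installation (a sketch only, not part of this commit; the module names follow the imports already used in app.py, and the printed versions depend on the resolved environment):

    import torch, transformers, sentence_transformers, streamlit, faiss

    # Print the resolved versions of the main pinned packages.
    for mod in (torch, transformers, sentence_transformers, streamlit):
        print(mod.__name__, mod.__version__)
    # faiss is imported by app.py for the recall stage; confirm the index type is present.
    print("faiss IndexFlatIP available:", hasattr(faiss, "IndexFlatIP"))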