Zwounds committed on
Commit
8f0676c
·
verified ·
1 Parent(s): 639add2

Update cv_extraction_app.py

Browse files
Files changed (1) hide show
  1. cv_extraction_app.py +232 -15
cv_extraction_app.py CHANGED
@@ -9,6 +9,26 @@ import google.generativeai as genai
9
  import json
10
  import re
11
  from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Load environment variables from .env file
14
  load_dotenv()
@@ -289,6 +309,8 @@ def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
289
  - "funding_amount": For grants or funded projects (often in "Other" category), the numeric funding amount if explicitly stated (e.g., 250000). Extract only the number, without currency symbols or commas. Use "N/A" if not applicable or not found.
290
  - "confidence": A number from 1-5 indicating your confidence in this categorization (5 being highest confidence).
291
 
 
 
292
  Ensure the entire output is a single, valid JSON object like this example:
293
  {{
294
  "faculty_name": "Example Faculty Name",
@@ -317,29 +339,72 @@ def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
317
  prompt,
318
  generation_config=genai.types.GenerationConfig(
319
  response_mime_type="application/json",
320
- temperature=0.2 # Lower temperature for more consistent JSON formatting
321
  )
322
  )
323
  response_text = response.text.strip()
324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  # Try to fix common JSON formatting issues before parsing
326
  try:
327
- parsed_json = json.loads(response_text)
 
 
328
  except json.JSONDecodeError as e:
329
- logging.warning(f"Initial JSON parsing failed: {e}. Attempting to fix common issues.")
330
 
331
  # Try to extract JSON from markdown code blocks if present
332
- if response_text.startswith("```json") and "```" in response_text:
333
- code_block_content = response_text.split("```")[1]
334
- if code_block_content.startswith("json"):
335
- code_block_content = code_block_content[4:].strip()
336
- try:
337
- parsed_json = json.loads(code_block_content)
338
- logging.info("Successfully extracted JSON from code block")
339
- except json.JSONDecodeError:
340
- raise # Re-raise if this also fails
 
 
 
 
 
 
 
 
 
 
341
  else:
342
- raise # Re-raise the original error if not in a code block
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  extracted_faculty_name = faculty_name_hint or "Unknown"
345
  llm_faculty_name = parsed_json.get("faculty_name", "Unknown")
@@ -361,8 +426,155 @@ def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
361
  return extracted_faculty_name, accomplishments_list
362
  except Exception as e:
363
  logging.error(f"Error in LLM processing: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  return faculty_name_hint or "Unknown", []
365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=2, initial_backoff=3):
367
  """Wrapper function that adds retry logic to the LLM API call."""
368
  retries = 0
@@ -370,8 +582,13 @@ def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=
370
 
371
  while retries <= max_retries:
372
  try:
373
- # Call the original function that might raise exceptions
374
- return get_accomplishments_from_llm(cv_text, faculty_name_hint)
 
 
 
 
 
375
  except json.JSONDecodeError as e:
376
  retries += 1
377
  logging.error(f"JSONDecodeError on attempt {retries}/{max_retries+1}: {e}. Response might not be valid JSON.")
 
9
  import json
10
  import re
11
  from dotenv import load_dotenv
12
+ from typing import List, Optional, Union
13
+ from pydantic import BaseModel, Field
14
+ from pydantic_ai import Agent
15
+ from pydantic_ai.models.gemini import GeminiModel
16
+
17
+ # --- Pydantic Models for Data Structures ---
18
+ class Accomplishment(BaseModel):
19
+ """Pydantic model for a scholarly accomplishment."""
20
+ category: str = Field(..., description="The specific type of scholarly work")
21
+ main_category: str = Field(..., description="The general category this work falls under")
22
+ year: Union[str, int] = Field("N/A", description="The year the accomplishment occurred")
23
+ description: str = Field(..., description="The full description or citation of the accomplishment")
24
+ doi_url: str = Field("N/A", description="The DOI or URL associated with the accomplishment")
25
+ funding_amount: Union[str, int] = Field("N/A", description="For grants or funded projects, the numeric funding amount")
26
+ confidence: int = Field(3, description="A number from 1-5 indicating confidence in this categorization")
27
+
28
+ class CVData(BaseModel):
29
+ """Pydantic model for CV data including faculty name and accomplishments."""
30
+ faculty_name: str = Field(..., description="The name of the faculty member")
31
+ accomplishments: List[Accomplishment] = Field(default_factory=list, description="List of scholarly accomplishments")
32
 
33
  # Load environment variables from .env file
34
  load_dotenv()
 
309
  - "funding_amount": For grants or funded projects (often in "Other" category), the numeric funding amount if explicitly stated (e.g., 250000). Extract only the number, without currency symbols or commas. Use "N/A" if not applicable or not found.
310
  - "confidence": A number from 1-5 indicating your confidence in this categorization (5 being highest confidence).
311
 
312
+ IMPORTANT: Ensure your JSON output is valid and does not contain any control characters or invalid escape sequences.
313
+
314
  Ensure the entire output is a single, valid JSON object like this example:
315
  {{
316
  "faculty_name": "Example Faculty Name",
 
339
  prompt,
340
  generation_config=genai.types.GenerationConfig(
341
  response_mime_type="application/json",
342
+ temperature=0.1 # Lower temperature for more consistent JSON formatting
343
  )
344
  )
345
  response_text = response.text.strip()
346
 
347
+ # Clean the response text to remove control characters and fix common JSON issues
348
+ def clean_json_text(text):
349
+ # Remove control characters
350
+ text = ''.join(ch for ch in text if ch >= ' ')
351
+
352
+ # Fix common JSON formatting issues
353
+ text = text.replace('\\"', '"') # Fix escaped quotes
354
+ text = text.replace('\\n', ' ') # Replace newlines with spaces
355
+ text = text.replace('\\t', ' ') # Replace tabs with spaces
356
+ text = text.replace('\\r', '') # Remove carriage returns
357
+
358
+ # Remove any markdown code block markers
359
+ if text.startswith("```json"):
360
+ text = text.replace("```json", "", 1)
361
+ if text.endswith("```"):
362
+ text = text.replace("```", "", 1)
363
+
364
+ return text.strip()
365
+
366
  # Try to fix common JSON formatting issues before parsing
367
  try:
368
+ cleaned_text = clean_json_text(response_text)
369
+ parsed_json = json.loads(cleaned_text)
370
+ logging.info("Successfully parsed JSON response")
371
  except json.JSONDecodeError as e:
372
+ logging.warning(f"Initial JSON parsing failed after cleaning: {e}. Attempting additional fixes.")
373
 
374
  # Try to extract JSON from markdown code blocks if present
375
+ if "```" in response_text:
376
+ parts = response_text.split("```")
377
+ for part in parts:
378
+ if part.strip().startswith("json"):
379
+ code_content = part.replace("json", "", 1).strip()
380
+ else:
381
+ code_content = part.strip()
382
+
383
+ if code_content:
384
+ try:
385
+ cleaned_code = clean_json_text(code_content)
386
+ parsed_json = json.loads(cleaned_code)
387
+ logging.info("Successfully extracted JSON from code block")
388
+ break
389
+ except json.JSONDecodeError:
390
+ continue
391
+ else:
392
+ # If we get here, none of the parts worked
393
+ raise
394
  else:
395
+ # Last resort: try to create a minimal valid JSON with just the faculty name
396
+ faculty_name = faculty_name_hint or "Unknown"
397
+
398
+ # Try to extract faculty name from text using regex
399
+ name_match = re.search(r'"faculty_name"\s*:\s*"([^"]+)"', response_text)
400
+ if name_match:
401
+ faculty_name = name_match.group(1)
402
+
403
+ logging.warning(f"Creating minimal JSON with faculty name: {faculty_name}")
404
+ parsed_json = {
405
+ "faculty_name": faculty_name,
406
+ "accomplishments": []
407
+ }
408
 
409
  extracted_faculty_name = faculty_name_hint or "Unknown"
410
  llm_faculty_name = parsed_json.get("faculty_name", "Unknown")
 
426
  return extracted_faculty_name, accomplishments_list
427
  except Exception as e:
428
  logging.error(f"Error in LLM processing: {e}")
429
+ # Try a simpler approach as fallback
430
+ try:
431
+ # Create a simpler prompt just to get the faculty name
432
+ simple_prompt = f"""
433
+ Extract the faculty name from this CV text. Return as JSON: {{"faculty_name": "Name Here"}}
434
+
435
+ CV Text (first part):
436
+ {cv_text[:5000]}
437
+ """
438
+ model = genai.GenerativeModel(MODEL_NAME)
439
+ response = model.generate_content(
440
+ simple_prompt,
441
+ generation_config=genai.types.GenerationConfig(
442
+ response_mime_type="application/json",
443
+ temperature=0.1
444
+ )
445
+ )
446
+
447
+ # Clean and parse the response
448
+ cleaned_text = ''.join(ch for ch in response.text.strip() if ch >= ' ')
449
+ simple_json = json.loads(cleaned_text)
450
+ faculty_name = simple_json.get("faculty_name", faculty_name_hint or "Unknown")
451
+
452
+ logging.info(f"Fallback extraction got faculty name: {faculty_name}")
453
+ return faculty_name, []
454
+ except Exception as fallback_error:
455
+ logging.error(f"Fallback extraction also failed: {fallback_error}")
456
+ return faculty_name_hint or "Unknown", []
457
+
458
+ def get_accomplishments_with_pydantic_ai(cv_text, faculty_name_hint=None):
459
+ """Uses Pydantic-AI to extract structured data from CV text."""
460
+ if not cv_text:
461
  return faculty_name_hint or "Unknown", []
462
 
463
+ prompt = f"""
464
+ Analyze the following CV text. First, identify the primary faculty member's name, usually found prominently at the beginning of the document or in the header/footer.
465
+ Extract the name directly from the CV content. Look for patterns like "Curriculum Vitae of [Name]", "[Name], Ph.D.", or other indicators of the primary faculty member.
466
+
467
+ IMPORTANT: Return the faculty name in proper case (e.g., "John Smith" or "Jane Doe-Smith"), NOT in all caps, even if it appears in all caps in the document.
468
+
469
+ Second, extract scholarly accomplishments based on the categories below. Follow the decision tree approach to categorize each accomplishment accurately.
470
+ BE COMPREHENSIVE: Strive to extract ALL identifiable scholarly accomplishments from the CV text that fit the defined categories. Pay attention to all sections of the CV. If an item is ambiguous but potentially relevant, lean towards including it for later review.
471
+
472
+ # DECISION TREE FOR CATEGORIZATION:
473
+
474
+ Step 1: Determine the general type of scholarly work:
475
+ - Is it a book or book contribution? → Go to Books & Book Contributions
476
+ - Is it a journal article or similar publication? → Go to Journal & Article Publications
477
+ - Is it a conference presentation or lecture? → Go to Conference & Presentations
478
+ - Is it a creative or artistic work? → Go to Creative & Artistic Works
479
+ - Is it a legal document or technical report? → Go to Legal & Technical Documents
480
+ - Is it something else scholarly? → Go to Other Scholarly Contributions
481
+
482
+ Step 2: Within each general type, determine the specific category:
483
+
484
+ ## Books & Book Contributions
485
+ - "Book, Authored": A complete book written by the faculty member as primary author
486
+ - "Book, Chapter": A chapter contributed to a book edited by someone else
487
+ - "Book, Edited": A book where the faculty member served as editor rather than author
488
+ - "Book, introduction, preface, etc.": Shorter contributions to books like forewords, introductions
489
+
490
+ ## Journal & Article Publications
491
+ - "Journal Article, peer-reviewed": Articles published in peer-reviewed academic journals
492
+ - "Journal Article, other": Articles in non-peer-reviewed journals
493
+ - "Newspaper/Magazine Article": Articles in popular press or magazines
494
+ - "Review/Commentary (including Blogging)": Book reviews, commentaries, blog posts
495
+
496
+ ## Conference & Presentations
497
+ - "Conference Presentation - published as proceedings": Presentations published in conference proceedings
498
+ - "Conference Presentation, other": Presentations at conferences without formal publication
499
+ - "Lecture (Invited)": Talks given by invitation rather than through submission process
500
+
501
+ ## Creative & Artistic Works
502
+ - "Digital Project": Digital scholarship, websites, tools, or resources created
503
+ - "Curated an Art Show": Organization and curation of artistic exhibitions
504
+ - "Direction/Choreography/Dramaturgy/Design": Creative direction of performances
505
+ - "Exhibited at Curated Art Show": Participation as an artist in exhibitions
506
+ - "Music Composition Published/Performed": Musical works composed
507
+ - "Performance (music, dance, theater)": Performance as an artist
508
+ - "Play or Screenplay Produced/Performed": Written dramatic works
509
+ - "Poem or Short Story Published": Creative writing published
510
+
511
+ ## Legal & Technical Documents
512
+ - "Legal Brief (Submitted)": Legal documents submitted to courts
513
+ - "Legal Review": Analysis of legal cases or issues
514
+ - "Technical/Policy Reports, peer-reviewed": Technical reports that underwent peer review
515
+ - "Technical/Policy Reports, other": Technical reports without peer review
516
+
517
+ ## Funding, Grants & Awards
518
+ - "Grant (External)": Research grants received from external funding agencies (e.g., NSF, NIH, foundations).
519
+ - "Grant (Internal)": Research grants or seed funding received from internal university sources.
520
+ - "Fellowship": Competitive fellowships awarded for research or scholarly work.
521
+ - "Award/Honor": Awards, honors, or distinctions received for scholarly work or contributions.
522
+
523
+ ## Other Scholarly Contributions
524
+ - "Patent": Registered intellectual property
525
+ - "Other": Scholarly contributions that don't fit other categories, such as datasets, software, or professional service.
526
+
527
+ CV Text:
528
+ ---
529
+ {cv_text[:45000]}
530
+ ---
531
+ """
532
+
533
+ logging.info(f"Sending request to Pydantic-AI for faculty: {faculty_name_hint or 'Unknown'}")
534
+
535
+ try:
536
+ # Set up environment variable for Gemini API key
537
+ os.environ['GEMINI_API_KEY'] = GOOGLE_API_KEY
538
+
539
+ # Create a Gemini model using the google-gla provider
540
+ model_name = f"google-gla:{MODEL_NAME}"
541
+
542
+ # Create an Agent with our CVData output type
543
+ agent = Agent(model_name, output_type=CVData, temperature=0.1, instrument=True)
544
+
545
+ # Run the agent with our prompt
546
+ result = agent.run_sync(prompt)
547
+
548
+ # Extract the structured data
549
+ cv_data = result.output
550
+
551
+ # Extract faculty name
552
+ extracted_faculty_name = faculty_name_hint or "Unknown"
553
+ if cv_data.faculty_name and cv_data.faculty_name != "Unknown":
554
+ extracted_faculty_name = cv_data.faculty_name
555
+
556
+ # Convert accomplishments to list format
557
+ accomplishments_list = []
558
+ for acc in cv_data.accomplishments:
559
+ accomplishments_list.append({
560
+ "category": acc.category,
561
+ "main_category": acc.main_category,
562
+ "year": acc.year,
563
+ "description": acc.description,
564
+ "doi_url": acc.doi_url,
565
+ "funding_amount": acc.funding_amount,
566
+ "confidence": acc.confidence
567
+ })
568
+
569
+ logging.info(f"Successfully parsed faculty name '{extracted_faculty_name}' and {len(accomplishments_list)} accomplishments using Pydantic-AI.")
570
+ return extracted_faculty_name, accomplishments_list
571
+
572
+ except Exception as e:
573
+ logging.error(f"Error in Pydantic-AI processing: {e}")
574
+ # Fall back to the original method if Pydantic-AI fails
575
+ logging.info("Falling back to original extraction method")
576
+ return get_accomplishments_from_llm(cv_text, faculty_name_hint)
577
+
578
  def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=2, initial_backoff=3):
579
  """Wrapper function that adds retry logic to the LLM API call."""
580
  retries = 0
 
582
 
583
  while retries <= max_retries:
584
  try:
585
+ # Try using Pydantic-AI first
586
+ try:
587
+ return get_accomplishments_with_pydantic_ai(cv_text, faculty_name_hint)
588
+ except Exception as pydantic_error:
589
+ logging.warning(f"Pydantic-AI extraction failed: {pydantic_error}. Falling back to standard extraction.")
590
+ # If Pydantic-AI fails, fall back to the original method
591
+ return get_accomplishments_from_llm(cv_text, faculty_name_hint)
592
  except json.JSONDecodeError as e:
593
  retries += 1
594
  logging.error(f"JSONDecodeError on attempt {retries}/{max_retries+1}: {e}. Response might not be valid JSON.")