Spaces:

Zwounds
/

cv-to-csv-extractor

Running

App Files Files Community

Zwounds committed on May 6

Commit

8f0676c

verified ·

1 Parent(s): 639add2

Update cv_extraction_app.py

Browse files

Files changed (1) hide show

cv_extraction_app.py +232 -15

cv_extraction_app.py CHANGED Viewed

@@ -9,6 +9,26 @@ import google.generativeai as genai
 import json
 import re
 from dotenv import load_dotenv
 # Load environment variables from .env file
 load_dotenv()
@@ -289,6 +309,8 @@ def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
         - "funding_amount": For grants or funded projects (often in "Other" category), the numeric funding amount if explicitly stated (e.g., 250000). Extract only the number, without currency symbols or commas. Use "N/A" if not applicable or not found.
         - "confidence": A number from 1-5 indicating your confidence in this categorization (5 being highest confidence).
     Ensure the entire output is a single, valid JSON object like this example:
     {{
       "faculty_name": "Example Faculty Name",
@@ -317,29 +339,72 @@ def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
             prompt,
             generation_config=genai.types.GenerationConfig(
                 response_mime_type="application/json",
-                temperature=0.2  # Lower temperature for more consistent JSON formatting
             )
         )
         response_text = response.text.strip()
         # Try to fix common JSON formatting issues before parsing
         try:
-            parsed_json = json.loads(response_text)
         except json.JSONDecodeError as e:
-            logging.warning(f"Initial JSON parsing failed: {e}. Attempting to fix common issues.")
             # Try to extract JSON from markdown code blocks if present
-            if response_text.startswith("```json") and "```" in response_text:
-                code_block_content = response_text.split("```")[1]
-                if code_block_content.startswith("json"):
-                    code_block_content = code_block_content[4:].strip()
-                try:
-                    parsed_json = json.loads(code_block_content)
-                    logging.info("Successfully extracted JSON from code block")
-                except json.JSONDecodeError:
-                    raise  # Re-raise if this also fails
             else:
-                raise  # Re-raise the original error if not in a code block
         extracted_faculty_name = faculty_name_hint or "Unknown"
         llm_faculty_name = parsed_json.get("faculty_name", "Unknown")
@@ -361,8 +426,155 @@ def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
         return extracted_faculty_name, accomplishments_list
     except Exception as e:
         logging.error(f"Error in LLM processing: {e}")
         return faculty_name_hint or "Unknown", []
 def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=2, initial_backoff=3):
     """Wrapper function that adds retry logic to the LLM API call."""
     retries = 0
@@ -370,8 +582,13 @@ def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=
     while retries <= max_retries:
         try:
-            # Call the original function that might raise exceptions
-            return get_accomplishments_from_llm(cv_text, faculty_name_hint)
         except json.JSONDecodeError as e:
             retries += 1
             logging.error(f"JSONDecodeError on attempt {retries}/{max_retries+1}: {e}. Response might not be valid JSON.")

 import json
 import re
 from dotenv import load_dotenv
+from typing import List, Optional, Union
+from pydantic import BaseModel, Field
+from pydantic_ai import Agent
+from pydantic_ai.models.gemini import GeminiModel
+# --- Pydantic Models for Data Structures ---
+class Accomplishment(BaseModel):
+    """Pydantic model for a scholarly accomplishment.
+
+    One instance describes a single item extracted from a CV. Fields declared
+    with ``...`` are required; the remaining fields fall back to the defaults
+    given in their ``Field`` declarations when they are not supplied.
+    """
+    # Required: the specific category label for this item.
+    category: str = Field(..., description="The specific type of scholarly work")
+    # Required: the broader grouping the specific category belongs to.
+    main_category: str = Field(..., description="The general category this work falls under")
+    # Accepts either a string or an integer; defaults to the "N/A" placeholder.
+    year: Union[str, int] = Field("N/A", description="The year the accomplishment occurred")
+    # Required: free-text description or citation.
+    description: str = Field(..., description="The full description or citation of the accomplishment")
+    # Defaults to the "N/A" placeholder when no DOI/URL is supplied.
+    doi_url: str = Field("N/A", description="The DOI or URL associated with the accomplishment")
+    # Accepts either a string or an integer; defaults to the "N/A" placeholder.
+    funding_amount: Union[str, int] = Field("N/A", description="For grants or funded projects, the numeric funding amount")
+    # Defaults to 3. NOTE: the 1-5 range is stated only in the description;
+    # this Field declares no ge/le bounds, so the range is not validated here.
+    confidence: int = Field(3, description="A number from 1-5 indicating confidence in this categorization")
+class CVData(BaseModel):
+    """Pydantic model for CV data including faculty name and accomplishments.
+
+    Top-level container: one faculty name plus a list of ``Accomplishment``
+    entries.
+    """
+    # Required: no default is provided for the name.
+    faculty_name: str = Field(..., description="The name of the faculty member")
+    # default_factory=list gives each instance its own fresh empty list when
+    # no accomplishments are supplied.
+    accomplishments: List[Accomplishment] = Field(default_factory=list, description="List of scholarly accomplishments")
 # Load environment variables from .env file
 load_dotenv()
         - "funding_amount": For grants or funded projects (often in "Other" category), the numeric funding amount if explicitly stated (e.g., 250000). Extract only the number, without currency symbols or commas. Use "N/A" if not applicable or not found.
         - "confidence": A number from 1-5 indicating your confidence in this categorization (5 being highest confidence).
+    IMPORTANT: Ensure your JSON output is valid and does not contain any control characters or invalid escape sequences.
     Ensure the entire output is a single, valid JSON object like this example:
     {{
       "faculty_name": "Example Faculty Name",
             prompt,
             generation_config=genai.types.GenerationConfig(
                 response_mime_type="application/json",
+                temperature=0.1  # Lower temperature for more consistent JSON formatting
             )
         )
         response_text = response.text.strip()
+        # Clean the response text to remove control characters and fix common JSON issues
+        def clean_json_text(text):
+            """Return `text` tidied up so that it is more likely to parse as JSON.
+
+            Steps, in order:
+              1. Drop every character below U+0020 (raw control characters).
+              2. Strip one leading Markdown fence ("```json" or a bare "```")
+                 and one trailing "```" fence, by position.
+              3. If the result already parses as JSON, return it untouched so
+                 that valid escape sequences inside strings are preserved.
+              4. Otherwise fall back to the lossy escape rewrites (unescape
+                 quotes, turn escaped newlines/tabs into spaces, drop escaped
+                 carriage returns) as a last-ditch repair.
+
+            Args:
+                text: Raw model output.
+
+            Returns:
+                The cleaned string. It is NOT guaranteed to be valid JSON; the
+                caller is still expected to parse it and handle failures.
+            """
+            # Remove control characters
+            text = ''.join(ch for ch in text if ch >= ' ').strip()
+            # Remove Markdown code block markers. Slice by position instead of
+            # using replace(..., 1): replace() removes the FIRST fence it finds,
+            # which left the trailing fence behind when the block was opened
+            # with a bare "```".
+            if text.startswith("```json"):
+                text = text[len("```json"):]
+            elif text.startswith("```"):
+                text = text[3:]
+            if text.endswith("```"):
+                text = text[:-3]
+            text = text.strip()
+            # Already valid JSON: return as-is. Rewriting \" / \n / \t / \r here
+            # would corrupt correctly escaped string contents.
+            try:
+                json.loads(text)
+                return text
+            except ValueError:
+                pass
+            # Last resort for over-escaped output such as {\"a\": 1}. These
+            # rewrites are lossy, so they only run when parsing failed anyway.
+            text = text.replace('\\"', '"')  # Fix escaped quotes
+            text = text.replace('\\n', ' ')  # Replace newlines with spaces
+            text = text.replace('\\t', ' ')  # Replace tabs with spaces
+            text = text.replace('\\r', '')   # Remove carriage returns
+            return text.strip()
         # Try to fix common JSON formatting issues before parsing
         try:
+            cleaned_text = clean_json_text(response_text)
+            parsed_json = json.loads(cleaned_text)
+            logging.info("Successfully parsed JSON response")
         except json.JSONDecodeError as e:
+            logging.warning(f"Initial JSON parsing failed after cleaning: {e}. Attempting additional fixes.")
             # Try to extract JSON from markdown code blocks if present
+            if "```" in response_text:
+                parts = response_text.split("```")
+                for part in parts:
+                    if part.strip().startswith("json"):
+                        code_content = part.replace("json", "", 1).strip()
+                    else:
+                        code_content = part.strip()
+                    if code_content:
+                        try:
+                            cleaned_code = clean_json_text(code_content)
+                            parsed_json = json.loads(cleaned_code)
+                            logging.info("Successfully extracted JSON from code block")
+                            break
+                        except json.JSONDecodeError:
+                            continue
+                else:
+                    # If we get here, none of the parts worked
+                    raise
             else:
+                # Last resort: try to create a minimal valid JSON with just the faculty name
+                faculty_name = faculty_name_hint or "Unknown"
+                # Try to extract faculty name from text using regex
+                name_match = re.search(r'"faculty_name"\s*:\s*"([^"]+)"', response_text)
+                if name_match:
+                    faculty_name = name_match.group(1)
+                logging.warning(f"Creating minimal JSON with faculty name: {faculty_name}")
+                parsed_json = {
+                    "faculty_name": faculty_name,
+                    "accomplishments": []
+                }
         extracted_faculty_name = faculty_name_hint or "Unknown"
         llm_faculty_name = parsed_json.get("faculty_name", "Unknown")
         return extracted_faculty_name, accomplishments_list
     except Exception as e:
         logging.error(f"Error in LLM processing: {e}")
+        # Try a simpler approach as fallback
+        try:
+            # Create a simpler prompt just to get the faculty name
+            simple_prompt = f"""
+            Extract the faculty name from this CV text. Return as JSON: {{"faculty_name": "Name Here"}}
+            CV Text (first part):
+            {cv_text[:5000]}
+            """
+            model = genai.GenerativeModel(MODEL_NAME)
+            response = model.generate_content(
+                simple_prompt,
+                generation_config=genai.types.GenerationConfig(
+                    response_mime_type="application/json",
+                    temperature=0.1
+                )
+            )
+            # Clean and parse the response
+            cleaned_text = ''.join(ch for ch in response.text.strip() if ch >= ' ')
+            simple_json = json.loads(cleaned_text)
+            faculty_name = simple_json.get("faculty_name", faculty_name_hint or "Unknown")
+            logging.info(f"Fallback extraction got faculty name: {faculty_name}")
+            return faculty_name, []
+        except Exception as fallback_error:
+            logging.error(f"Fallback extraction also failed: {fallback_error}")
+            return faculty_name_hint or "Unknown", []
+def get_accomplishments_with_pydantic_ai(cv_text, faculty_name_hint=None):
+    """Uses Pydantic-AI to extract structured data from CV text.
+
+    Builds a categorization prompt around (at most) the first 45,000
+    characters of `cv_text`, runs it through a Pydantic-AI ``Agent`` whose
+    output type is ``CVData``, and flattens the result into plain dicts.
+
+    Args:
+        cv_text: Text of the CV. If empty/None, no request is made.
+        faculty_name_hint: Optional name used when the model returns no name
+            (or returns "Unknown").
+
+    Returns:
+        A ``(faculty_name, accomplishments)`` tuple, where ``accomplishments``
+        is a list of dicts with the keys ``category``, ``main_category``,
+        ``year``, ``description``, ``doi_url``, ``funding_amount`` and
+        ``confidence``. If anything in the Pydantic-AI path raises, the result
+        of ``get_accomplishments_from_llm(cv_text, faculty_name_hint)`` is
+        returned instead.
+    """
+    if not cv_text:
         return faculty_name_hint or "Unknown", []
+    prompt = f"""
+    Analyze the following CV text. First, identify the primary faculty member's name, usually found prominently at the beginning of the document or in the header/footer.
+    Extract the name directly from the CV content. Look for patterns like "Curriculum Vitae of [Name]", "[Name], Ph.D.", or other indicators of the primary faculty member.
+    IMPORTANT: Return the faculty name in proper case (e.g., "John Smith" or "Jane Doe-Smith"), NOT in all caps, even if it appears in all caps in the document.
+    Second, extract scholarly accomplishments based on the categories below. Follow the decision tree approach to categorize each accomplishment accurately.
+    BE COMPREHENSIVE: Strive to extract ALL identifiable scholarly accomplishments from the CV text that fit the defined categories. Pay attention to all sections of the CV. If an item is ambiguous but potentially relevant, lean towards including it for later review.
+    # DECISION TREE FOR CATEGORIZATION:
+    Step 1: Determine the general type of scholarly work:
+    - Is it a book or book contribution? → Go to Books & Book Contributions
+    - Is it a journal article or similar publication? → Go to Journal & Article Publications
+    - Is it a conference presentation or lecture? → Go to Conference & Presentations
+    - Is it a creative or artistic work? → Go to Creative & Artistic Works
+    - Is it a legal document or technical report? → Go to Legal & Technical Documents
+    - Is it something else scholarly? → Go to Other Scholarly Contributions
+    Step 2: Within each general type, determine the specific category:
+    ## Books & Book Contributions
+    - "Book, Authored": A complete book written by the faculty member as primary author
+    - "Book, Chapter": A chapter contributed to a book edited by someone else
+    - "Book, Edited": A book where the faculty member served as editor rather than author
+    - "Book, introduction, preface, etc.": Shorter contributions to books like forewords, introductions
+    ## Journal & Article Publications
+    - "Journal Article, peer-reviewed": Articles published in peer-reviewed academic journals
+    - "Journal Article, other": Articles in non-peer-reviewed journals
+    - "Newspaper/Magazine Article": Articles in popular press or magazines
+    - "Review/Commentary (including Blogging)": Book reviews, commentaries, blog posts
+    ## Conference & Presentations
+    - "Conference Presentation - published as proceedings": Presentations published in conference proceedings
+    - "Conference Presentation, other": Presentations at conferences without formal publication
+    - "Lecture (Invited)": Talks given by invitation rather than through submission process
+    ## Creative & Artistic Works
+    - "Digital Project": Digital scholarship, websites, tools, or resources created
+    - "Curated an Art Show": Organization and curation of artistic exhibitions
+    - "Direction/Choreography/Dramaturgy/Design": Creative direction of performances
+    - "Exhibited at Curated Art Show": Participation as an artist in exhibitions
+    - "Music Composition Published/Performed": Musical works composed
+    - "Performance (music, dance, theater)": Performance as an artist
+    - "Play or Screenplay Produced/Performed": Written dramatic works
+    - "Poem or Short Story Published": Creative writing published
+    ## Legal & Technical Documents
+    - "Legal Brief (Submitted)": Legal documents submitted to courts
+    - "Legal Review": Analysis of legal cases or issues
+    - "Technical/Policy Reports, peer-reviewed": Technical reports that underwent peer review
+    - "Technical/Policy Reports, other": Technical reports without peer review
+    ## Funding, Grants & Awards
+    - "Grant (External)": Research grants received from external funding agencies (e.g., NSF, NIH, foundations).
+    - "Grant (Internal)": Research grants or seed funding received from internal university sources.
+    - "Fellowship": Competitive fellowships awarded for research or scholarly work.
+    - "Award/Honor": Awards, honors, or distinctions received for scholarly work or contributions.
+    ## Other Scholarly Contributions
+    - "Patent": Registered intellectual property
+    - "Other": Scholarly contributions that don't fit other categories, such as datasets, software, or professional service.
+    CV Text:
+    ---
+    {cv_text[:45000]}
+    ---
+    """
+    logging.info(f"Sending request to Pydantic-AI for faculty: {faculty_name_hint or 'Unknown'}")
+    try:
+        # Set up environment variable for Gemini API key. Guarded because
+        # assigning None into os.environ raises TypeError; when the key is not
+        # configured we leave whatever GEMINI_API_KEY the environment already has.
+        if GOOGLE_API_KEY:
+            os.environ['GEMINI_API_KEY'] = GOOGLE_API_KEY
+        # Select the model through the google-gla provider prefix
+        model_name = f"google-gla:{MODEL_NAME}"
+        # Create an Agent with our CVData output type. Sampling parameters are
+        # not Agent keyword arguments: passing temperature=... directly is
+        # rejected by the constructor, which made this path always fall back.
+        # They belong in model_settings.
+        agent = Agent(
+            model_name,
+            output_type=CVData,
+            model_settings={"temperature": 0.1},
+            instrument=True,
+        )
+        # Run the agent with our prompt
+        result = agent.run_sync(prompt)
+        # Extract the structured data
+        cv_data = result.output
+        # Prefer the model-provided name unless it is empty or the "Unknown" placeholder
+        extracted_faculty_name = faculty_name_hint or "Unknown"
+        if cv_data.faculty_name and cv_data.faculty_name != "Unknown":
+            extracted_faculty_name = cv_data.faculty_name
+        # Convert accomplishments to plain dicts with an explicit, fixed key set
+        accomplishments_list = [
+            {
+                "category": acc.category,
+                "main_category": acc.main_category,
+                "year": acc.year,
+                "description": acc.description,
+                "doi_url": acc.doi_url,
+                "funding_amount": acc.funding_amount,
+                "confidence": acc.confidence,
+            }
+            for acc in cv_data.accomplishments
+        ]
+        logging.info(f"Successfully parsed faculty name '{extracted_faculty_name}' and {len(accomplishments_list)} accomplishments using Pydantic-AI.")
+        return extracted_faculty_name, accomplishments_list
+    except Exception as e:
+        logging.error(f"Error in Pydantic-AI processing: {e}")
+        # Fall back to the original method if Pydantic-AI fails
+        logging.info("Falling back to original extraction method")
+        return get_accomplishments_from_llm(cv_text, faculty_name_hint)
 def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=2, initial_backoff=3):
     """Wrapper function that adds retry logic to the LLM API call."""
     retries = 0
     while retries <= max_retries:
         try:
+            # Try using Pydantic-AI first
+            try:
+                return get_accomplishments_with_pydantic_ai(cv_text, faculty_name_hint)
+            except Exception as pydantic_error:
+                logging.warning(f"Pydantic-AI extraction failed: {pydantic_error}. Falling back to standard extraction.")
+                # If Pydantic-AI fails, fall back to the original method
+                return get_accomplishments_from_llm(cv_text, faculty_name_hint)
         except json.JSONDecodeError as e:
             retries += 1
             logging.error(f"JSONDecodeError on attempt {retries}/{max_retries+1}: {e}. Response might not be valid JSON.")