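"""Gradio app that extracts scholarly accomplishments from faculty CV PDFs.

The app pulls text out of uploaded PDFs with PyPDF2, asks Google's Gemini API
(directly or via Pydantic-AI) to identify the faculty member and categorize
their accomplishments, and returns the results as a DataFrame plus a
downloadable CSV.
"""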
import gradio as gr
import os
import tempfile
import pandas as pd
import logging
import time
from PyPDF2 import PdfReader
import google.generativeai as genai
import json
import re
from dotenv import load_dotenv
from typing import List, Optional, Union
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.models.gemini import GeminiModel

# --- Pydantic Models for Data Structures ---
class Accomplishment(BaseModel):
    """Pydantic model for a scholarly accomplishment."""
    category: str = Field(..., description="The specific type of scholarly work")
    main_category: str = Field(..., description="The general category this work falls under")
    year: Union[str, int] = Field("N/A", description="The year the accomplishment occurred")
    description: str = Field(..., description="The full description or citation of the accomplishment")
    doi_url: str = Field("N/A", description="The DOI or URL associated with the accomplishment")
    funding_amount: Union[str, int] = Field("N/A", description="For grants or funded projects, the numeric funding amount")
    confidence: int = Field(3, description="A number from 1-5 indicating confidence in this categorization")

class CVData(BaseModel):
    """Pydantic model for CV data including faculty name and accomplishments."""
    faculty_name: str = Field(..., description="The name of the faculty member")
    accomplishments: List[Accomplishment] = Field(default_factory=list, description="List of scholarly accomplishments")

# Load environment variables from .env file
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Configuration ---
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
MODEL_NAME = 'gemini-2.5-flash-preview-04-17'  # Using the original model as specified
APP_PASSWORD = os.environ.get('APP_PASSWORD')  # Password for app authentication
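# Example .env entries (placeholder values; supply your own):
#   GOOGLE_API_KEY=your-gemini-api-key
#   APP_PASSWORD=your-app-password   # optional; if unset, the login gate is bypassed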

# Main categories (first tier)
MAIN_CATEGORIES = [
    "Books & Book Contributions",
    "Journal & Article Publications",
    "Conference & Presentations",
    "Creative & Artistic Works",
    "Legal & Technical Documents",
    "Funding, Grants & Awards",  # New Main Category
    "Other Scholarly Contributions"
]

# Specific types (second tier) - these will be the actual categories in the CSV
SCHOLARLY_WORK_TYPES = [
    # Books & Book Contributions
    "Book, Authored",
    "Book, Chapter",
    "Book, Edited",
    "Book, introduction, preface, etc.",
    # Journal & Article Publications
    "Journal Article, peer-reviewed",
    "Journal Article, other",
    "Newspaper/Magazine Article",
    "Review/Commentary (including Blogging)",
    # Conference & Presentations
    "Conference Presentation - published as proceedings",
    "Conference Presentation, other",
    "Lecture (Invited)",
    # Creative & Artistic Works
    "Digital Project",
    "Curated an Art Show",
    "Direction/Choreography/Dramaturgy/Design",
    "Exhibited at Curated Art Show",
    "Music Composition Published/Performed",
    "Performance (music, dance, theater)",
    "Play or Screenplay Produced/Performed",
    "Poem or Short Story Published",
    # Legal & Technical Documents
    "Legal Brief (Submitted)",
    "Legal Review",
    "Technical/Policy Reports, peer-reviewed",
    "Technical/Policy Reports, other",
    # Funding, Grants & Awards
    "Grant (External)",
    "Grant (Internal)",
    "Fellowship",
    "Award/Honor",
    # Other Scholarly Contributions
    "Patent",
    "Other"
]

# --- Helper Functions ---

def clean_text(text):
    """Cleans text by replacing common ligatures and smart quotes."""
    replacements = {
        # Ligatures
        "\ufb00": "ff", "\ufb01": "fi", "\ufb02": "fl", "\ufb03": "ffi", "\ufb04": "ffl",
        # Smart quotes and apostrophes
        "\u201c": "\"", "\u201d": "\"", "\u2018": "'", "\u2019": "'",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

def clean_cv_specific_text(text):
    """Apply CV-specific cleaning rules to improve text quality."""
    # Remove standalone page numbers (common in CVs)
    text = re.sub(r'\n\s*\d+\s*(\n|$)', '\n', text)
    text = re.sub(r'^\s*\d+\s*\n', '', text)  # Page number at the very beginning

    # Fix common CV formatting issues like names split across lines
    text = re.sub(r'([a-zA-Z])\s*\n\s*([a-zA-Z])', r'\1 \2', text)  # General case for text split over newlines
    text = re.sub(r'([A-Z][a-z]+(?:-[A-Z][a-z]+)?)\s*\n\s*([A-Z][a-z]+)', r'\1 \2', text)  # More specific for names

    # Normalize citation formats - e.g., year punctuation
    text = re.sub(r'(\d{4})\s*\.\s*', r'\1. ', text)
    # Remove excessive newlines
    text = re.sub(r'\n\s*\n', '\n', text)
    return text

def extract_text_from_pdf(pdf_file):
    """Extracts text from a given PDF file."""
    logging.info(f"Extracting text from: {pdf_file.name}")
    try:
        reader = PdfReader(pdf_file.name)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
        cleaned_text = clean_text(text)
        cleaned_text = clean_cv_specific_text(cleaned_text)  # Apply CV specific cleaning
        logging.info(f"Successfully extracted and cleaned text from {pdf_file.name} (Length: {len(cleaned_text)})")
        return cleaned_text
    except Exception as e:
        logging.error(f"Error reading PDF {pdf_file.name}: {e}")
        return None

def extract_pdf_metadata(pdf_file):
    """Extract metadata from PDF that might help with faculty identification."""
    try:
        reader = PdfReader(pdf_file.name)
        metadata = reader.metadata
        author = metadata.get('/Author', '')
        title = metadata.get('/Title', '')
        # PyPDF2 might return Author as a list
        if isinstance(author, list):
            author = ", ".join(author) if author else ''
        if isinstance(title, list):
            title = ", ".join(title) if title else ''

        return {
            'author': str(author) if author else '',
            'title': str(title) if title else '',
            'filename': os.path.basename(pdf_file.name)
        }
    except Exception as e:
        logging.error(f"Error extracting metadata from {pdf_file.name}: {e}")
        return {'filename': os.path.basename(pdf_file.name), 'author': '', 'title': ''}

def get_faculty_name_from_llm(cv_text_chunk):
    """Sends a small chunk of CV text to LLM to extract only the faculty name."""
    if not cv_text_chunk:
        return "Unknown", None
    prompt = f"""
    Analyze the following CV text chunk. Identify the primary faculty member's name, usually found prominently at the beginning of the document.
    Return the result as a single JSON object with a top-level key "faculty_name" and the extracted faculty name as a string.
    If the name cannot be reliably determined, use "Unknown".

    Example: {{ "faculty_name": "Dr. Jane Doe" }}

    CV Text Chunk:
    ---
    {cv_text_chunk}
    ---
    JSON Output:
    """
    try:
        model = genai.GenerativeModel(MODEL_NAME)
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(response_mime_type="application/json")
        )
        parsed_json = json.loads(response.text)
        faculty_name = parsed_json.get("faculty_name", "Unknown")
        if not isinstance(faculty_name, str) or not faculty_name.strip():
            faculty_name = "Unknown"
        return faculty_name, None  # No accomplishments from this call
    except Exception as e:
        logging.error(f"Error extracting faculty name with LLM: {e}")
        return "Unknown", None

def get_accomplishments_from_llm(cv_text, faculty_name_hint=None):
    """Sends CV text to Google Gemini API and returns faculty name and structured accomplishments."""
    if not cv_text:
        return faculty_name_hint or "Unknown", []

    prompt = f"""
    Analyze the following CV text. First, identify the primary faculty member's name, usually found prominently at the beginning of the document or in the header/footer.
    Extract the name directly from the CV content. Look for patterns like "Curriculum Vitae of [Name]", "[Name], Ph.D.", or other indicators of the primary faculty member.
    
    IMPORTANT: Return the faculty name in proper case (e.g., "John Smith" or "Jane Doe-Smith"), NOT in all caps, even if it appears in all caps in the document.

    Second, extract scholarly accomplishments based on the categories below. Follow the decision tree approach to categorize each accomplishment accurately.
    BE COMPREHENSIVE: Strive to extract ALL identifiable scholarly accomplishments from the CV text that fit the defined categories. Pay attention to all sections of the CV. If an item is ambiguous but potentially relevant, lean towards including it for later review.

    # DECISION TREE FOR CATEGORIZATION:

    Step 1: Determine the general type of scholarly work:
    - Is it a book or book contribution? → Go to Books & Book Contributions
    - Is it a journal article or similar publication? → Go to Journal & Article Publications
    - Is it a conference presentation or lecture? → Go to Conference & Presentations
    - Is it a creative or artistic work? → Go to Creative & Artistic Works
    - Is it a legal document or technical report? → Go to Legal & Technical Documents
    - Is it something else scholarly? → Go to Other Scholarly Contributions

    Step 2: Within each general type, determine the specific category:

    ## Books & Book Contributions
    - "Book, Authored": A complete book written by the faculty member as primary author
      Example: "Smith, J. (2020). The Evolution of Digital Learning. Routledge."
    - "Book, Chapter": A chapter contributed to a book edited by someone else
      Example: "Smith, J. (2020). Digital pedagogy frameworks. In A. Johnson (Ed.), Handbook of Educational Technology (pp. 45-67). Routledge."
    - "Book, Edited": A book where the faculty member served as editor rather than author
      Example: "Smith, J. (Ed.). (2020). Perspectives on Digital Learning. Routledge."
    - "Book, introduction, preface, etc.": Shorter contributions to books like forewords, introductions
      Example: "Smith, J. (2020). Foreword. In A. Johnson, Digital Learning Environments (pp. ix-xi). Routledge."

    ## Journal & Article Publications
    - "Journal Article, peer-reviewed": Articles published in peer-reviewed academic journals
      Example: "Smith, J. (2020). Digital literacy in higher education. Journal of Educational Technology, 45(2), 123-145. https://doi.org/10.xxxx/yyyy"
      Look for: journal name, volume/issue numbers, DOI, mentions of peer review
    - "Journal Article, other": Articles in non-peer-reviewed journals
      Example: "Smith, J. (2020). Teaching in digital environments. Educational Practice, 15, 78-92."
    - "Newspaper/Magazine Article": Articles in popular press or magazines
      Example: "Smith, J. (2020, March 15). How technology is changing education. The Education Times, pp. 23-24."
    - "Review/Commentary (including Blogging)": Book reviews, commentaries, blog posts
      Example: "Smith, J. (2020). [Review of the book Digital Pedagogy, by A. Johnson]. Educational Review, 12(3), 45-47."

    ## Conference & Presentations
    - "Conference Presentation - published as proceedings": Presentations published in conference proceedings
      Example: "Smith, J. (2020). Virtual reality in education. Proceedings of the International Conference on Educational Technology, 234-241. IEEE."
      Look for: "Proceedings of", publisher information, page numbers
    - "Conference Presentation, other": Presentations at conferences without formal publication
      Example: "Smith, J. (2020, June). Virtual reality applications. Paper presented at the Educational Technology Conference, Boston, MA."
    - "Lecture (Invited)": Talks given by invitation rather than through submission process
      Example: "Smith, J. (2020, April). The future of digital learning. Invited lecture at Harvard University, Cambridge, MA."
      Look for: "invited", "keynote", "guest lecture"

    ## Creative & Artistic Works
    - "Digital Project": Digital scholarship, websites, tools, or resources created
      Example: "Smith, J. (2018-2020). Digital Learning Archive [Web application]. https://digitallearningarchive.org"
    - "Curated an Art Show": Organization and curation of artistic exhibitions
      Example: "Smith, J. (Curator). (2020). Digital Art in Education [Exhibition]. University Gallery, Boston, MA."
    - "Direction/Choreography/Dramaturgy/Design": Creative direction of performances
      Example: "Smith, J. (Director). (2020). The Digital Divide [Theater production]. University Theater, Boston, MA."
    - "Exhibited at Curated Art Show": Participation as an artist in exhibitions
      Example: "Smith, J. (2020). Learning Through Screens [Digital art]. In Digital Expressions, University Gallery, Boston, MA."
    - "Music Composition Published/Performed": Musical works composed
      Example: "Smith, J. (Composer). (2020). Digital Sonata [Musical composition]. Performed by Boston Symphony, Symphony Hall, Boston, MA."
    - "Performance (music, dance, theater)": Performance as an artist
      Example: "Smith, J. (Performer). (2020). The Digital Age [Dance performance]. Kennedy Center, Washington, DC."
    - "Play or Screenplay Produced/Performed": Written dramatic works
      Example: "Smith, J. (Playwright). (2020). Virtual Connections [Play]. Produced at University Theater, Boston, MA."
    - "Poem or Short Story Published": Creative writing published
      Example: "Smith, J. (2020). Digital dreams [Poem]. Literary Journal, 23(2), 45-46."

    ## Legal & Technical Documents
    - "Legal Brief (Submitted)": Legal documents submitted to courts
      Example: "Smith, J. (2020). Amicus brief in Digital Rights Foundation v. State Board of Education. Supreme Court of Massachusetts."
    - "Legal Review": Analysis of legal cases or issues
      Example: "Smith, J. (2020). Digital privacy in educational settings: A legal analysis. Harvard Law Review, 133(4), 1023-1056."
    - "Technical/Policy Reports, peer-reviewed": Technical reports that underwent peer review
      Example: "Smith, J. (2020). Digital learning standards (Technical Report No. 2020-05). Educational Technology Consortium. [Peer-reviewed]"
    - "Technical/Policy Reports, other": Technical reports without peer review
      Example: "Smith, J. (2020). Implementing digital tools in K-12 (White Paper). Center for Digital Education."

    ## Funding, Grants & Awards
    - "Grant (External)": Research grants received from external funding agencies (e.g., NSF, NIH, foundations).
      Example: "Smith, J. (PI). (2021-2024). Project Title. National Science Foundation (#1234567). $500,000."
      For this category, extract the numeric funding amount into the "funding_amount" field (e.g., 500000).
    - "Grant (Internal)": Research grants or seed funding received from internal university sources.
      Example: "Smith, J. (PI). (2020). Pilot study on X. University Research Grant. $10,000."
      For this category, extract the numeric funding amount into the "funding_amount" field (e.g., 10000).
    - "Fellowship": Competitive fellowships awarded for research or scholarly work. May or may not have an explicit monetary value listed.
      Example: "Smith, J. (2019-2020). Doctoral Dissertation Fellowship. Mellon Foundation. $30,000 stipend."
      If a monetary value is stated, extract it into "funding_amount". Otherwise, use "N/A".
    - "Award/Honor": Awards, honors, or distinctions received for scholarly work or contributions. Typically no funding amount.
      Example: "Smith, J. (2022). Best Paper Award, International Conference on Educational Technology."
      "funding_amount" should usually be "N/A" for this category unless explicitly stated as a monetary prize.

    ## Other Scholarly Contributions
    - "Patent": Registered intellectual property
      Example: "Smith, J. (2020). Digital learning assessment system (U.S. Patent No. 10,123,456). U.S. Patent and Trademark Office."
    - "Other": Scholarly contributions that don't fit other categories, such as datasets, software, or professional service.
      Example: "Smith, J. (2020). Dataset: Survey of digital learning practices [Data set]. Harvard Dataverse. https://doi.org/10.xxxx/yyyy"


    Return the result as a single JSON object containing:
    1. A top-level key "faculty_name" with the extracted faculty name as a string. If the name cannot be reliably determined from this text and no hint was provided, use "Unknown". If a hint was provided, prefer the hint if no clear name is in the text.
    2. A top-level key "accomplishments" containing a list of JSON objects, where each object represents one accomplishment with the following details:
        - "category": The specific type of scholarly work from the list above (e.g., "Book, Authored", "Journal Article, peer-reviewed", etc.)
        - "main_category": The general category this work falls under (e.g., "Books & Book Contributions", "Journal & Article Publications", etc.)
        - "year": The year the accomplishment occurred (as an integer or string). If multiple years or a range, use the start year or the most prominent year. If no year is found, use "N/A".
        - "description": The full description or citation of the accomplishment.
        - "doi_url": The DOI or URL associated with the accomplishment, if present. Use "N/A" if not found.
        - "funding_amount": For grants or funded projects (often in "Other" category), the numeric funding amount if explicitly stated (e.g., 250000). Extract only the number, without currency symbols or commas. Use "N/A" if not applicable or not found.
        - "confidence": A number from 1-5 indicating your confidence in this categorization (5 being highest confidence).

    IMPORTANT: Ensure your JSON output is valid and does not contain any control characters or invalid escape sequences.
    
    Ensure the entire output is a single, valid JSON object like this example:
    {{
      "faculty_name": "Example Faculty Name",
      "accomplishments": [
        {{ "category": "Journal Article, peer-reviewed", "main_category": "Journal & Article Publications", "year": "2023", "description": "...", "doi_url": "...", "funding_amount": "N/A", "confidence": 5 }},
        {{ "category": "Book, Chapter", "main_category": "Books & Book Contributions", "year": "2022", "description": "...", "doi_url": "N/A", "funding_amount": "N/A", "confidence": 4 }}
      ]
    }}
    Do not include any text before or after the JSON object.

    CV Text:
    ---
    {cv_text[:45000]} 
    ---

    JSON Output:
    """
    # Very long CVs (plus the prompt and the JSON output) can exceed practical token limits,
    # so the CV text is truncated above; a more robust solution might chunk the text.

    logging.info(f"Sending request to Gemini API for faculty: {faculty_name_hint or 'Unknown'}")
    
    try:
        model = genai.GenerativeModel(MODEL_NAME)
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                response_mime_type="application/json",
                temperature=0.1  # Lower temperature for more consistent JSON formatting
            )
        )
        response_text = response.text.strip()
        
        # Clean the response text to remove control characters and common formatting issues
        def clean_json_text(text):
            # Drop control characters (anything below U+0020, including raw newlines/tabs)
            text = ''.join(ch for ch in text if ch >= ' ')

            # Replace escaped whitespace sequences with plain spaces.
            # Escaped quotes (\") are valid JSON and are left untouched.
            text = text.replace('\\n', ' ')  # Escaped newlines
            text = text.replace('\\t', ' ')  # Escaped tabs
            text = text.replace('\\r', '')   # Escaped carriage returns

            # Remove any markdown code block markers
            if text.startswith("```json"):
                text = text.replace("```json", "", 1)
            if text.endswith("```"):
                text = text[:-3]  # Strip the trailing fence, not the first occurrence

            return text.strip()
        
        # Try to fix common JSON formatting issues before parsing
        try:
            cleaned_text = clean_json_text(response_text)
            parsed_json = json.loads(cleaned_text)
            logging.info("Successfully parsed JSON response")
        except json.JSONDecodeError as e:
            logging.warning(f"Initial JSON parsing failed after cleaning: {e}. Attempting additional fixes.")
            
            # Try to extract JSON from markdown code blocks if present
            if "```" in response_text:
                parts = response_text.split("```")
                for part in parts:
                    if part.strip().startswith("json"):
                        code_content = part.replace("json", "", 1).strip()
                    else:
                        code_content = part.strip()
                    
                    if code_content:
                        try:
                            cleaned_code = clean_json_text(code_content)
                            parsed_json = json.loads(cleaned_code)
                            logging.info("Successfully extracted JSON from code block")
                            break
                        except json.JSONDecodeError:
                            continue
                else:
                    # If we get here, none of the parts worked
                    raise
            else:
                # Last resort: try to create a minimal valid JSON with just the faculty name
                faculty_name = faculty_name_hint or "Unknown"
                
                # Try to extract faculty name from text using regex
                name_match = re.search(r'"faculty_name"\s*:\s*"([^"]+)"', response_text)
                if name_match:
                    faculty_name = name_match.group(1)
                
                logging.warning(f"Creating minimal JSON with faculty name: {faculty_name}")
                parsed_json = {
                    "faculty_name": faculty_name,
                    "accomplishments": []
                }

        extracted_faculty_name = faculty_name_hint or "Unknown"
        llm_faculty_name = parsed_json.get("faculty_name", "Unknown")
        if not isinstance(llm_faculty_name, str) or not llm_faculty_name.strip():
            llm_faculty_name = "Unknown"

        if faculty_name_hint and faculty_name_hint != "Unknown":
            extracted_faculty_name = faculty_name_hint
        elif llm_faculty_name != "Unknown":
            extracted_faculty_name = llm_faculty_name

        accomplishments_list = []
        if "accomplishments" in parsed_json and isinstance(parsed_json["accomplishments"], list):
            accomplishments_list = parsed_json["accomplishments"]
            logging.info(f"Successfully parsed faculty name '{extracted_faculty_name}' and {len(accomplishments_list)} accomplishments.")
        else:
            logging.warning("LLM response JSON does not contain a valid 'accomplishments' list.")
                
        return extracted_faculty_name, accomplishments_list
    except Exception as e:
        logging.error(f"Error in LLM processing: {e}")
        # Try a simpler approach as fallback
        try:
            # Create a simpler prompt just to get the faculty name
            simple_prompt = f"""
            Extract the faculty name from this CV text. Return as JSON: {{"faculty_name": "Name Here"}}
            
            CV Text (first part):
            {cv_text[:5000]}
            """
            model = genai.GenerativeModel(MODEL_NAME)
            response = model.generate_content(
                simple_prompt,
                generation_config=genai.types.GenerationConfig(
                    response_mime_type="application/json",
                    temperature=0.1
                )
            )
            
            # Clean and parse the response
            cleaned_text = ''.join(ch for ch in response.text.strip() if ch >= ' ')
            simple_json = json.loads(cleaned_text)
            faculty_name = simple_json.get("faculty_name", faculty_name_hint or "Unknown")
            
            logging.info(f"Fallback extraction got faculty name: {faculty_name}")
            return faculty_name, []
        except Exception as fallback_error:
            logging.error(f"Fallback extraction also failed: {fallback_error}")
            return faculty_name_hint or "Unknown", []

def get_accomplishments_with_pydantic_ai(cv_text, faculty_name_hint=None):
    """Uses Pydantic-AI to extract structured data from CV text."""
    if not cv_text:
        return faculty_name_hint or "Unknown", []

    prompt = f"""
    Analyze the following CV text. First, identify the primary faculty member's name, usually found prominently at the beginning of the document or in the header/footer.
    Extract the name directly from the CV content. Look for patterns like "Curriculum Vitae of [Name]", "[Name], Ph.D.", or other indicators of the primary faculty member.
    
    IMPORTANT: Return the faculty name in proper case (e.g., "John Smith" or "Jane Doe-Smith"), NOT in all caps, even if it appears in all caps in the document.

    Second, extract scholarly accomplishments based on the categories below. Follow the decision tree approach to categorize each accomplishment accurately.
    BE COMPREHENSIVE: Strive to extract ALL identifiable scholarly accomplishments from the CV text that fit the defined categories. Pay attention to all sections of the CV. If an item is ambiguous but potentially relevant, lean towards including it for later review.

    # DECISION TREE FOR CATEGORIZATION:

    Step 1: Determine the general type of scholarly work:
    - Is it a book or book contribution? → Go to Books & Book Contributions
    - Is it a journal article or similar publication? → Go to Journal & Article Publications
    - Is it a conference presentation or lecture? → Go to Conference & Presentations
    - Is it a creative or artistic work? → Go to Creative & Artistic Works
    - Is it a legal document or technical report? → Go to Legal & Technical Documents
    - Is it something else scholarly? → Go to Other Scholarly Contributions

    Step 2: Within each general type, determine the specific category:

    ## Books & Book Contributions
    - "Book, Authored": A complete book written by the faculty member as primary author
    - "Book, Chapter": A chapter contributed to a book edited by someone else
    - "Book, Edited": A book where the faculty member served as editor rather than author
    - "Book, introduction, preface, etc.": Shorter contributions to books like forewords, introductions

    ## Journal & Article Publications
    - "Journal Article, peer-reviewed": Articles published in peer-reviewed academic journals
    - "Journal Article, other": Articles in non-peer-reviewed journals
    - "Newspaper/Magazine Article": Articles in popular press or magazines
    - "Review/Commentary (including Blogging)": Book reviews, commentaries, blog posts

    ## Conference & Presentations
    - "Conference Presentation - published as proceedings": Presentations published in conference proceedings
    - "Conference Presentation, other": Presentations at conferences without formal publication
    - "Lecture (Invited)": Talks given by invitation rather than through submission process

    ## Creative & Artistic Works
    - "Digital Project": Digital scholarship, websites, tools, or resources created
    - "Curated an Art Show": Organization and curation of artistic exhibitions
    - "Direction/Choreography/Dramaturgy/Design": Creative direction of performances
    - "Exhibited at Curated Art Show": Participation as an artist in exhibitions
    - "Music Composition Published/Performed": Musical works composed
    - "Performance (music, dance, theater)": Performance as an artist
    - "Play or Screenplay Produced/Performed": Written dramatic works
    - "Poem or Short Story Published": Creative writing published

    ## Legal & Technical Documents
    - "Legal Brief (Submitted)": Legal documents submitted to courts
    - "Legal Review": Analysis of legal cases or issues
    - "Technical/Policy Reports, peer-reviewed": Technical reports that underwent peer review
    - "Technical/Policy Reports, other": Technical reports without peer review

    ## Funding, Grants & Awards
    - "Grant (External)": Research grants received from external funding agencies (e.g., NSF, NIH, foundations).
    - "Grant (Internal)": Research grants or seed funding received from internal university sources.
    - "Fellowship": Competitive fellowships awarded for research or scholarly work.
    - "Award/Honor": Awards, honors, or distinctions received for scholarly work or contributions.

    ## Other Scholarly Contributions
    - "Patent": Registered intellectual property
    - "Other": Scholarly contributions that don't fit other categories, such as datasets, software, or professional service.

    CV Text:
    ---
    {cv_text[:45000]} 
    ---
    """

    logging.info(f"Sending request to Pydantic-AI for faculty: {faculty_name_hint or 'Unknown'}")
    
    try:
        # Set up environment variable for Gemini API key
        os.environ['GEMINI_API_KEY'] = GOOGLE_API_KEY
        
        # Create a Gemini model using the google-gla provider
        model_name = f"google-gla:{MODEL_NAME}"
        
        # Create an Agent with our CVData output type; temperature is passed via model_settings
        agent = Agent(model_name, output_type=CVData, model_settings={"temperature": 0.1}, instrument=True)
        
        # Run the agent with our prompt
        result = agent.run_sync(prompt)
        
        # Extract the structured data
        cv_data = result.output
        
        # Extract faculty name
        extracted_faculty_name = faculty_name_hint or "Unknown"
        if cv_data.faculty_name and cv_data.faculty_name != "Unknown":
            extracted_faculty_name = cv_data.faculty_name
            
        # Convert accomplishments to list format
        accomplishments_list = []
        for acc in cv_data.accomplishments:
            accomplishments_list.append({
                "category": acc.category,
                "main_category": acc.main_category,
                "year": acc.year,
                "description": acc.description,
                "doi_url": acc.doi_url,
                "funding_amount": acc.funding_amount,
                "confidence": acc.confidence
            })
        
        logging.info(f"Successfully parsed faculty name '{extracted_faculty_name}' and {len(accomplishments_list)} accomplishments using Pydantic-AI.")
        return extracted_faculty_name, accomplishments_list
        
    except Exception as e:
        logging.error(f"Error in Pydantic-AI processing: {e}")
        # Fall back to the original method if Pydantic-AI fails
        logging.info("Falling back to original extraction method")
        return get_accomplishments_from_llm(cv_text, faculty_name_hint)

def get_accomplishments_with_retry(cv_text, faculty_name_hint=None, max_retries=2, initial_backoff=3):
    """Wrapper function that adds retry logic to the LLM API call."""
    retries = 0
    backoff_time = initial_backoff
    
    while retries <= max_retries:
        try:
            # Try using Pydantic-AI first
            try:
                return get_accomplishments_with_pydantic_ai(cv_text, faculty_name_hint)
            except Exception as pydantic_error:
                logging.warning(f"Pydantic-AI extraction failed: {pydantic_error}. Falling back to standard extraction.")
                # If Pydantic-AI fails, fall back to the original method
                return get_accomplishments_from_llm(cv_text, faculty_name_hint)
        except json.JSONDecodeError as e:
            retries += 1
            logging.error(f"JSONDecodeError on attempt {retries}/{max_retries+1}: {e}. Response might not be valid JSON.")
            if retries > max_retries:
                logging.error(f"Failed after {max_retries+1} attempts due to JSONDecodeError.")
                return faculty_name_hint or "Unknown", []
            # A JSONDecodeError usually indicates a persistent problem with the response
            # format, so allow at most one retry before giving up.
            if retries <= 1:  # Only retry JSON decode once
                logging.info(f"Retrying JSON decode in {backoff_time}s...")
                time.sleep(backoff_time)
                backoff_time *= 2
            else:
                return faculty_name_hint or "Unknown", []  # Give up on JSON decode errors after one retry
        except Exception as e:  # Catches other API errors, network issues, etc.
            retries += 1
            logging.warning(f"API Error on attempt {retries}/{max_retries+1} for faculty '{faculty_name_hint or 'Unknown'}': {e}")
            if "content filter" in str(e).lower():
                logging.error(f"Content filter triggered for faculty '{faculty_name_hint or 'Unknown'}'. No further retries for this error.")
                return faculty_name_hint or "Unknown", []  # Don't retry content filter errors

            if retries > max_retries:
                logging.error(f"Failed after {max_retries+1} attempts for faculty '{faculty_name_hint or 'Unknown'}'.")
                return faculty_name_hint or "Unknown", []
                
            logging.info(f"Retrying in {backoff_time}s for faculty '{faculty_name_hint or 'Unknown'}'...")
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponential backoff
    return faculty_name_hint or "Unknown", []  # Should be unreachable if logic is correct

def validate_and_clean_accomplishment(item, faculty_name_cv, filename):
    """Validates and cleans a single accomplishment item."""
    category = item.get("category", "Other")
    main_category_map = {
        "Book, Authored": "Books & Book Contributions",
        "Book, Chapter": "Books & Book Contributions",
        "Book, Edited": "Books & Book Contributions",
        "Book, introduction, preface, etc.": "Books & Book Contributions",
        "Journal Article, peer-reviewed": "Journal & Article Publications",
        "Journal Article, other": "Journal & Article Publications",
        "Newspaper/Magazine Article": "Journal & Article Publications",
        "Review/Commentary (including Blogging)": "Journal & Article Publications",
        "Conference Presentation - published as proceedings": "Conference & Presentations",
        "Conference Presentation, other": "Conference & Presentations",
        "Lecture (Invited)": "Conference & Presentations",
        "Digital Project": "Creative & Artistic Works",
        "Curated an Art Show": "Creative & Artistic Works",
        "Direction/Choreography/Dramaturgy/Design": "Creative & Artistic Works",
        "Exhibited at Curated Art Show": "Creative & Artistic Works",
        "Music Composition Published/Performed": "Creative & Artistic Works",
        "Performance (music, dance, theater)": "Creative & Artistic Works",
        "Play or Screenplay Produced/Performed": "Creative & Artistic Works",
        "Poem or Short Story Published": "Creative & Artistic Works",
        "Legal Brief (Submitted)": "Legal & Technical Documents",
        "Legal Review": "Legal & Technical Documents",
        "Technical/Policy Reports, peer-reviewed": "Legal & Technical Documents",
        "Technical/Policy Reports, other": "Legal & Technical Documents",
        "Grant (External)": "Funding, Grants & Awards",
        "Grant (Internal)": "Funding, Grants & Awards",
        "Fellowship": "Funding, Grants & Awards",
        "Award/Honor": "Funding, Grants & Awards",
        "Patent": "Other Scholarly Contributions",
        "Other": "Other Scholarly Contributions"
    }
    main_category = item.get("main_category")
    # If main_category is not provided by LLM or is unexpected, try to map it
    if not main_category or main_category not in MAIN_CATEGORIES:
        main_category = main_category_map.get(category, "Other Scholarly Contributions")

    year = str(item.get("year", "N/A"))  # Ensure year is string
    description = item.get("description", "").strip()
    doi_url = item.get("doi_url", "N/A")
    funding_amount = item.get("funding_amount", "N/A")
    confidence = item.get("confidence", 3)  # Default to medium confidence
    try:
        confidence = int(confidence)
    except (ValueError, TypeError):
        confidence = 3  # Default if conversion fails

    needs_review = confidence < 3

    # Basic validation: if description is empty, skip
    if not description:
        return None

    return {
        "Faculty_Name": faculty_name_cv,
        "CV_Filename": os.path.basename(filename),
        "Main_Category": main_category,
        "Category": category,
        "Year": year,
        "Description": description,
        "DOI_URL": doi_url,
        "Funding_Amount": funding_amount,
        "Confidence": confidence,
        "Needs_Review": "Yes" if needs_review else "No"
    }

# --- Gradio App Functions ---

def check_password(password):
    """Check if the provided password matches the app password."""
    if not APP_PASSWORD:
        # If no password is set, allow access (for development)
        return True
    return password == APP_PASSWORD

def process_cv_files(pdf_files, progress=gr.Progress()):
    """Process uploaded CV files and extract accomplishments."""
    if not pdf_files:
        raise gr.Error("Please upload at least one PDF file.")
    
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API key is not configured. Please set the GOOGLE_API_KEY environment variable.")
    
    genai.configure(api_key=GOOGLE_API_KEY)
    
    all_accomplishments = []
    total_steps = len(pdf_files) * 4  # 4 steps per file: extract text, get metadata, extract accomplishments, process results
    current_step = 0
    
    # Process each PDF file
    for i, pdf_file in enumerate(pdf_files):
        file_name = os.path.basename(pdf_file.name)
        progress(current_step/total_steps, f"Processing file {i+1}/{len(pdf_files)}: {file_name}")
        current_step += 1
        
        # Extract text from PDF
        progress(current_step/total_steps, f"Extracting text from {file_name}")
        cv_text = extract_text_from_pdf(pdf_file)
        if not cv_text:
            gr.Warning(f"Could not extract text from {file_name}. Skipping.")
            current_step += 3  # Skip remaining steps for this file
            continue
        current_step += 1
        
        # Get PDF metadata
        progress(current_step/total_steps, f"Processing metadata for {file_name}")
        pdf_metadata = extract_pdf_metadata(pdf_file)
        current_step += 1
        
        # Extract faculty name and accomplishments
        progress(current_step/total_steps, f"Extracting accomplishments from {file_name}")
        faculty_name_cv, accomplishments_list = get_accomplishments_with_retry(cv_text)
        current_step += 1
        
        # Fallback logic if LLM returns "Unknown"
        if faculty_name_cv == "Unknown":
            metadata_author = pdf_metadata.get('author', '').strip()
            if metadata_author:
                faculty_name_cv = metadata_author
                logging.info(f"Used PDF metadata author '{faculty_name_cv}' for {pdf_file.name}")
        
        if faculty_name_cv == "Unknown":  # If still unknown, try filename
            name_from_file = os.path.splitext(os.path.basename(pdf_file.name))[0].replace("_", " ").replace("-", " ")
            # Basic heuristic to see if it looks like a name
            if len(name_from_file.split()) > 1 and len(name_from_file.split()) < 4:
                faculty_name_cv = name_from_file.title()
        
        # Process accomplishments
        if accomplishments_list:
            for item in accomplishments_list:
                processed_item = validate_and_clean_accomplishment(item, faculty_name_cv, pdf_file.name)
                if processed_item:
                    all_accomplishments.append(processed_item)
        else:
            gr.Warning(f"No accomplishments found for {os.path.basename(pdf_file.name)}.")
    
    if not all_accomplishments:
        raise gr.Error("No accomplishments were extracted from the provided PDFs.")
    
    # Convert to DataFrame for display
    df = pd.DataFrame(all_accomplishments)
    
    # Create CSV in memory for download
    csv_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
    df.to_csv(csv_file.name, index=False)
    
    return df, csv_file.name

# --- Gradio Interface ---

# Create the authentication interface
with gr.Blocks(title="CV to CSV Extraction App") as app:
    gr.Markdown("# CV to CSV Extraction App")
    gr.Markdown("Extract publications and accomplishments from faculty CVs")
    
    # Authentication state
    authenticated = gr.State(False)
    
    # Login interface
    with gr.Group(visible=True) as login_group:
        gr.Markdown("### Authentication Required")
        password_input = gr.Textbox(type="password", label="Password")
        login_button = gr.Button("Login")
        login_error = gr.Markdown(visible=False)
    
    # Main app interface (initially hidden)
    with gr.Group(visible=False) as main_app:
        with gr.Tab("Extract from CVs"):
            gr.Markdown("### Upload Faculty CV PDFs")
            gr.Markdown("Upload one or more PDF files containing faculty CVs. The app will extract publications and other scholarly accomplishments.")
            
            # File upload
            pdf_input = gr.File(file_count="multiple", label="Upload CV PDFs", file_types=[".pdf"])
            process_button = gr.Button("Extract Accomplishments")
            
            # Results display
            results = gr.DataFrame(label="Extracted Accomplishments", interactive=False)
            
            # Download button
            csv_output = gr.File(label="Download as CSV")
            
            # Process button click
            process_button.click(
                fn=process_cv_files,
                inputs=[pdf_input],
                outputs=[results, csv_output],
                api_name="extract_accomplishments"
            )
    
    # Login button click
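    # Gradio handlers may return a dict keyed by output components to update
    # only a subset of the declared outputs, as the login handler does below.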
    def login(password):
        if check_password(password):
            return {
                login_group: gr.update(visible=False),
                main_app: gr.update(visible=True),
                login_error: gr.update(visible=False),
                authenticated: True
            }
        else:
            return {
                login_error: gr.update(visible=True, value="Invalid password. Please try again."),
                authenticated: False
            }
    
    login_button.click(
        fn=login,
        inputs=[password_input],
        outputs=[login_group, main_app, login_error, authenticated]
    )
    
    # Make pressing Enter in password field work
    password_input.submit(
        fn=login,
        inputs=[password_input],
        outputs=[login_group, main_app, login_error, authenticated]
    )

# Launch the app
if __name__ == "__main__":
    app.launch()