Spaces:
Running
Running
File size: 1,236 Bytes
0a40afa 966ffcd 0a40afa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
"""Extract raw text from the uploaded PDF using PyMuPDF.
This keeps the implementation minimal for a POC while remaining easy to extend.
"""
from typing import Dict, Any, List
import fitz # PyMuPDF
from .base_agent import BaseAgent
class PDFAgent(BaseAgent):
    """Read a PDF, concatenate all page text and store it under ``ctx['text']``.

    The PDF is parsed with PyMuPDF (``fitz``) from an in-memory byte string.
    Every document handle is opened in a ``with`` block so it is closed as
    soon as the text and page count have been read.
    """

    @staticmethod
    def _join_pages(doc) -> str:
        """Return the text of every page in the open document *doc*, newline-joined."""
        return "\n".join(page.get_text() for page in doc)

    def _extract_text(self, pdf_bytes: bytes) -> str:
        """Return the concatenated page text of the PDF held in *pdf_bytes*.

        Pages are joined with a single newline. The document is closed
        before returning.
        """
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:  # type: ignore[arg-type]
            return self._join_pages(doc)

    # -----------------------------------------------------
    def execute(self, ctx: Dict[str, Any]) -> str:  # noqa: D401
        """Extract text from ``ctx['pdf_file']`` and store it in ``ctx['text']``.

        Args:
            ctx: Shared context. Must contain ``'pdf_file'``, an object with a
                ``read()`` method that yields the PDF bytes. If it contains
                ``'cost_tracker'``, that object's ``add_di_pages`` is called
                with the number of pages in the document.

        Returns:
            The extracted text (the same value written to ``ctx['text']``).

        Raises:
            ValueError: If ``'pdf_file'`` is missing from *ctx* or is ``None``.
        """
        pdf_file = ctx.get("pdf_file")
        if pdf_file is None:
            raise ValueError("PDFAgent expected 'pdf_file' in context but none provided.")

        pdf_bytes = pdf_file.read()

        # Open the document once: the text and the page count both come from
        # the same handle, and the ``with`` block guarantees it is closed.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:  # type: ignore[arg-type]
            text = self._join_pages(doc)
            num_pages = len(doc)

        ctx["text"] = text

        if "cost_tracker" in ctx:
            ctx["cost_tracker"].add_di_pages(num_pages)

        return text