Spaces:

levalencia
/

doctorecord

Running

Add cost tracking functionality across various components, including Executor, Planner, and FieldMapperAgent. Integrate CostTracker to monitor LLM and document intelligence costs, enhancing logging for cost-related metrics and providing detailed cost breakdowns in the user interface.

966ffcd 17 days ago

raw

history blame contribute delete

1.24 kB

	"""Extract raw text from the uploaded PDF using PyMuPDF.
	This keeps the implementation minimal for a POC while remaining easy to extend.
	"""
	from typing import Dict, Any, List

	import fitz # PyMuPDF

	from .base_agent import BaseAgent


	class PDFAgent(BaseAgent):
	    """Read the uploaded PDF and store its concatenated page text in ``ctx['text']``.

	    The PDF is opened once per ``execute`` call; the same open document is
	    used both for text extraction and for the page count reported to the
	    optional cost tracker.
	    """

	    @staticmethod
	    def _join_pages(doc) -> str:
	        """Return the text of every page in *doc*, separated by newlines."""
	        return "\n".join(page.get_text() for page in doc)

	    def _extract_text(self, pdf_bytes: bytes) -> str:
	        """Return the newline-joined text of all pages in *pdf_bytes*.

	        The document is closed before returning so the handle is not leaked.
	        """
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:  # type: ignore[arg-type]
	            return self._join_pages(doc)

	    # -----------------------------------------------------
	    def execute(self, ctx: Dict[str, Any]):  # noqa: D401
	        """Extract text from ``ctx['pdf_file']`` and record it in the context.

	        Args:
	            ctx: Shared context. Must contain ``'pdf_file'``, an object whose
	                ``read()`` returns the PDF bytes. May contain
	                ``'cost_tracker'``, an object exposing ``add_di_pages(int)``.

	        Returns:
	            The extracted text, which is also stored under ``ctx['text']``.

	        Raises:
	            ValueError: If ``'pdf_file'`` is missing from *ctx* or is ``None``.
	        """
	        pdf_file = ctx.get("pdf_file")
	        if pdf_file is None:
	            raise ValueError("PDFAgent expected 'pdf_file' in context but none provided.")

	        pdf_bytes = pdf_file.read()

	        # Open the document a single time: the text and the page count both
	        # come from the same parse, and the ``with`` block guarantees the
	        # document is closed even if extraction raises.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:  # type: ignore[arg-type]
	            text = self._join_pages(doc)
	            num_pages = len(doc)

	        ctx["text"] = text

	        # Cost tracking is optional; a missing or ``None`` tracker is skipped.
	        cost_tracker = ctx.get("cost_tracker")
	        if cost_tracker is not None:
	            cost_tracker.add_di_pages(num_pages)

	        return text