Spaces:
Running
Running

Refactor AzureDIService to enhance document analysis logging and update table extraction logic. Temporarily disable table extraction and improve content type logging. Update TableAgent to reflect changes in context handling.
54478a0
"""Extract tables from PDF using Azure Document Intelligence.""" | |
from typing import Dict, Any | |
import logging | |
from .base_agent import BaseAgent | |
from services.azure_di_service import AzureDIService | |
class TableAgent(BaseAgent):
    """Agent that sends a PDF to Azure Document Intelligence and stores the text.

    The PDF is taken from the shared context under ``"pdf_file"``; the text
    returned by ``AzureDIService.extract_tables`` is written back under
    ``"text"``.
    """

    def __init__(self, settings):
        """Build the Azure DI service wrapper and a module-level logger.

        Args:
            settings: Object exposing ``AZURE_DI_ENDPOINT`` and
                ``AZURE_DI_KEY`` attributes.
        """
        self.service = AzureDIService(settings.AZURE_DI_ENDPOINT, settings.AZURE_DI_KEY)
        self.logger = logging.getLogger(__name__)

    def execute(self, ctx: Dict[str, Any]) -> Dict[str, Any]:
        """Extract text from the PDF found in ``ctx``.

        Args:
            ctx: Shared context. ``ctx["pdf_file"]`` is expected to be a
                file-like object supporting ``tell``, ``seek`` and ``read``.
                On success ``ctx["text"]`` is set to the extracted text.

        Returns:
            The dict returned by ``AzureDIService.extract_tables`` on
            success, or ``{}`` when the PDF is missing, empty, or any error
            occurs (errors are logged, never raised to the caller).
        """
        try:
            pdf_file = ctx.get("pdf_file")
            if not pdf_file:
                self.logger.error("No PDF file found in context")
                return {}

            # An earlier consumer may have read the stream already; rewind so
            # the full document is sent rather than an empty/partial tail.
            current_pos = pdf_file.tell()
            self.logger.info("Current file position: %s", current_pos)
            if current_pos != 0:
                self.logger.info("Resetting file pointer to beginning")
                pdf_file.seek(0)

            pdf_bytes = pdf_file.read()
            self.logger.info("Read %s bytes from PDF", len(pdf_bytes))

            # Nothing to analyse: avoid a pointless remote call that can only
            # fail, and report the real cause instead of a service error.
            if not pdf_bytes:
                self.logger.error("PDF file is empty; skipping extraction")
                return {}

            result = self.service.extract_tables(pdf_bytes)

            # Only the text is stored for now. The log message below indicates
            # the text is expected to already include table content.
            ctx["text"] = result["text"]
            # TODO: table extraction is temporarily disabled; re-enable with
            # ctx["tables"] = result["tables"] once the service returns tables.
            self.logger.info(
                "Extracted %s characters of text including tables",
                len(result["text"]),
            )
            return result
        except Exception as e:
            # Boundary handler: agents report failure via an empty result.
            # logger.exception records the message and traceback in one entry.
            self.logger.exception("Error in TableAgent: %s", e)
            return {}