|
import os |
|
import time |
|
import tempfile |
|
import traceback |
|
from pathlib import Path |
|
from typing import List, Union, Optional, Dict, Any, Literal |
|
|
|
from fastapi import FastAPI, File, UploadFile, HTTPException, status |
|
from pydantic import BaseModel, Field |
|
from PIL import Image |
|
|
|
|
|
from mdr_pdf_parser import ( |
|
MagicPDFProcessor, |
|
MDRStructuredBlock, |
|
MDRTextBlock, |
|
MDRTableBlock, |
|
MDRFormulaBlock, |
|
MDRFigureBlock, |
|
MDRTextKind, |
|
MDRTableFormat, |
|
MDRRectangle, |
|
MDRTextSpan, |
|
MDRExtractedTableFormat |
|
) |
|
|
|
|
|
|
|
# --- Service configuration, read once from the environment at import time ---
# Directory handed to the processor as its model location.
MODEL_DIR = os.getenv("MDR_MODEL_DIR", "/models")
# Device string handed to the processor (defaults to "cuda").
DEVICE = os.getenv("MDR_DEVICE", "cuda")
# Name of the table extraction format; resolved to an enum member below.
TABLE_FORMAT_STR = os.getenv("MDR_TABLE_FORMAT", "MARKDOWN")

# Resolve the configured name (case-insensitively) to a format member.
# An unknown name is not fatal: table extraction is simply disabled.
try:
    TABLE_FORMAT = MDRExtractedTableFormat[TABLE_FORMAT_STR.upper()]
except KeyError:
    print(f"Warning: Invalid MDR_TABLE_FORMAT '{TABLE_FORMAT_STR}'. Defaulting to DISABLE.")
    TABLE_FORMAT = MDRExtractedTableFormat.DISABLE
|
|
|
|
|
|
|
# --- One-time processor construction at import time ---
# Announce the effective configuration before attempting to build the processor.
print(
    "Initializing MagicPDFProcessor...",
    f"Model Directory: {MODEL_DIR}",
    f"Target Device: {DEVICE}",
    f"Table Format: {TABLE_FORMAT.name}",
    sep="\n",
)
start_time = time.time()
try:
    mdr_processor = MagicPDFProcessor(
        device=DEVICE,
        model_dir_path=MODEL_DIR,
        extract_table_format=TABLE_FORMAT,
        debug_dir_path=None,
    )
    print(f"MagicPDFProcessor initialized successfully ({time.time() - start_time:.2f}s)")
except Exception as init_error:
    # Construction failures are reported here and recorded as ``None`` so the
    # rest of the module can still be imported; the endpoints and the startup
    # hook check for ``None`` themselves.
    print(f"FATAL ERROR: Failed to initialize MagicPDFProcessor during startup: {init_error}")
    print("Service cannot start.")
    traceback.print_exc()
    mdr_processor = None
|
|
|
|
|
|
|
|
|
class MDRPointModel(BaseModel):
    """A 2D point with float ``x`` and ``y`` coordinates."""

    x: float
    y: float
|
|
|
class MDRRectangleModel(BaseModel):
    """A rectangle described by its four corner points.

    The corners are named ``lt``, ``rt``, ``lb`` and ``rb`` (presumably
    left-top, right-top, left-bottom, right-bottom -- confirm against the
    parser's ``MDRRectangle``).
    """

    lt: MDRPointModel
    rt: MDRPointModel
    lb: MDRPointModel
    rb: MDRPointModel

    @classmethod
    def from_mdr_rectangle(cls, rect: MDRRectangle):
        """Build the API model from a parser rectangle.

        Each corner attribute of ``rect`` is read by index: element 0 becomes
        ``x`` and element 1 becomes ``y``.
        """
        corner_points = {}
        for corner_name in ("lt", "rt", "lb", "rb"):
            raw_corner = getattr(rect, corner_name)
            corner_points[corner_name] = MDRPointModel(x=raw_corner[0], y=raw_corner[1])
        return cls(**corner_points)
|
|
|
class MDRTextSpanModel(BaseModel):
    """A span of text together with its rank value and bounding rectangle."""

    content: str
    rank: float
    rect: MDRRectangleModel

    @classmethod
    def from_mdr_text_span(cls, span: MDRTextSpan):
        """Build the API model from a parser text span.

        ``content`` and ``rank`` are copied as-is; ``rect`` is converted via
        ``MDRRectangleModel.from_mdr_rectangle``.
        """
        span_fields = {
            "content": span.content,
            "rank": span.rank,
            "rect": MDRRectangleModel.from_mdr_rectangle(span.rect),
        }
        return cls(**span_fields)
|
|
|
class MDRBasicBlockModel(BaseModel):
    """Fields shared by every block model returned by the API.

    Subclasses narrow ``block_type`` to a fixed literal and add their own
    type-specific fields.
    """

    # Name of the concrete block kind; each subclass pins this to a literal.
    block_type: str
    # Bounding rectangle of the block.
    rect: MDRRectangleModel
    # Text spans attached to the block; defaults to a fresh empty list.
    texts: List[MDRTextSpanModel] = Field(default_factory=list)
    font_size: float
|
|
|
class MDRTextBlockModel(MDRBasicBlockModel):
    """API model for a text block; ``block_type`` is fixed to ``"TextBlock"``."""

    block_type: Literal["TextBlock"] = "TextBlock"
    # Name of the parser's text-kind enum member (stored as a plain string).
    kind: str
    has_paragraph_indentation: bool
    last_line_touch_end: bool

    # Redeclared here without the ``default_factory`` that the base class attaches.
    texts: List[MDRTextSpanModel]

    @classmethod
    def from_mdr_text_block(cls, block: MDRTextBlock):
        """Build the API model from a parser text block.

        The rectangle and every text span are converted to their API models;
        ``kind`` is stored as the enum member's ``name``.
        """
        rect_model = MDRRectangleModel.from_mdr_rectangle(block.rect)
        span_models = list(map(MDRTextSpanModel.from_mdr_text_span, block.texts))
        return cls(
            rect=rect_model,
            texts=span_models,
            font_size=block.font_size,
            kind=block.kind.name,
            has_paragraph_indentation=block.has_paragraph_indentation,
            last_line_touch_end=block.last_line_touch_end,
        )
|
|
|
class MDRTableBlockModel(MDRBasicBlockModel):
    """API model for a table block; ``block_type`` is fixed to ``"TableBlock"``."""

    block_type: Literal["TableBlock"] = "TableBlock"
    # Table content as produced by the parser.
    content: str
    # Name of the parser's table-format enum member (stored as a plain string).
    format: str

    @classmethod
    def from_mdr_table_block(cls, block: MDRTableBlock):
        """Build the API model from a parser table block.

        The rectangle and text spans are converted to their API models;
        ``format`` is stored as the enum member's ``name``.
        """
        table_fields = {
            "rect": MDRRectangleModel.from_mdr_rectangle(block.rect),
            "texts": [MDRTextSpanModel.from_mdr_text_span(item) for item in block.texts],
            "font_size": block.font_size,
            "content": block.content,
            "format": block.format.name,
        }
        return cls(**table_fields)
|
|
|
class MDRFormulaBlockModel(MDRBasicBlockModel):
    """API model for a formula block; ``block_type`` is fixed to ``"FormulaBlock"``."""

    block_type: Literal["FormulaBlock"] = "FormulaBlock"
    # Formula content; optional and ``None`` by default.
    content: Optional[str] = None

    @classmethod
    def from_mdr_formula_block(cls, block: MDRFormulaBlock):
        """Build the API model from a parser formula block.

        The rectangle and text spans are converted to their API models;
        ``content`` is passed through unchanged.
        """
        rect_model = MDRRectangleModel.from_mdr_rectangle(block.rect)
        span_models = []
        for text_span in block.texts:
            span_models.append(MDRTextSpanModel.from_mdr_text_span(text_span))
        return cls(
            rect=rect_model,
            texts=span_models,
            font_size=block.font_size,
            content=block.content,
        )
|
|
|
class MDRFigureBlockModel(MDRBasicBlockModel):
    """API model for a figure block; ``block_type`` is fixed to ``"FigureBlock"``.

    Adds no fields beyond those of ``MDRBasicBlockModel``.
    """

    block_type: Literal["FigureBlock"] = "FigureBlock"

    @classmethod
    def from_mdr_figure_block(cls, block: MDRFigureBlock):
        """Build the API model from a parser figure block.

        Only the rectangle, text spans and font size are carried over.
        """
        rect_model = MDRRectangleModel.from_mdr_rectangle(block.rect)
        span_models = list(map(MDRTextSpanModel.from_mdr_text_span, block.texts))
        return cls(rect=rect_model, texts=span_models, font_size=block.font_size)
|
|
|
|
|
# Any one of the concrete block models; used as the element type of the
# PDF-processing endpoint's response.
MDRStructuredBlockModelAPI = Union[
    MDRTextBlockModel,
    MDRTableBlockModel,
    MDRFormulaBlockModel,
    MDRFigureBlockModel,
]

# The FastAPI application object that the route decorators below attach to.
app = FastAPI(
    title="MagicDataReadiness PDF Processor",
    description="API service to extract structured content from PDF files.",
    version="1.0.0",
)
|
|
|
@app.on_event("startup")
async def startup_event():
    """Abort application startup when the processor was not created.

    Raises:
        RuntimeError: If the module-level ``mdr_processor`` is ``None``.
    """
    processor_ready = mdr_processor is not None
    if processor_ready:
        print("MagicDataReadiness Service is ready.")
        return
    raise RuntimeError("MagicPDFProcessor failed to initialize. Service cannot start.")
|
|
|
@app.get("/health")
async def health_check():
    """Health check endpoint.

    Returns a small status payload while the module-level processor exists.

    Raises:
        HTTPException: 503 when the processor is ``None``.
    """
    if mdr_processor is not None:
        return {"status": "ok", "message": "MagicPDFProcessor is running."}
    raise HTTPException(
        status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
        detail="Processor not initialized",
    )
|
|
|
def _convert_block_to_api_model(block: Any) -> Optional[Any]:
    """Map one raw parser block to its API model, or ``None`` for an unknown type."""
    # The isinstance checks run in a fixed order; keep it stable in case the
    # parser's block classes are related by inheritance.
    if isinstance(block, MDRTextBlock):
        return MDRTextBlockModel.from_mdr_text_block(block)
    if isinstance(block, MDRTableBlock):
        return MDRTableBlockModel.from_mdr_table_block(block)
    if isinstance(block, MDRFormulaBlock):
        return MDRFormulaBlockModel.from_mdr_formula_block(block)
    if isinstance(block, MDRFigureBlock):
        return MDRFigureBlockModel.from_mdr_figure_block(block)
    return None


def _remove_temp_file(path: str) -> None:
    """Best-effort removal of an uploaded temp file; failures are only logged."""
    if not path or not os.path.exists(path):
        return
    try:
        os.remove(path)
        print(f"Cleaned up temporary file: {path}")
    except OSError as e:
        print(f"Warning: Could not remove temporary file {path}: {e}")


@app.post(
    "/process-pdf/",
    response_model=List[MDRStructuredBlockModelAPI],
    summary="Process a PDF file",
    description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).",
)
async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
    """Handle a PDF upload, run the processor on it and return the extracted blocks.

    The upload is written to a temporary ``.pdf`` file under ``./temp_uploads``,
    passed to ``mdr_processor.process_document`` by path, and every recognised
    block is converted to its API model. The temporary file is removed
    afterwards, whether saving or processing succeeded or failed.

    Args:
        file: The uploaded file; its name must end in ``.pdf`` (case-insensitive).

    Returns:
        A list of API block models, in the order the processor yielded them.

    Raises:
        HTTPException: 503 if the processor is not initialised, 400 if the
            upload has no filename or is not a ``.pdf``, 500 if saving the
            upload or processing the document fails.
    """
    if mdr_processor is None:
        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Processor not initialized")

    # An upload can arrive without a filename; reject it as a bad request
    # instead of failing with AttributeError on ``None.lower()``.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")

    temp_pdf_path = ""
    try:
        temp_dir = Path("./temp_uploads")
        temp_dir.mkdir(exist_ok=True)

        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=temp_dir) as temp_file:
            # Record the path before reading/writing: the file is created with
            # delete=False, so a failure below must still be able to remove it.
            temp_pdf_path = temp_file.name
            content = await file.read()
            temp_file.write(content)
        print(f"Received file '{file.filename}', saved temporarily to '{temp_pdf_path}'")
    except Exception as e:
        # Do not leave a partially written upload behind.
        _remove_temp_file(temp_pdf_path)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to save uploaded file: {e}",
        ) from e

    extracted_blocks_api: List[MDRStructuredBlockModelAPI] = []
    start_process_time = time.time()

    try:
        print(f"Processing '{temp_pdf_path}'...")

        # NOTE(review): this call is synchronous, so the coroutine does not
        # yield while the document is processed. Offloading it to a worker
        # thread would need confirmation that the processor is thread-safe.
        all_blocks = list(mdr_processor.process_document(pdf_input=temp_pdf_path))
        print(f"Extracted {len(all_blocks)} raw blocks.")

        for block in all_blocks:
            api_block = _convert_block_to_api_model(block)
            if api_block is None:
                # Unknown block types are still skipped, but no longer silently.
                print(f"Warning: Skipping unsupported block type '{type(block).__name__}'.")
                continue
            extracted_blocks_api.append(api_block)

        process_time = time.time() - start_process_time
        print(f"Processing finished in {process_time:.2f}s. Returning {len(extracted_blocks_api)} blocks.")
    except Exception as e:
        print(f"ERROR during PDF processing: {e}")
        traceback.print_exc()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"An error occurred during PDF processing: {e}",
        ) from e
    finally:
        _remove_temp_file(temp_pdf_path)

    return extracted_blocks_api
|
|
|
|
|
@app.get("/")
async def read_root():
    """Return a welcome message with the paths of the docs and health endpoints."""
    landing_payload = dict(
        message="Welcome to the MagicDataReadiness PDF Processor API!",
        docs_url="/docs",
        health_url="/health",
    )
    return landing_payload