rodrigomasini committed on
Commit
3d87b72
·
verified ·
1 Parent(s): d1dd214

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +272 -0
main.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time
import tempfile
import traceback
from pathlib import Path
from typing import List, Literal, Union, Optional, Dict, Any

from fastapi import FastAPI, File, UploadFile, HTTPException, status
from pydantic import BaseModel, Field, field_validator
from PIL import Image  # For type hinting, though not directly used in API models

# Import necessary components from the monolithic script
from magic_pdf_processor import (
    MagicPDFProcessor,
    MDRStructuredBlock,
    MDRTextBlock,
    MDRTableBlock,
    MDRFormulaBlock,
    MDRFigureBlock,
    MDRTextKind,
    MDRTableFormat,
    MDRRectangle,
    MDRTextSpan,
    MDRExtractedTableFormat  # For configuration
)
26
+
27
# --- Configuration ---
# Every setting is looked up in the process environment; the second argument
# is the value used when the variable is not set.
MODEL_DIR = os.getenv("MDR_MODEL_DIR", "/models")  # Default path inside container
# NOTE(review): the original comment states the processor falls back to
# another device when CUDA is unavailable -- confirm in MagicPDFProcessor.
DEVICE = os.getenv("MDR_DEVICE", "cuda")
TABLE_FORMAT_STR = os.getenv("MDR_TABLE_FORMAT", "MARKDOWN")  # Default table format

# Resolve the configured name (case-insensitively) to a member of
# MDRExtractedTableFormat; an unknown name disables table extraction.
try:
    TABLE_FORMAT = MDRExtractedTableFormat[TABLE_FORMAT_STR.upper()]
except KeyError:
    TABLE_FORMAT = MDRExtractedTableFormat.DISABLE
    print(f"Warning: Invalid MDR_TABLE_FORMAT '{TABLE_FORMAT_STR}'. Defaulting to DISABLE.")
39
+
40
# --- Initialize Processor ---
# Runs once at import time; the resulting instance is shared by all requests.
print(
    "Initializing MagicPDFProcessor...",
    f"Model Directory: {MODEL_DIR}",
    f"Target Device: {DEVICE}",
    f"Table Format: {TABLE_FORMAT.name}",
    sep="\n",
)
start_time = time.time()
try:
    mdr_processor = MagicPDFProcessor(
        model_dir_path=MODEL_DIR,
        device=DEVICE,
        extract_table_format=TABLE_FORMAT,
        # No debug output directory for the production service.
        debug_dir_path=None,
    )
    print(f"MagicPDFProcessor initialized successfully ({time.time() - start_time:.2f}s)")
except Exception as exc:
    # Leave a None sentinel behind instead of crashing the import; the
    # startup hook and the endpoints check for it and refuse to serve.
    mdr_processor = None
    print(f"FATAL ERROR: Failed to initialize MagicPDFProcessor during startup: {exc}")
    print("Service cannot start.")
    traceback.print_exc()
62
+
63
+ # --- API Models (Pydantic) ---
64
+ # Define models for API input/output for validation and documentation
65
+
66
class MDRPointModel(BaseModel):
    # API representation of one 2-D coordinate pair.
    # (Documented with comments rather than a docstring so the generated
    # JSON schema stays unchanged.)
    x: float
    y: float
69
+
70
class MDRRectangleModel(BaseModel):
    # Four-corner rectangle as exposed by the API. The field names mirror
    # the lt / rt / lb / rb attributes read from MDRRectangle below.
    lt: MDRPointModel
    rt: MDRPointModel
    lb: MDRPointModel
    rb: MDRPointModel

    @classmethod
    def from_mdr_rectangle(cls, rect: MDRRectangle):
        """Build the API model from an internal ``MDRRectangle``.

        Each corner attribute is read by index: element 0 becomes ``x`` and
        element 1 becomes ``y``.
        """
        corner_models = {}
        for corner_name in ("lt", "rt", "lb", "rb"):
            raw_corner = getattr(rect, corner_name)
            corner_models[corner_name] = MDRPointModel(x=raw_corner[0], y=raw_corner[1])
        return cls(**corner_models)
84
+
85
class MDRTextSpanModel(BaseModel):
    # API representation of a text span: its content, its rank value and
    # the rectangle it occupies.
    content: str
    rank: float
    rect: MDRRectangleModel

    @classmethod
    def from_mdr_text_span(cls, span: MDRTextSpan):
        """Build the API model from an internal ``MDRTextSpan``."""
        api_rect = MDRRectangleModel.from_mdr_rectangle(span.rect)
        return cls(content=span.content, rank=span.rank, rect=api_rect)
97
+
98
class MDRBasicBlockModel(BaseModel):
    # Fields common to every block model returned by the API.

    # Tag that tells the concrete block models apart inside the response union.
    block_type: str
    rect: MDRRectangleModel
    # Captions/footnotes attached to the block; empty list when none are given.
    texts: List[MDRTextSpanModel] = Field(default_factory=list)
    font_size: float
103
+
104
class MDRTextBlockModel(MDRBasicBlockModel):
    # API model for a text block.
    block_type: Literal["TextBlock"] = "TextBlock"
    # Name of the internal kind enum member (stored as a plain string).
    kind: str
    has_paragraph_indentation: bool
    last_line_touch_end: bool
    # Redeclared without a default: for a text block these spans are the
    # text content itself, so they must always be supplied.
    texts: List[MDRTextSpanModel]

    @classmethod
    def from_mdr_text_block(cls, block: MDRTextBlock):
        """Build the API model from an internal ``MDRTextBlock``."""
        span_models = list(map(MDRTextSpanModel.from_mdr_text_span, block.texts))
        return cls(
            rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
            texts=span_models,
            font_size=block.font_size,
            kind=block.kind.name,  # enum member -> its name
            has_paragraph_indentation=block.has_paragraph_indentation,
            last_line_touch_end=block.last_line_touch_end,
        )
122
+
123
class MDRTableBlockModel(MDRBasicBlockModel):
    # API model for a table block. Per the original note, the internal
    # block's 'image' is intentionally left out of the API response.
    block_type: Literal["TableBlock"] = "TableBlock"
    content: str
    # Name of the internal table-format enum member (stored as a plain string).
    format: str

    @classmethod
    def from_mdr_table_block(cls, block: MDRTableBlock):
        """Build the API model from an internal ``MDRTableBlock``."""
        caption_models = list(map(MDRTextSpanModel.from_mdr_text_span, block.texts))
        return cls(
            rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
            texts=caption_models,  # captions
            font_size=block.font_size,
            content=block.content,
            format=block.format.name,  # enum member -> its name
        )
138
+
139
class MDRFormulaBlockModel(MDRBasicBlockModel):
    # API model for a formula block. Per the original note, the internal
    # block's 'image' is intentionally left out of the API response.
    block_type: Literal["FormulaBlock"] = "FormulaBlock"
    # May be absent; defaults to None.
    content: Optional[str] = None

    @classmethod
    def from_mdr_formula_block(cls, block: MDRFormulaBlock):
        """Build the API model from an internal ``MDRFormulaBlock``."""
        caption_models = list(map(MDRTextSpanModel.from_mdr_text_span, block.texts))
        return cls(
            rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
            texts=caption_models,  # captions
            font_size=block.font_size,
            content=block.content,
        )
152
+
153
class MDRFigureBlockModel(MDRBasicBlockModel):
    # API model for a figure block. Per the original note, the internal
    # block's 'image' is intentionally left out of the API response.
    block_type: Literal["FigureBlock"] = "FigureBlock"

    @classmethod
    def from_mdr_figure_block(cls, block: MDRFigureBlock):
        """Build the API model from an internal ``MDRFigureBlock``."""
        caption_models = list(map(MDRTextSpanModel.from_mdr_text_span, block.texts))
        return cls(
            rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
            texts=caption_models,  # captions
            font_size=block.font_size,
        )
164
+
165
# A response item may be any one of the concrete block models.
MDRStructuredBlockModel = Union[
    MDRTextBlockModel,
    MDRTableBlockModel,
    MDRFormulaBlockModel,
    MDRFigureBlockModel,
]

# --- FastAPI App ---
# The metadata below is what FastAPI shows in the generated API docs.
app = FastAPI(
    version="1.0.0",
    title="MagicDataReadiness PDF Processor",
    description="API service to extract structured content from PDF files.",
)
174
+
175
@app.on_event("startup")
async def startup_event():
    """Refuse to start the app when the module-level processor is missing.

    Raises:
        RuntimeError: if ``mdr_processor`` is ``None``, i.e. initialization
            at import time failed.
    """
    if mdr_processor is not None:
        print("MagicDataReadiness Service is ready.")
        return
    # Raising here aborts application startup.
    raise RuntimeError("MagicPDFProcessor failed to initialize. Service cannot start.")
181
+
182
@app.get("/health")
async def health_check():
    """Simple health check endpoint."""
    # The docstring above is kept verbatim: FastAPI publishes it as the
    # operation description when no explicit description= is passed.
    processor_ready = mdr_processor is not None
    if not processor_ready:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Processor not initialized",
        )
    return dict(status="ok", message="MagicPDFProcessor is running.")
188
+
189
def _remove_temp_file(path):
    """Delete the temporary upload at *path*, reporting the outcome on stdout."""
    try:
        os.remove(path)
        print(f"Cleaned up temporary file: {path}")
    except OSError as e:
        print(f"Warning: Could not remove temporary file {path}: {e}")


@app.post("/process-pdf/",
          response_model=List[MDRStructuredBlockModel],  # Use the Union type
          summary="Process a PDF file",
          description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
    """Handle a PDF upload, run the processor on it and return the extracted blocks.

    The upload is written to a temporary ``.pdf`` file under ``./temp_uploads``,
    passed to ``mdr_processor.process_document`` by path, and the temporary
    file is removed afterwards whether or not processing succeeded.

    Args:
        file: The uploaded file; its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A list of API block models, one per recognized internal block.

    Raises:
        HTTPException: 503 if the processor is not initialized; 400 if the
            upload has no filename or the name does not end in ``.pdf``;
            500 if saving the upload or processing it fails.
    """
    if mdr_processor is None:
        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Processor not initialized")

    # The filename can be missing on an upload; treat that as an invalid
    # file type (400) instead of failing with AttributeError on None.
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")

    # Save uploaded file temporarily
    temp_pdf_path = None
    try:
        # Create the upload directory if it doesn't exist
        temp_dir = Path("./temp_uploads")
        temp_dir.mkdir(exist_ok=True)

        # delete=False: the file must outlive this `with` so the processor
        # can open it by path; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=temp_dir) as temp_file:
            temp_pdf_path = temp_file.name
            # Copy in 1 MiB chunks so a large upload is never held in
            # memory in one piece.
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                temp_file.write(chunk)
        print(f"Received file '{file.filename}', saved temporarily to '{temp_pdf_path}'")
    except Exception as e:
        # Don't leave a partially written file behind when saving fails.
        if temp_pdf_path is not None:
            _remove_temp_file(temp_pdf_path)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to save uploaded file: {e}",
        ) from e

    extracted_blocks_api: List[MDRStructuredBlockModel] = []
    start_process_time = time.time()

    try:
        print(f"Processing '{temp_pdf_path}'...")
        # NOTE(review): process_document is called synchronously inside this
        # coroutine, so the event loop is blocked while it runs. Offloading
        # it to a worker thread would first require confirming that the
        # processor is safe to use from several threads.
        # list(...) drains the result so every block is available at once.
        all_blocks = list(mdr_processor.process_document(pdf_input=temp_pdf_path))
        print(f"Extracted {len(all_blocks)} raw blocks.")

        # Convert internal block types to API response models
        for block in all_blocks:
            if isinstance(block, MDRTextBlock):
                extracted_blocks_api.append(MDRTextBlockModel.from_mdr_text_block(block))
            elif isinstance(block, MDRTableBlock):
                extracted_blocks_api.append(MDRTableBlockModel.from_mdr_table_block(block))
            elif isinstance(block, MDRFormulaBlock):
                extracted_blocks_api.append(MDRFormulaBlockModel.from_mdr_formula_block(block))
            elif isinstance(block, MDRFigureBlock):
                extracted_blocks_api.append(MDRFigureBlockModel.from_mdr_figure_block(block))
            else:
                # Still skipped, but no longer silently: make dropped
                # blocks visible in the service output.
                print(f"Warning: Skipping unsupported block type '{type(block).__name__}'")

        process_time = time.time() - start_process_time
        print(f"Processing finished in {process_time:.2f}s. Returning {len(extracted_blocks_api)} blocks.")

    except Exception as e:
        print(f"ERROR during PDF processing: {e}")
        traceback.print_exc()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"An error occurred during PDF processing: {e}",
        ) from e
    finally:
        # Clean up the temporary file
        _remove_temp_file(temp_pdf_path)

    return extracted_blocks_api
256
+
257
# Optional root endpoint: returns a welcome message plus the paths of the
# docs and health endpoints.
# (No docstring on purpose: FastAPI would publish it as the operation
# description in the generated schema.)
@app.get("/")
async def read_root():
    landing_info = dict(
        message="Welcome to the MagicDataReadiness PDF Processor API!",
        docs_url="/docs",
        health_url="/health",
    )
    return landing_info
265
+
266
+ # --- Run with Uvicorn (for local testing) ---
267
+ # This part is usually not included when deploying with Docker,
268
+ # as Docker CMD handles running uvicorn.
269
+ # if __name__ == "__main__":
270
+ # import uvicorn
271
+ # print("Starting Uvicorn server locally...")
272
+ # uvicorn.run(app, host="0.0.0.0", port=8000)