rodrigomasini committed on
Commit
ae5cad8
·
verified ·
1 Parent(s): f6a3cf4

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +16 -21
main.py CHANGED
@@ -6,11 +6,12 @@ from pathlib import Path
6
  from typing import List, Union, Optional, Dict, Any
7
 
8
  from fastapi import FastAPI, File, UploadFile, HTTPException, status
9
- from pydantic import BaseModel, Field, field_validator
10
- from PIL import Image # For type hinting, though not directly used in API models
11
 
12
- # Import necessary components from the monolithic script
13
- from mdr_pdf_parse import (
 
14
  MagicPDFProcessor,
15
  MDRStructuredBlock,
16
  MDRTextBlock,
@@ -163,7 +164,7 @@ class MDRFigureBlockModel(MDRBasicBlockModel):
163
  )
164
 
165
  # Union type for the response model
166
- MDRStructuredBlockModel = Union[MDRTextBlockModel, MDRTableBlockModel, MDRFormulaBlockModel, MDRFigureBlockModel]
167
 
168
  # --- FastAPI App ---
169
  app = FastAPI(
@@ -187,7 +188,7 @@ async def health_check():
187
  return {"status": "ok", "message": "MagicPDFProcessor is running."}
188
 
189
  @app.post("/process-pdf/",
190
- response_model=List[MDRStructuredBlockModel], # Use the Union type
191
  summary="Process a PDF file",
192
  description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
193
  async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
@@ -201,6 +202,7 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
201
  raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
202
 
203
  # Save uploaded file temporarily
 
204
  try:
205
  # Create a temporary directory if it doesn't exist
206
  temp_dir = Path("./temp_uploads")
@@ -215,7 +217,7 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
215
  except Exception as e:
216
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
217
 
218
- extracted_blocks_api: List[MDRStructuredBlockModel] = []
219
  start_process_time = time.time()
220
 
221
  try:
@@ -246,11 +248,12 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
246
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
247
  finally:
248
  # Clean up the temporary file
249
- try:
250
- os.remove(temp_pdf_path)
251
- print(f"Cleaned up temporary file: {temp_pdf_path}")
252
- except OSError as e:
253
- print(f"Warning: Could not remove temporary file {temp_pdf_path}: {e}")
 
254
 
255
  return extracted_blocks_api
256
 
@@ -261,12 +264,4 @@ async def read_root():
261
  "message": "Welcome to the MagicDataReadiness PDF Processor API!",
262
  "docs_url": "/docs",
263
  "health_url": "/health"
264
- }
265
-
266
- # --- Run with Uvicorn (for local testing) ---
267
- # This part is usually not included when deploying with Docker,
268
- # as Docker CMD handles running uvicorn.
269
- # if __name__ == "__main__":
270
- # import uvicorn
271
- # print("Starting Uvicorn server locally...")
272
- # uvicorn.run(app, host="0.0.0.0", port=8000)
 
6
  from typing import List, Union, Optional, Dict, Any
7
 
8
  from fastapi import FastAPI, File, UploadFile, HTTPException, status
9
+ from pydantic import BaseModel, Field # Removed field_validator as it wasn't used
10
+ from PIL import Image # For type hinting
11
 
12
+ # --- CORRECTED IMPORT ---
13
+ # Import directly from the monolithic script file name
14
+ from magic_pdf_processor import (
15
  MagicPDFProcessor,
16
  MDRStructuredBlock,
17
  MDRTextBlock,
 
164
  )
165
 
166
  # Union type for the response model
167
+ MDRStructuredBlockModelAPI = Union[MDRTextBlockModel, MDRTableBlockModel, MDRFormulaBlockModel, MDRFigureBlockModel] # Renamed API Union type
168
 
169
  # --- FastAPI App ---
170
  app = FastAPI(
 
188
  return {"status": "ok", "message": "MagicPDFProcessor is running."}
189
 
190
  @app.post("/process-pdf/",
191
+ response_model=List[MDRStructuredBlockModelAPI], # Use the Union type
192
  summary="Process a PDF file",
193
  description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
194
  async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
 
202
  raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
203
 
204
  # Save uploaded file temporarily
205
+ temp_pdf_path = "" # Initialize path
206
  try:
207
  # Create a temporary directory if it doesn't exist
208
  temp_dir = Path("./temp_uploads")
 
217
  except Exception as e:
218
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
219
 
220
+ extracted_blocks_api: List[MDRStructuredBlockModelAPI] = []
221
  start_process_time = time.time()
222
 
223
  try:
 
248
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
249
  finally:
250
  # Clean up the temporary file
251
+ if temp_pdf_path and os.path.exists(temp_pdf_path):
252
+ try:
253
+ os.remove(temp_pdf_path)
254
+ print(f"Cleaned up temporary file: {temp_pdf_path}")
255
+ except OSError as e:
256
+ print(f"Warning: Could not remove temporary file {temp_pdf_path}: {e}")
257
 
258
  return extracted_blocks_api
259
 
 
264
  "message": "Welcome to the MagicDataReadiness PDF Processor API!",
265
  "docs_url": "/docs",
266
  "health_url": "/health"
267
+ }