Update main.py
Browse files
main.py
CHANGED
@@ -6,11 +6,12 @@ from pathlib import Path
|
|
6 |
from typing import List, Union, Optional, Dict, Any
|
7 |
|
8 |
from fastapi import FastAPI, File, UploadFile, HTTPException, status
|
9 |
-
from pydantic import BaseModel, Field
|
10 |
-
from PIL import Image # For type hinting
|
11 |
|
12 |
-
#
|
13 |
-
from
|
|
|
14 |
MagicPDFProcessor,
|
15 |
MDRStructuredBlock,
|
16 |
MDRTextBlock,
|
@@ -163,7 +164,7 @@ class MDRFigureBlockModel(MDRBasicBlockModel):
|
|
163 |
)
|
164 |
|
165 |
# Union type for the response model
|
166 |
-
|
167 |
|
168 |
# --- FastAPI App ---
|
169 |
app = FastAPI(
|
@@ -187,7 +188,7 @@ async def health_check():
|
|
187 |
return {"status": "ok", "message": "MagicPDFProcessor is running."}
|
188 |
|
189 |
@app.post("/process-pdf/",
|
190 |
-
response_model=List[
|
191 |
summary="Process a PDF file",
|
192 |
description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
|
193 |
async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
|
@@ -201,6 +202,7 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
|
|
201 |
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
|
202 |
|
203 |
# Save uploaded file temporarily
|
|
|
204 |
try:
|
205 |
# Create a temporary directory if it doesn't exist
|
206 |
temp_dir = Path("./temp_uploads")
|
@@ -215,7 +217,7 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
|
|
215 |
except Exception as e:
|
216 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
|
217 |
|
218 |
-
extracted_blocks_api: List[
|
219 |
start_process_time = time.time()
|
220 |
|
221 |
try:
|
@@ -246,11 +248,12 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
|
|
246 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
|
247 |
finally:
|
248 |
# Clean up the temporary file
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
|
|
254 |
|
255 |
return extracted_blocks_api
|
256 |
|
@@ -261,12 +264,4 @@ async def read_root():
|
|
261 |
"message": "Welcome to the MagicDataReadiness PDF Processor API!",
|
262 |
"docs_url": "/docs",
|
263 |
"health_url": "/health"
|
264 |
-
}
|
265 |
-
|
266 |
-
# --- Run with Uvicorn (for local testing) ---
|
267 |
-
# This part is usually not included when deploying with Docker,
|
268 |
-
# as Docker CMD handles running uvicorn.
|
269 |
-
# if __name__ == "__main__":
|
270 |
-
# import uvicorn
|
271 |
-
# print("Starting Uvicorn server locally...")
|
272 |
-
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
6 |
from typing import List, Union, Optional, Dict, Any
|
7 |
|
8 |
from fastapi import FastAPI, File, UploadFile, HTTPException, status
|
9 |
+
from pydantic import BaseModel, Field # Removed field_validator as it wasn't used
|
10 |
+
from PIL import Image # For type hinting
|
11 |
|
12 |
+
# --- CORRECTED IMPORT ---
|
13 |
+
# Import directly from the monolithic script file name
|
14 |
+
from magic_pdf_processor import (
|
15 |
MagicPDFProcessor,
|
16 |
MDRStructuredBlock,
|
17 |
MDRTextBlock,
|
|
|
164 |
)
|
165 |
|
166 |
# Union type for the response model
|
167 |
+
MDRStructuredBlockModelAPI = Union[MDRTextBlockModel, MDRTableBlockModel, MDRFormulaBlockModel, MDRFigureBlockModel] # Renamed API Union type
|
168 |
|
169 |
# --- FastAPI App ---
|
170 |
app = FastAPI(
|
|
|
188 |
return {"status": "ok", "message": "MagicPDFProcessor is running."}
|
189 |
|
190 |
@app.post("/process-pdf/",
|
191 |
+
response_model=List[MDRStructuredBlockModelAPI], # Use the Union type
|
192 |
summary="Process a PDF file",
|
193 |
description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
|
194 |
async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
|
|
|
202 |
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
|
203 |
|
204 |
# Save uploaded file temporarily
|
205 |
+
temp_pdf_path = "" # Initialize path
|
206 |
try:
|
207 |
# Create a temporary directory if it doesn't exist
|
208 |
temp_dir = Path("./temp_uploads")
|
|
|
217 |
except Exception as e:
|
218 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
|
219 |
|
220 |
+
extracted_blocks_api: List[MDRStructuredBlockModelAPI] = []
|
221 |
start_process_time = time.time()
|
222 |
|
223 |
try:
|
|
|
248 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
|
249 |
finally:
|
250 |
# Clean up the temporary file
|
251 |
+
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
252 |
+
try:
|
253 |
+
os.remove(temp_pdf_path)
|
254 |
+
print(f"Cleaned up temporary file: {temp_pdf_path}")
|
255 |
+
except OSError as e:
|
256 |
+
print(f"Warning: Could not remove temporary file {temp_pdf_path}: {e}")
|
257 |
|
258 |
return extracted_blocks_api
|
259 |
|
|
|
264 |
"message": "Welcome to the MagicDataReadiness PDF Processor API!",
|
265 |
"docs_url": "/docs",
|
266 |
"health_url": "/health"
|
267 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|