rodrigomasini committed on
Commit
b9cd1f4
·
verified ·
1 Parent(s): cd9e373

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +175 -97
main.py CHANGED
@@ -2,67 +2,61 @@ import os
2
  import time
3
  import tempfile
4
  import traceback
 
5
  from pathlib import Path
6
  from typing import List, Union, Optional, Dict, Any, Literal
 
7
 
8
- from fastapi import FastAPI, File, UploadFile, HTTPException, status
9
  from pydantic import BaseModel, Field
10
- from PIL import Image
 
11
 
12
  # --- IMPORTS ---
13
  from mdr_pdf_parser import (
14
  MagicPDFProcessor,
15
- MDRStructuredBlock,
16
  MDRTextBlock,
17
  MDRTableBlock,
18
  MDRFormulaBlock,
19
  MDRFigureBlock,
20
- MDRTextKind,
21
- MDRTableFormat,
22
- MDRRectangle,
23
- MDRTextSpan,
24
  MDRExtractedTableFormat # For configuration
25
  )
26
 
 
 
 
 
 
 
 
 
 
 
 
27
  # --- Configuration ---
28
  # Read from environment variables, falling back to defaults
29
- MODEL_DIR = os.environ.get("MDR_MODEL_DIR", "/models") # Default path inside container
30
- DEVICE = os.environ.get("MDR_DEVICE", "cuda") # Default to cuda, processor will fallback if needed
31
- TABLE_FORMAT_STR = os.environ.get("MDR_TABLE_FORMAT", "MARKDOWN") # Default table format
 
32
 
33
  # Convert table format string to Enum
34
  try:
35
  TABLE_FORMAT = MDRExtractedTableFormat[TABLE_FORMAT_STR.upper()]
36
  except KeyError:
37
- print(f"Warning: Invalid MDR_TABLE_FORMAT '{TABLE_FORMAT_STR}'. Defaulting to DISABLE.")
38
  TABLE_FORMAT = MDRExtractedTableFormat.DISABLE
39
 
40
- # --- Initialize Processor ---
41
- # This happens once when the service starts
42
- print("Initializing MagicPDFProcessor...")
43
- print(f"Model Directory: {MODEL_DIR}")
44
- print(f"Target Device: {DEVICE}")
45
- print(f"Table Format: {TABLE_FORMAT.name}")
46
- start_time = time.time()
47
- try:
48
- mdr_processor = MagicPDFProcessor(
49
- device=DEVICE,
50
- model_dir_path=MODEL_DIR,
51
- extract_table_format=TABLE_FORMAT,
52
- # Set debug_dir_path=None for production service
53
- debug_dir_path=None
54
- )
55
- print(f"MagicPDFProcessor initialized successfully ({time.time() - start_time:.2f}s)")
56
- except Exception as e:
57
- print(f"FATAL ERROR: Failed to initialize MagicPDFProcessor during startup: {e}")
58
- print("Service cannot start.")
59
- traceback.print_exc()
60
- # Optionally exit or raise to prevent FastAPI from starting incorrectly
61
- mdr_processor = None # Ensure processor is None if init fails
62
 
63
  # --- API Models (Pydantic) ---
64
- # Define models for API input/output for validation and documentation
65
-
66
  class MDRPointModel(BaseModel):
67
  x: float
68
  y: float
@@ -96,18 +90,17 @@ class MDRTextSpanModel(BaseModel):
96
  )
97
 
98
  class MDRBasicBlockModel(BaseModel):
99
- block_type: str # To distinguish block types in the union
100
  rect: MDRRectangleModel
101
- texts: List[MDRTextSpanModel] = Field(default_factory=list) # Captions/footnotes
102
  font_size: float
103
 
104
  class MDRTextBlockModel(MDRBasicBlockModel):
105
  block_type: Literal["TextBlock"] = "TextBlock"
106
- kind: str # Use string representation of enum
107
  has_paragraph_indentation: bool
108
  last_line_touch_end: bool
109
- # Override texts field specifically for TextBlock
110
- texts: List[MDRTextSpanModel] # Text content itself
111
 
112
  @classmethod
113
  def from_mdr_text_block(cls, block: MDRTextBlock):
@@ -115,7 +108,7 @@ class MDRTextBlockModel(MDRBasicBlockModel):
115
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
116
  texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts],
117
  font_size=block.font_size,
118
- kind=block.kind.name, # Convert enum to string name
119
  has_paragraph_indentation=block.has_paragraph_indentation,
120
  last_line_touch_end=block.last_line_touch_end
121
  )
@@ -123,47 +116,43 @@ class MDRTextBlockModel(MDRBasicBlockModel):
123
  class MDRTableBlockModel(MDRBasicBlockModel):
124
  block_type: Literal["TableBlock"] = "TableBlock"
125
  content: str
126
- format: str # Use string representation of enum
127
- # Omit 'image' field from API response
128
 
129
  @classmethod
130
  def from_mdr_table_block(cls, block: MDRTableBlock):
131
  return cls(
132
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
133
- texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts], # Captions
134
  font_size=block.font_size,
135
  content=block.content,
136
- format=block.format.name # Convert enum to string name
137
  )
138
 
139
  class MDRFormulaBlockModel(MDRBasicBlockModel):
140
  block_type: Literal["FormulaBlock"] = "FormulaBlock"
141
  content: Optional[str] = None
142
- # Omit 'image' field from API response
143
 
144
  @classmethod
145
  def from_mdr_formula_block(cls, block: MDRFormulaBlock):
146
  return cls(
147
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
148
- texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts], # Captions
149
  font_size=block.font_size,
150
  content=block.content
151
  )
152
 
153
  class MDRFigureBlockModel(MDRBasicBlockModel):
154
  block_type: Literal["FigureBlock"] = "FigureBlock"
155
- # Omit 'image' field from API response
156
 
157
  @classmethod
158
  def from_mdr_figure_block(cls, block: MDRFigureBlock):
159
  return cls(
160
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
161
- texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts], # Captions
162
  font_size=block.font_size
163
  )
164
 
165
- # Union type for the response model
166
- MDRStructuredBlockModelAPI = Union[MDRTextBlockModel, MDRTableBlockModel, MDRFormulaBlockModel, MDRFigureBlockModel] # Renamed API Union type
167
 
168
  # --- FastAPI App ---
169
  app = FastAPI(
@@ -172,95 +161,184 @@ app = FastAPI(
172
  version="1.0.0"
173
  )
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  @app.on_event("startup")
176
  async def startup_event():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  if mdr_processor is None:
178
- # This prevents the app from starting if initialization failed
179
- raise RuntimeError("MagicPDFProcessor failed to initialize. Service cannot start.")
180
- print("MagicDataReadiness Service is ready.")
 
 
 
 
181
 
182
- @app.get("/health")
 
183
  async def health_check():
184
  """Simple health check endpoint."""
185
  if mdr_processor is None:
 
186
  raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Processor not initialized")
187
  return {"status": "ok", "message": "MagicPDFProcessor is running."}
188
 
189
  @app.post("/process-pdf/",
190
- response_model=List[MDRStructuredBlockModelAPI], # Use the Union type
191
  summary="Process a PDF file",
192
  description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
193
- async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
194
  """
195
  Handles PDF file upload, processing, and returns extracted blocks.
196
  """
 
 
 
 
197
  if mdr_processor is None:
 
198
  raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Processor not initialized")
199
 
200
- if not file.filename.lower().endswith(".pdf"):
 
201
  raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
202
 
203
- # Save uploaded file temporarily
204
- temp_pdf_path = "" # Initialize path
205
  try:
 
 
206
  # Create a temporary directory if it doesn't exist
207
- temp_dir = Path("./temp_uploads")
208
- temp_dir.mkdir(exist_ok=True)
209
 
210
- # Use a temporary file within the directory
211
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=temp_dir) as temp_file:
212
  content = await file.read()
213
  temp_file.write(content)
214
- temp_pdf_path = temp_file.name
215
- print(f"Received file '{file.filename}', saved temporarily to '{temp_pdf_path}'")
 
 
216
  except Exception as e:
 
217
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
218
 
219
  extracted_blocks_api: List[MDRStructuredBlockModelAPI] = []
220
- start_process_time = time.time()
221
 
222
  try:
223
- print(f"Processing '{temp_pdf_path}'...")
 
 
 
 
 
 
 
 
 
 
224
  # Process the document using the temporary file path
225
- # Note: process_document returns a generator, collect all blocks
226
- all_blocks = list(mdr_processor.process_document(pdf_input=temp_pdf_path))
227
- print(f"Extracted {len(all_blocks)} raw blocks.")
228
-
229
- # Convert internal block types to API response models
230
- for block in all_blocks:
231
- if isinstance(block, MDRTextBlock):
232
- extracted_blocks_api.append(MDRTextBlockModel.from_mdr_text_block(block))
233
- elif isinstance(block, MDRTableBlock):
234
- extracted_blocks_api.append(MDRTableBlockModel.from_mdr_table_block(block))
235
- elif isinstance(block, MDRFormulaBlock):
236
- extracted_blocks_api.append(MDRFormulaBlockModel.from_mdr_formula_block(block))
237
- elif isinstance(block, MDRFigureBlock):
238
- extracted_blocks_api.append(MDRFigureBlockModel.from_mdr_figure_block(block))
239
- # Add more elif clauses if there are other block types
240
-
241
- process_time = time.time() - start_process_time
242
- print(f"Processing finished in {process_time:.2f}s. Returning {len(extracted_blocks_api)} blocks.")
 
243
 
244
  except Exception as e:
245
- print(f"ERROR during PDF processing: {e}")
246
- traceback.print_exc()
247
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
248
  finally:
249
  # Clean up the temporary file
250
- if temp_pdf_path and os.path.exists(temp_pdf_path):
251
  try:
252
- os.remove(temp_pdf_path)
253
- print(f"Cleaned up temporary file: {temp_pdf_path}")
254
  except OSError as e:
255
- print(f"Warning: Could not remove temporary file {temp_pdf_path}: {e}")
 
 
 
256
 
257
  return extracted_blocks_api
258
 
259
- # Optional: Add root endpoint for basic info/docs link
260
- @app.get("/")
261
  async def read_root():
 
262
  return {
263
  "message": "Welcome to the MagicDataReadiness PDF Processor API!",
264
- "docs_url": "/docs",
265
- "health_url": "/health"
266
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import time
3
  import tempfile
4
  import traceback
5
+ import logging # Added
6
  from pathlib import Path
7
  from typing import List, Union, Optional, Dict, Any, Literal
8
+ import uuid # For request IDs
9
 
10
+ from fastapi import FastAPI, File, UploadFile, HTTPException, status, Request # Added Request
11
  from pydantic import BaseModel, Field
12
+ # PIL.Image is not directly used in this file after refactoring,
13
+ # and mdr_pdf_parser imports PIL itself if it needs it, so the import was dropped here.
14
 
15
  # --- IMPORTS ---
16
  from mdr_pdf_parser import (
17
  MagicPDFProcessor,
18
+ MDRStructuredBlock, # Assuming this is the base type for the others
19
  MDRTextBlock,
20
  MDRTableBlock,
21
  MDRFormulaBlock,
22
  MDRFigureBlock,
23
+ MDRTextKind, # Used by MDRTextBlockModel
24
+ MDRTableFormat, # Used by MDRTableBlockModel
25
+ MDRRectangle, # Used by MDRRectangleModel
26
+ MDRTextSpan, # Used by MDRTextSpanModel
27
  MDRExtractedTableFormat # For configuration
28
  )
29
 
30
+ # --- Logging Configuration ---
31
+ LOG_LEVEL_STR = os.environ.get("LOG_LEVEL", "INFO").upper()
32
+ LOG_LEVEL = getattr(logging, LOG_LEVEL_STR, logging.INFO)
33
+
34
+ logging.basicConfig(
35
+ level=LOG_LEVEL,
36
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
37
+ datefmt="%Y-%m-%d %H:%M:%S",
38
+ )
39
+ logger = logging.getLogger("mdr_fastapi_service")
40
+
41
  # --- Configuration ---
42
  # Read from environment variables, falling back to defaults
43
+ MODEL_DIR = os.environ.get("MDR_MODEL_DIR", "/models")
44
+ DEVICE = os.environ.get("MDR_DEVICE", "cuda")
45
+ TABLE_FORMAT_STR = os.environ.get("MDR_TABLE_FORMAT", "MARKDOWN")
46
+ DEBUG_DIR_PATH_STR = os.environ.get("MDR_DEBUG_DIR_PATH", None) # For processor's debug output
47
 
48
  # Convert table format string to Enum
49
  try:
50
  TABLE_FORMAT = MDRExtractedTableFormat[TABLE_FORMAT_STR.upper()]
51
  except KeyError:
52
+ logger.warning(f"Invalid MDR_TABLE_FORMAT '{TABLE_FORMAT_STR}'. Defaulting to DISABLE.")
53
  TABLE_FORMAT = MDRExtractedTableFormat.DISABLE
54
 
55
+ # --- Global Processor Variable ---
56
+ mdr_processor: Optional[MagicPDFProcessor] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # --- API Models (Pydantic) ---
59
+ # (Pydantic models remain largely the same, ensure they are correct and complete)
 
60
  class MDRPointModel(BaseModel):
61
  x: float
62
  y: float
 
90
  )
91
 
92
  class MDRBasicBlockModel(BaseModel):
93
+ block_type: str
94
  rect: MDRRectangleModel
95
+ texts: List[MDRTextSpanModel] = Field(default_factory=list)
96
  font_size: float
97
 
98
  class MDRTextBlockModel(MDRBasicBlockModel):
99
  block_type: Literal["TextBlock"] = "TextBlock"
100
+ kind: str
101
  has_paragraph_indentation: bool
102
  last_line_touch_end: bool
103
+ texts: List[MDRTextSpanModel]
 
104
 
105
  @classmethod
106
  def from_mdr_text_block(cls, block: MDRTextBlock):
 
108
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
109
  texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts],
110
  font_size=block.font_size,
111
+ kind=block.kind.name,
112
  has_paragraph_indentation=block.has_paragraph_indentation,
113
  last_line_touch_end=block.last_line_touch_end
114
  )
 
116
  class MDRTableBlockModel(MDRBasicBlockModel):
117
  block_type: Literal["TableBlock"] = "TableBlock"
118
  content: str
119
+ format: str
 
120
 
121
  @classmethod
122
  def from_mdr_table_block(cls, block: MDRTableBlock):
123
  return cls(
124
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
125
+ texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts],
126
  font_size=block.font_size,
127
  content=block.content,
128
+ format=block.format.name
129
  )
130
 
131
  class MDRFormulaBlockModel(MDRBasicBlockModel):
132
  block_type: Literal["FormulaBlock"] = "FormulaBlock"
133
  content: Optional[str] = None
 
134
 
135
  @classmethod
136
  def from_mdr_formula_block(cls, block: MDRFormulaBlock):
137
  return cls(
138
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
139
+ texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts],
140
  font_size=block.font_size,
141
  content=block.content
142
  )
143
 
144
  class MDRFigureBlockModel(MDRBasicBlockModel):
145
  block_type: Literal["FigureBlock"] = "FigureBlock"
 
146
 
147
  @classmethod
148
  def from_mdr_figure_block(cls, block: MDRFigureBlock):
149
  return cls(
150
  rect=MDRRectangleModel.from_mdr_rectangle(block.rect),
151
+ texts=[MDRTextSpanModel.from_mdr_text_span(span) for span in block.texts],
152
  font_size=block.font_size
153
  )
154
 
155
+ MDRStructuredBlockModelAPI = Union[MDRTextBlockModel, MDRTableBlockModel, MDRFormulaBlockModel, MDRFigureBlockModel]
 
156
 
157
  # --- FastAPI App ---
158
  app = FastAPI(
 
161
  version="1.0.0"
162
  )
163
 
164
+ # --- Helper Functions ---
165
+ def _convert_block_to_api_model(block: MDRStructuredBlock) -> Optional[MDRStructuredBlockModelAPI]:
166
+ """Converts internal MDR block to an API model."""
167
+ if isinstance(block, MDRTextBlock):
168
+ return MDRTextBlockModel.from_mdr_text_block(block)
169
+ elif isinstance(block, MDRTableBlock):
170
+ return MDRTableBlockModel.from_mdr_table_block(block)
171
+ elif isinstance(block, MDRFormulaBlock):
172
+ return MDRFormulaBlockModel.from_mdr_formula_block(block)
173
+ elif isinstance(block, MDRFigureBlock):
174
+ return MDRFigureBlockModel.from_mdr_figure_block(block)
175
+ logger.warning(f"Unknown block type encountered: {type(block)}. Skipping conversion.")
176
+ return None
177
+
178
+ # --- Application Lifecycle Events ---
179
  @app.on_event("startup")
180
  async def startup_event():
181
+ global mdr_processor
182
+ logger.info("Application startup sequence initiated.")
183
+ logger.info("--- Configuration ---")
184
+ logger.info(f" MDR_MODEL_DIR: {MODEL_DIR}")
185
+ logger.info(f" MDR_DEVICE: {DEVICE}")
186
+ logger.info(f" MDR_TABLE_FORMAT: {TABLE_FORMAT.name}")
187
+ logger.info(f" MDR_DEBUG_DIR_PATH: {DEBUG_DIR_PATH_STR if DEBUG_DIR_PATH_STR else 'Not set'}")
188
+ logger.info(f" LOG_LEVEL: {LOG_LEVEL_STR}")
189
+ logger.info("---------------------")
190
+
191
+ logger.info("Initializing MagicPDFProcessor...")
192
+ init_start_time = time.time()
193
+ try:
194
+ mdr_processor = MagicPDFProcessor(
195
+ device=DEVICE,
196
+ model_dir_path=MODEL_DIR,
197
+ extract_table_format=TABLE_FORMAT,
198
+ debug_dir_path=DEBUG_DIR_PATH_STR # Pass the actual path or None
199
+ )
200
+ init_duration = time.time() - init_start_time
201
+ logger.info(f"MagicPDFProcessor initialized successfully ({init_duration:.2f}s)")
202
+ except Exception as e:
203
+ logger.critical(f"Failed to initialize MagicPDFProcessor: {e}", exc_info=True)
204
+ # mdr_processor will remain None, startup_event_check will handle this
205
+ # No need to print traceback here, logger.critical with exc_info=True does it.
206
+
207
+ @app.on_event("startup") # Separate event to check after initialization attempt
208
+ async def startup_event_check():
209
  if mdr_processor is None:
210
+ logger.error("MagicPDFProcessor is not initialized. Service cannot function correctly.")
211
+ # Depending on deployment, you might want to exit or let it run in a degraded state.
212
+ # For now, it will allow FastAPI to start but /health and /process-pdf will fail.
213
+ # raise RuntimeError("MagicPDFProcessor failed to initialize. Service cannot start.") # This would stop FastAPI
214
+ else:
215
+ logger.info("MagicDataReadiness Service is ready and processor is available.")
216
+
217
 
218
+ # --- API Endpoints ---
219
+ @app.get("/health", summary="Health Check")
220
  async def health_check():
221
  """Simple health check endpoint."""
222
  if mdr_processor is None:
223
+ logger.warning("/health endpoint called but processor is not initialized.")
224
  raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Processor not initialized")
225
  return {"status": "ok", "message": "MagicPDFProcessor is running."}
226
 
227
  @app.post("/process-pdf/",
228
+ response_model=List[MDRStructuredBlockModelAPI],
229
  summary="Process a PDF file",
230
  description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
231
+ async def process_pdf_endpoint(request: Request, file: UploadFile = File(..., description="The PDF file to process.")):
232
  """
233
  Handles PDF file upload, processing, and returns extracted blocks.
234
  """
235
+ request_id = str(uuid.uuid4())
236
+ client_host = request.client.host if request.client else "unknown"
237
+ logger.info(f"RID-{request_id}: Received /process-pdf request from {client_host} for file: '{file.filename}' (type: {file.content_type}, size: {file.size})")
238
+
239
  if mdr_processor is None:
240
+ logger.error(f"RID-{request_id}: Processor not initialized. Cannot process request.")
241
  raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Processor not initialized")
242
 
243
+ if not file.filename or not file.filename.lower().endswith(".pdf"):
244
+ logger.warning(f"RID-{request_id}: Invalid file type uploaded: '{file.filename}'")
245
  raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
246
 
247
+ temp_pdf_path_obj = None # To ensure it's always defined for finally block
 
248
  try:
249
+ # Save uploaded file temporarily
250
+ save_start_time = time.time()
251
  # Create a temporary directory if it doesn't exist
252
+ temp_dir = Path("./temp_uploads") # Consider making this configurable
253
+ temp_dir.mkdir(parents=True, exist_ok=True)
254
 
255
+ # Use a temporary file with a unique name to avoid collisions
256
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=temp_dir, prefix=f"req_{request_id}_") as temp_file:
257
  content = await file.read()
258
  temp_file.write(content)
259
+ temp_pdf_path_obj = Path(temp_file.name)
260
+ save_duration = time.time() - save_start_time
261
+ logger.info(f"RID-{request_id}: File '{file.filename}' saved temporarily to '{temp_pdf_path_obj}' ({save_duration:.2f}s)")
262
+
263
  except Exception as e:
264
+ logger.error(f"RID-{request_id}: Failed to save uploaded file '{file.filename}': {e}", exc_info=True)
265
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
266
 
267
  extracted_blocks_api: List[MDRStructuredBlockModelAPI] = []
268
+ processing_start_time = time.time()
269
 
270
  try:
271
+ logger.info(f"RID-{request_id}: Starting PDF processing for '{temp_pdf_path_obj}'...")
272
+
273
+ # Define a progress callback for the processor
274
+ def log_progress(completed_pages: int, total_pages: int):
275
+ if total_pages > 0:
276
+ percent_done = (completed_pages / total_pages) * 100
277
+ logger.info(f"RID-{request_id}: Processor progress - Page {completed_pages}/{total_pages} ({percent_done:.1f}%) scanned.")
278
+ else:
279
+ logger.info(f"RID-{request_id}: Processor progress - Page {completed_pages} scanned (total pages unknown or zero).")
280
+
281
+
282
  # Process the document using the temporary file path
283
+ # MagicPDFProcessor.process_document returns a generator
284
+ all_blocks_internal = list(mdr_processor.process_document(
285
+ pdf_input=str(temp_pdf_path_obj),
286
+ report_progress=log_progress # Pass the progress logger
287
+ ))
288
+ collection_duration = time.time() - processing_start_time
289
+ logger.info(f"RID-{request_id}: Extracted {len(all_blocks_internal)} raw blocks from processor ({collection_duration:.2f}s).")
290
+
291
+ conversion_start_time = time.time()
292
+ for i, block_internal in enumerate(all_blocks_internal):
293
+ logger.debug(f"RID-{request_id}: Converting internal block {i+1}/{len(all_blocks_internal)} of type {type(block_internal)} to API model.")
294
+ api_block = _convert_block_to_api_model(block_internal)
295
+ if api_block:
296
+ extracted_blocks_api.append(api_block)
297
+ conversion_duration = time.time() - conversion_start_time
298
+ logger.info(f"RID-{request_id}: Converted {len(extracted_blocks_api)} blocks to API models ({conversion_duration:.2f}s).")
299
+
300
+ total_processing_duration = time.time() - processing_start_time
301
+ logger.info(f"RID-{request_id}: PDF processing finished in {total_processing_duration:.2f}s. Returning {len(extracted_blocks_api)} blocks.")
302
 
303
  except Exception as e:
304
+ logger.error(f"RID-{request_id}: Error during PDF processing for '{temp_pdf_path_obj}': {e}", exc_info=True)
305
+ # Ensure traceback is logged by logger.error with exc_info=True
306
  raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
307
  finally:
308
  # Clean up the temporary file
309
+ if temp_pdf_path_obj and temp_pdf_path_obj.exists():
310
  try:
311
+ os.remove(temp_pdf_path_obj)
312
+ logger.info(f"RID-{request_id}: Cleaned up temporary file: {temp_pdf_path_obj}")
313
  except OSError as e:
314
+ logger.warning(f"RID-{request_id}: Could not remove temporary file {temp_pdf_path_obj}: {e}")
315
+ elif temp_pdf_path_obj: # Path was set but the file is already gone (e.g. removed externally); a failed save never reaches this point
316
+ logger.info(f"RID-{request_id}: Temporary file {temp_pdf_path_obj} not found for cleanup (may have failed to save).")
317
+
318
 
319
  return extracted_blocks_api
320
 
321
+ @app.get("/", summary="Root Endpoint")
 
322
  async def read_root():
323
+ """Provides basic information about the API."""
324
  return {
325
  "message": "Welcome to the MagicDataReadiness PDF Processor API!",
326
+ "docs_url": "/docs", # FastAPI default
327
+ "redoc_url": "/redoc", # FastAPI default
328
+ "health_url": "/health",
329
+ "active_configuration": {
330
+ "model_directory": MODEL_DIR,
331
+ "target_device": DEVICE, # This is the configured device, actual might differ if fallback
332
+ "table_format": TABLE_FORMAT.name,
333
+ "log_level": LOG_LEVEL_STR,
334
+ "processor_debug_output": DEBUG_DIR_PATH_STR if DEBUG_DIR_PATH_STR else "Disabled"
335
+ }
336
+ }
337
+
338
+ # --- Main execution for local testing (optional) ---
339
+ if __name__ == "__main__":
340
+ # This block is for running the app directly with uvicorn for local development.
341
+ # It's not strictly necessary if you always run with `uvicorn main:app`.
342
+ import uvicorn
343
+ logger.info("Starting Uvicorn server for local development...")
344
+ uvicorn.run(app, host="0.0.0.0", port=8000, log_level=LOG_LEVEL_STR.lower())