"""HTTP service that extracts text from uploaded PDF files using marker."""

import os
import shutil
import tempfile
from contextlib import suppress
from threading import Lock
from typing import BinaryIO
from uuid import uuid4

from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

app = FastAPI()


class PdfConverterSingleton:
    """Process-wide holder for one shared ``PdfConverter``.

    ``PdfConverterSingleton()`` always returns the same object. The
    converter (and the model dict it is built from) is created on the first
    call only and reused afterwards.
    """

    _instance = None
    _lock = Lock()  # guards creation of the shared instance
    _convert_lock = Lock()  # lets only one conversion run at a time

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                # Re-check under the lock: another thread may have created
                # the instance while this one was waiting.
                if cls._instance is None:
                    instance = super().__new__(cls)
                    # Initialize before publishing so no caller can observe
                    # an instance that lacks ``converter``.
                    instance._initialize()
                    cls._instance = instance
        return cls._instance

    def _initialize(self) -> None:
        """Build the converter; called once, from ``__new__``."""
        self.converter = PdfConverter(artifact_dict=create_model_dict())

    def get_text(self, pdf_path: str) -> str:
        """Convert the PDF at ``pdf_path`` and return its extracted text.

        Args:
            pdf_path: Filesystem path of the PDF to convert.

        Returns:
            The first element of ``text_from_rendered``'s result, as ``str``.
        """
        # Requests are handled on worker threads, so serialize access to the
        # shared converter; conversions ran strictly one at a time before.
        with self._convert_lock:
            rendered = self.converter(pdf_path)
        text, _, _ = text_from_rendered(rendered)
        return str(text)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text extracted from the PDF at ``pdf_path``.

    Thin functional wrapper around the shared ``PdfConverterSingleton``.
    """
    return PdfConverterSingleton().get_text(pdf_path)


def _save_and_convert(upload_stream: BinaryIO) -> str:
    """Spool ``upload_stream`` to a temporary PDF file and extract its text.

    The temporary file is removed whether or not the conversion succeeds.
    """
    # gettempdir() honours TMPDIR and works on non-POSIX hosts, unlike a
    # hard-coded "/tmp"; the uuid4 name keeps concurrent uploads apart.
    temp_filename = os.path.join(tempfile.gettempdir(), f"{uuid4().hex}.pdf")
    try:
        with open(temp_filename, "wb") as buffer:
            shutil.copyfileobj(upload_stream, buffer)
        return extract_text_from_pdf(temp_filename)
    finally:
        # The file may not exist if open() itself failed.
        with suppress(FileNotFoundError):
            os.remove(temp_filename)


@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Accept a PDF upload and respond with its extracted text.

    Returns:
        A JSON response of the form ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 if the upload's content type is not
            ``application/pdf``; 500 if saving or converting the file fails.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    try:
        # The copy and the conversion are blocking; run them off the event
        # loop so other requests keep being served in the meantime.
        text = await run_in_threadpool(_save_and_convert, file.file)
    except Exception as e:
        # NOTE(review): detail exposes the raw exception text to clients;
        # kept as-is for compatibility with existing consumers.
        raise HTTPException(status_code=500, detail=str(e)) from e

    return JSONResponse(content={"markdown_text": text})