"""HTTP service exposing one endpoint that runs marker's PdfConverter on an uploaded PDF."""

import contextlib
import os
import shutil
import tempfile
from threading import Lock
from uuid import uuid4

from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

app = FastAPI()

# Enable CORS for all origins.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive; confirm this is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods
    allow_headers=["*"],  # Allow all headers
)


class PdfConverterSingleton:
    """Process-wide holder for a single ``PdfConverter`` instance.

    The converter (and the model dictionary it is built from) is created once,
    on first instantiation; every later ``PdfConverterSingleton()`` call returns
    that same object.
    """

    _instance = None
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, creating and initializing it on first use."""
        # Unlocked fast path; the second check under the lock ensures only one
        # thread ever builds the instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._initialize()
                    # Publish only after initialization so other threads never
                    # observe an instance without a converter.
                    cls._instance = instance
        return cls._instance

    def _initialize(self):
        """Build the ``PdfConverter`` from a freshly created model dictionary."""
        self.converter = PdfConverter(artifact_dict=create_model_dict())

    def get_text(self, pdf_path: str) -> str:
        """Run the converter on the file at *pdf_path* and return its text.

        Args:
            pdf_path: Filesystem path of the PDF to convert.

        Returns:
            The first element returned by ``text_from_rendered``, as a ``str``.
        """
        rendered = self.converter(pdf_path)
        text, _, _ = text_from_rendered(rendered)
        return str(text)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Convert the PDF at *pdf_path* to text using the shared converter."""
    return PdfConverterSingleton().get_text(pdf_path)


@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Accept an uploaded PDF and return its extracted text as JSON.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.

    Returns:
        A ``JSONResponse`` of the form ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 if the upload is not declared as a PDF; 500 (with the
            underlying error message as detail) if saving or converting fails.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    # Use the platform's temp directory (honours TMPDIR) rather than a
    # hard-coded "/tmp"; the random hex name avoids collisions between requests.
    temp_filename = os.path.join(tempfile.gettempdir(), f"{uuid4().hex}.pdf")

    try:
        with open(temp_filename, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # NOTE(review): this call is synchronous inside an async endpoint, so the
        # event loop is blocked for the duration of the conversion. Offloading it
        # to a worker thread would need confirmation that the converter tolerates
        # concurrent calls.
        text = extract_text_from_pdf(temp_filename)
        return JSONResponse(content={"markdown_text": text})
    except Exception as e:
        # Chain the cause so the original traceback is preserved in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The file may never have been created if open() itself failed.
        with contextlib.suppress(FileNotFoundError):
            os.remove(temp_filename)