Spaces:
Running
Running
File size: 2,157 Bytes
19907be 7c1f442 19907be 3e56329 19907be 3e56329 7c1f442 3e56329 7c1f442 3e56329 7c1f442 3e56329 19907be 3e56329 19907be 3e56329 19907be 3e56329 19907be 3e56329 19907be 3e56329 19907be 3e56329 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import contextlib
import os
import shutil
import tempfile
from threading import Lock
from uuid import uuid4

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
app = FastAPI()

# Enable CORS for all origins.
# Credentials stay disabled on purpose: the FastAPI CORS documentation states
# that allow_origins/allow_methods/allow_headers cannot be ["*"] when
# allow_credentials is True, and the endpoint visible in this file reads no
# cookies or auth headers. To support credentialed requests, list the trusted
# origins explicitly instead of using the wildcard.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allow all HTTP methods
    allow_headers=["*"],  # Allow all headers
)
# Process-wide holder for a single PdfConverter
class PdfConverterSingleton:
    """Hand out one shared instance that owns a ``PdfConverter``.

    Every call to ``PdfConverterSingleton()`` returns the same object. The
    converter is built the first time an instance is requested.
    """

    # The one shared instance; None until the first construction.
    _instance = None
    # Guards the first construction so only one thread builds the instance.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, building it on first use."""
        # Common path: the instance already exists, no locking needed.
        if cls._instance is not None:
            return cls._instance

        with cls._lock:
            # Re-check while holding the lock: another thread may have
            # finished construction while this one was waiting.
            if cls._instance is None:
                fresh = super().__new__(cls)
                fresh._initialize()
                # Publish only after _initialize() has completed.
                cls._instance = fresh
        return cls._instance

    def _initialize(self):
        """Build the converter from the dict returned by ``create_model_dict()``."""
        artifacts = create_model_dict()
        self.converter = PdfConverter(artifact_dict=artifacts)

    def get_text(self, pdf_path: str) -> str:
        """Run the converter on ``pdf_path`` and return the extracted text.

        Args:
            pdf_path: Path of the PDF file handed to the converter.

        Returns:
            The first element of ``text_from_rendered``'s three-item result,
            converted with ``str``.
        """
        rendered_doc = self.converter(pdf_path)
        extracted, _, _ = text_from_rendered(rendered_doc)
        return str(extracted)
# Module-level entry point for PDF text extraction
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text extracted from the PDF at ``pdf_path``.

    Delegates to ``PdfConverterSingleton.get_text`` on the shared instance.
    """
    shared_converter = PdfConverterSingleton()
    return shared_converter.get_text(pdf_path)
# Endpoint to upload a file and extract markdown text
@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Save an uploaded PDF to a temporary file and return its extracted text.

    Args:
        file: The uploaded file; its declared content type must be
            ``application/pdf``.

    Returns:
        A ``JSONResponse`` whose body is ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 when the content type is not ``application/pdf``;
            500 (with the original error message as detail) when saving or
            converting the file fails.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    temp_filename = None
    try:
        # delete=False so the file survives the `with` block and can be
        # reopened by path during conversion; the `finally` clause below is
        # responsible for removing it. Using tempfile keeps the location
        # portable instead of assuming "/tmp" exists.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as buffer:
            temp_filename = buffer.name
            shutil.copyfileobj(file.file, buffer)

        # NOTE(review): this is a synchronous call made directly from the
        # coroutine, so the event loop is blocked while it runs. Moving it to
        # a worker thread would let conversions overlap — confirm the
        # converter is safe to call concurrently before changing this.
        text = extract_text_from_pdf(temp_filename)
        return JSONResponse(content={"markdown_text": text})
    except Exception as e:
        # Keep the original cause attached for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # temp_filename is still None if the temp file could not be created.
        if temp_filename is not None:
            with contextlib.suppress(FileNotFoundError):
                os.remove(temp_filename)