File size: 2,157 Bytes
19907be
 
7c1f442
19907be
 
 
 
 
 
3e56329
19907be
 
 
3e56329
7c1f442
 
3e56329
7c1f442
3e56329
 
7c1f442
 
3e56329
 
 
 
19907be
3e56329
 
 
 
 
 
 
 
19907be
3e56329
 
19907be
3e56329
 
 
 
19907be
3e56329
 
 
 
 
19907be
 
 
 
 
 
 
 
 
 
3e56329
 
19907be
 
 
 
 
 
3e56329
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import shutil
import tempfile
from contextlib import suppress
from threading import Lock
from uuid import uuid4

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# Application object; the route defined further down in this module is
# registered on it through its decorator.
app = FastAPI()

# Enable CORS for all origins, all HTTP methods and all request headers.
# NOTE(review): a wildcard origin list combined with allow_credentials=True is
# a very permissive policy — confirm that credentialed cross-origin requests
# are actually required; otherwise list explicit origins or drop the flag.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods
    allow_headers=["*"],  # Allow all headers
)

# Shared, lazily created wrapper around a single PdfConverter
class PdfConverterSingleton:
    """Hold one ``PdfConverter`` and hand the same wrapper to every caller.

    The first ``PdfConverterSingleton()`` call builds the instance and runs
    ``_initialize`` on it; every later call returns that same object.
    """

    # The one shared instance, or None until the first construction.
    _instance = None
    # Serialises first-time construction across threads.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, building it on first use."""
        # Fast path: already built, no need to touch the lock.
        if cls._instance is not None:
            return cls._instance

        with cls._lock:
            # Re-check under the lock: another thread may have finished
            # construction while this one was waiting.
            if cls._instance is None:
                fresh = super().__new__(cls)
                fresh._initialize()
                # Publish only after initialisation has completed.
                cls._instance = fresh
        return cls._instance

    def _initialize(self):
        """Create the converter from the dict returned by ``create_model_dict``."""
        artifacts = create_model_dict()
        self.converter = PdfConverter(artifact_dict=artifacts)

    def get_text(self, pdf_path: str) -> str:
        """Run the converter on *pdf_path* and return the extracted text.

        Only the first of the three values returned by
        ``text_from_rendered`` is used; it is coerced to ``str``.
        """
        rendered_doc = self.converter(pdf_path)
        body, _, _ = text_from_rendered(rendered_doc)
        return str(body)

# Function-style entry point to the shared converter
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text extracted from the PDF file at *pdf_path*.

    Delegates to ``PdfConverterSingleton().get_text``, so all calls go
    through the same shared converter wrapper.
    """
    shared = PdfConverterSingleton()
    return shared.get_text(pdf_path)

# Endpoint to upload a file and extract markdown text
@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Accept an uploaded PDF and respond with its extracted text.

    The upload is spooled to a temporary ``.pdf`` file, passed to
    ``extract_text_from_pdf``, and the temporary file is removed afterwards
    whether or not extraction succeeded.

    Args:
        file: The uploaded file; its declared content type must be
            ``application/pdf``.

    Returns:
        A ``JSONResponse`` whose body is ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 when the content type is not ``application/pdf``;
            500 (with the error message as detail) when writing the temporary
            file or extracting the text fails.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    temp_filename = None
    try:
        # NamedTemporaryFile picks the platform's temp directory and creates
        # the file exclusively, instead of opening a hand-built "/tmp/..."
        # path. delete=False keeps it on disk after the handle is closed so
        # the converter can reopen it by name.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as buffer:
            temp_filename = buffer.name
            shutil.copyfileobj(file.file, buffer)

        # NOTE(review): this is a synchronous call inside an async handler —
        # confirm whether it should be offloaded to a worker thread.
        text = extract_text_from_pdf(temp_filename)
        return JSONResponse(content={"markdown_text": text})

    except Exception as e:
        # Keep the original error as the explicit cause for server-side logs.
        raise HTTPException(status_code=500, detail=str(e)) from e

    finally:
        # temp_filename stays None if the temporary file was never created.
        if temp_filename is not None:
            # Attempt the removal directly; a file that is already gone is fine.
            with suppress(FileNotFoundError):
                os.remove(temp_filename)