Spaces:

AyoubChLin
/

marker-app

Running

App Files Community

AyoubChLin committed on 9 days ago

Commit

3e56329

verified ·

1 Parent(s): 948351c

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -24

app.py CHANGED Viewed

@@ -4,58 +4,65 @@ from fastapi.middleware.cors import CORSMiddleware
 import shutil
 import os
 from uuid import uuid4
-from concurrent.futures import ThreadPoolExecutor
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
-import asyncio
 app = FastAPI()
-# CORS
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
     allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
 )
-# Initialize converter artifact once
-ARTIFACT_DICT = create_model_dict()
-# ThreadPoolExecutor: use 2 * number of vCPUs for I/O-bound tasks
-EXECUTOR = ThreadPoolExecutor(max_workers=8)  # Tune based on PDF size
-def _extract_text_sync(pdf_path: str) -> str:
-    """Blocking function to extract text from a PDF using PdfConverter."""
-    converter = PdfConverter(artifact_dict=ARTIFACT_DICT)  # fresh instance
-    rendered = converter(pdf_path)
-    text, _, _ = text_from_rendered(rendered)
-    return str(text)
-async def extract_text_from_pdf(pdf_path: str) -> str:
-    """Async wrapper for the blocking PDF extraction."""
-    loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(EXECUTOR, _extract_text_sync, pdf_path)
 @app.post("/extract-pdf-text")
 async def extract_pdf_text(file: UploadFile = File(...)):
     if file.content_type != "application/pdf":
         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
     temp_filename = f"/tmp/{uuid4().hex}.pdf"
     try:
         with open(temp_filename, "wb") as buffer:
             shutil.copyfileobj(file.file, buffer)
-        markdown_text = await extract_text_from_pdf(temp_filename)
-        return JSONResponse(content={"markdown_text": markdown_text})
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
     finally:
         if os.path.exists(temp_filename):
-            os.remove(temp_filename)

 import shutil
 import os
 from uuid import uuid4
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
+from threading import Lock
 app = FastAPI()
+# Enable CORS for all origins
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],  # Allow all origins
     allow_credentials=True,
+    allow_methods=["*"],  # Allow all HTTP methods
+    allow_headers=["*"],  # Allow all headers
 )
+# Singleton class for PdfConverter
+class PdfConverterSingleton:
+    _instance = None
+    _lock = Lock()
+    def __new__(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    instance = super().__new__(cls)
+                    instance._initialize()
+                    cls._instance = instance
+        return cls._instance
+    def _initialize(self):
+        self.converter = PdfConverter(artifact_dict=create_model_dict())
+    def get_text(self, pdf_path: str) -> str:
+        rendered = self.converter(pdf_path)
+        text, _, _ = text_from_rendered(rendered)
+        return str(text)
+# API function to call converter
+def extract_text_from_pdf(pdf_path: str) -> str:
+    return PdfConverterSingleton().get_text(pdf_path)
+# Endpoint to upload a file and extract markdown text
 @app.post("/extract-pdf-text")
 async def extract_pdf_text(file: UploadFile = File(...)):
     if file.content_type != "application/pdf":
         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
     temp_filename = f"/tmp/{uuid4().hex}.pdf"
     try:
         with open(temp_filename, "wb") as buffer:
             shutil.copyfileobj(file.file, buffer)
+        text = extract_text_from_pdf(temp_filename)
+        return JSONResponse(content={"markdown_text": text})
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
     finally:
         if os.path.exists(temp_filename):
+            os.remove(temp_filename)