docling-app

Sleeping

App Files Community

AyoubChLin committed 27 days ago

Commit

f8aea0d

verified ·

1 Parent(s): 1a272cd

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -10

app.py CHANGED Viewed

@@ -6,19 +6,25 @@ import os
 from uuid import uuid4
 from docling.document_converter import DocumentConverter
 from threading import Lock
 app = FastAPI()
-# Enable CORS for all origins
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins
     allow_credentials=True,
-    allow_methods=["*"],  # Allow all HTTP methods
-    allow_headers=["*"],  # Allow all headers
 )
-# Singleton class for PdfConverter
 class PdfConverterSingleton:
     _instance = None
     _lock = Lock()
@@ -37,14 +43,18 @@ class PdfConverterSingleton:
     def get_text(self, pdf_path: str) -> str:
         result = self.converter.convert(pdf_path)
         return result.document.export_to_markdown()
-# API function to call converter
-def extract_text_from_pdf(pdf_path: str) -> str:
     return PdfConverterSingleton().get_text(pdf_path)
-# Endpoint to upload a file and extract markdown text
 @app.post("/extract-pdf-text")
 async def extract_pdf_text(file: UploadFile = File(...)):
     if file.content_type != "application/pdf":
@@ -55,7 +65,7 @@ async def extract_pdf_text(file: UploadFile = File(...)):
         with open(temp_filename, "wb") as buffer:
             shutil.copyfileobj(file.file, buffer)
-        text = extract_text_from_pdf(temp_filename)
         return JSONResponse(content={"markdown_text": text})
     except Exception as e:

 from uuid import uuid4
 from docling.document_converter import DocumentConverter
 from threading import Lock
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
 app = FastAPI()
+# CORS for all
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
+# Thread pool sized according to vCPU
+MAX_WORKERS = os.cpu_count() or 2  # fallback to 2 if os.cpu_count() returns None
+thread_pool = ThreadPoolExecutor(max_workers=MAX_WORKERS)
+# Singleton class for DocumentConverter
 class PdfConverterSingleton:
     _instance = None
     _lock = Lock()
     def get_text(self, pdf_path: str) -> str:
         result = self.converter.convert(pdf_path)
         return result.document.export_to_markdown()
+# Run sync function in threadpool
+def sync_extract_text(pdf_path: str) -> str:
     return PdfConverterSingleton().get_text(pdf_path)
+# Async wrapper for thread pool
+async def async_extract_text(pdf_path: str) -> str:
+    loop = asyncio.get_event_loop()
+    return await loop.run_in_executor(thread_pool, sync_extract_text, pdf_path)
+# Main endpoint
 @app.post("/extract-pdf-text")
 async def extract_pdf_text(file: UploadFile = File(...)):
     if file.content_type != "application/pdf":
         with open(temp_filename, "wb") as buffer:
             shutil.copyfileobj(file.file, buffer)
+        text = await async_extract_text(temp_filename)
         return JSONResponse(content={"markdown_text": text})
     except Exception as e: