Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,19 +6,25 @@ import os
|
|
6 |
from uuid import uuid4
|
7 |
from docling.document_converter import DocumentConverter
|
8 |
from threading import Lock
|
|
|
|
|
9 |
|
10 |
app = FastAPI()
|
11 |
|
12 |
-
#
|
13 |
app.add_middleware(
|
14 |
CORSMiddleware,
|
15 |
-
allow_origins=["*"],
|
16 |
allow_credentials=True,
|
17 |
-
allow_methods=["*"],
|
18 |
-
allow_headers=["*"],
|
19 |
)
|
20 |
|
21 |
-
#
|
|
|
|
|
|
|
|
|
22 |
class PdfConverterSingleton:
|
23 |
_instance = None
|
24 |
_lock = Lock()
|
@@ -37,14 +43,18 @@ class PdfConverterSingleton:
|
|
37 |
|
38 |
def get_text(self, pdf_path: str) -> str:
|
39 |
result = self.converter.convert(pdf_path)
|
40 |
-
|
41 |
return result.document.export_to_markdown()
|
42 |
|
43 |
-
#
|
44 |
-
def
|
45 |
return PdfConverterSingleton().get_text(pdf_path)
|
46 |
|
47 |
-
#
|
|
|
|
|
|
|
|
|
|
|
48 |
@app.post("/extract-pdf-text")
|
49 |
async def extract_pdf_text(file: UploadFile = File(...)):
|
50 |
if file.content_type != "application/pdf":
|
@@ -55,7 +65,7 @@ async def extract_pdf_text(file: UploadFile = File(...)):
|
|
55 |
with open(temp_filename, "wb") as buffer:
|
56 |
shutil.copyfileobj(file.file, buffer)
|
57 |
|
58 |
-
text =
|
59 |
return JSONResponse(content={"markdown_text": text})
|
60 |
|
61 |
except Exception as e:
|
|
|
6 |
from uuid import uuid4
|
7 |
from docling.document_converter import DocumentConverter
|
8 |
from threading import Lock
|
9 |
+
from concurrent.futures import ThreadPoolExecutor
|
10 |
+
import asyncio
|
11 |
|
12 |
app = FastAPI()
|
13 |
|
14 |
+
# CORS: accept any origin, HTTP method, and header, with credentials allowed
|
15 |
app.add_middleware(
|
16 |
CORSMiddleware,
|
17 |
+
allow_origins=["*"],
|
18 |
allow_credentials=True,
|
19 |
+
allow_methods=["*"],
|
20 |
+
allow_headers=["*"],
|
21 |
)
|
22 |
|
23 |
+
# Thread pool sized to the CPU count reported by os.cpu_count()
|
24 |
+
MAX_WORKERS = os.cpu_count() or 2 # fallback to 2 if os.cpu_count() returns None
|
25 |
+
thread_pool = ThreadPoolExecutor(max_workers=MAX_WORKERS)
|
26 |
+
|
27 |
+
# Singleton class for DocumentConverter
|
28 |
class PdfConverterSingleton:
|
29 |
_instance = None
|
30 |
_lock = Lock()
|
|
|
43 |
|
44 |
def get_text(self, pdf_path: str) -> str:
    """Convert the PDF at ``pdf_path`` and return its content as Markdown.

    Args:
        pdf_path: Path of the PDF file handed to ``self.converter.convert``.

    Returns:
        The string produced by ``export_to_markdown()`` on the converted
        document.
    """
    conversion = self.converter.convert(pdf_path)
    document = conversion.document
    return document.export_to_markdown()
|
47 |
|
48 |
+
# Blocking extraction helper; it is submitted to the thread pool by the async wrapper
|
49 |
+
def sync_extract_text(pdf_path: str) -> str:
    """Return the Markdown text extracted from the PDF at ``pdf_path``.

    This is a blocking call: it obtains a converter through
    ``PdfConverterSingleton()`` and delegates to its ``get_text`` method.
    """
    converter = PdfConverterSingleton()
    return converter.get_text(pdf_path)
|
51 |
|
52 |
+
# Async wrapper for thread pool
|
53 |
+
async def async_extract_text(pdf_path: str) -> str:
    """Extract Markdown text from a PDF without blocking the event loop.

    The blocking ``sync_extract_text`` call is submitted to the module-level
    ``thread_pool`` executor and its result is awaited.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown string returned by ``sync_extract_text``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # documented way to obtain it and never creates a new loop implicitly,
    # unlike the legacy get_event_loop().
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(thread_pool, sync_extract_text, pdf_path)
|
56 |
+
|
57 |
+
# Main endpoint
|
58 |
@app.post("/extract-pdf-text")
|
59 |
async def extract_pdf_text(file: UploadFile = File(...)):
|
60 |
if file.content_type != "application/pdf":
|
|
|
65 |
with open(temp_filename, "wb") as buffer:
|
66 |
shutil.copyfileobj(file.file, buffer)
|
67 |
|
68 |
+
text = await async_extract_text(temp_filename)
|
69 |
return JSONResponse(content={"markdown_text": text})
|
70 |
|
71 |
except Exception as e:
|