AyoubChLin committed on
Commit
f8aea0d
·
verified ·
1 Parent(s): 1a272cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -10
app.py CHANGED
@@ -6,19 +6,25 @@ import os
6
  from uuid import uuid4
7
  from docling.document_converter import DocumentConverter
8
  from threading import Lock
 
 
9
 
10
  app = FastAPI()
11
 
12
- # Enable CORS for all origins
13
  app.add_middleware(
14
  CORSMiddleware,
15
- allow_origins=["*"], # Allow all origins
16
  allow_credentials=True,
17
- allow_methods=["*"], # Allow all HTTP methods
18
- allow_headers=["*"], # Allow all headers
19
  )
20
 
21
- # Singleton class for PdfConverter
 
 
 
 
22
  class PdfConverterSingleton:
23
  _instance = None
24
  _lock = Lock()
@@ -37,14 +43,18 @@ class PdfConverterSingleton:
37
 
38
  def get_text(self, pdf_path: str) -> str:
39
  result = self.converter.convert(pdf_path)
40
-
41
  return result.document.export_to_markdown()
42
 
43
- # API function to call converter
44
- def extract_text_from_pdf(pdf_path: str) -> str:
45
  return PdfConverterSingleton().get_text(pdf_path)
46
 
47
- # Endpoint to upload a file and extract markdown text
 
 
 
 
 
48
  @app.post("/extract-pdf-text")
49
  async def extract_pdf_text(file: UploadFile = File(...)):
50
  if file.content_type != "application/pdf":
@@ -55,7 +65,7 @@ async def extract_pdf_text(file: UploadFile = File(...)):
55
  with open(temp_filename, "wb") as buffer:
56
  shutil.copyfileobj(file.file, buffer)
57
 
58
- text = extract_text_from_pdf(temp_filename)
59
  return JSONResponse(content={"markdown_text": text})
60
 
61
  except Exception as e:
 
6
  from uuid import uuid4
7
  from docling.document_converter import DocumentConverter
8
  from threading import Lock
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ import asyncio
11
 
12
  app = FastAPI()
13
 
14
+ # Enable CORS for all origins, methods, and headers
15
  app.add_middleware(
16
  CORSMiddleware,
17
+ allow_origins=["*"],
18
  allow_credentials=True,
19
+ allow_methods=["*"],
20
+ allow_headers=["*"],
21
  )
22
 
23
+ # Thread pool sized to the CPU count reported by os.cpu_count()
24
+ MAX_WORKERS = os.cpu_count() or 2 # fallback to 2 if os.cpu_count() returns None
25
+ thread_pool = ThreadPoolExecutor(max_workers=MAX_WORKERS)
26
+
27
+ # Singleton class for DocumentConverter
28
  class PdfConverterSingleton:
29
  _instance = None
30
  _lock = Lock()
 
43
 
44
  def get_text(self, pdf_path: str) -> str:
45
  result = self.converter.convert(pdf_path)
 
46
  return result.document.export_to_markdown()
47
 
48
+ # Synchronous extraction function; meant to be executed inside the thread pool
49
+ def sync_extract_text(pdf_path: str) -> str:
50
  return PdfConverterSingleton().get_text(pdf_path)
51
 
52
+ # Async wrapper that offloads the blocking extraction to the thread pool
53
+ async def async_extract_text(pdf_path: str) -> str:
54
+ loop = asyncio.get_event_loop()
55
+ return await loop.run_in_executor(thread_pool, sync_extract_text, pdf_path)
56
+
57
+ # Main endpoint
58
  @app.post("/extract-pdf-text")
59
  async def extract_pdf_text(file: UploadFile = File(...)):
60
  if file.content_type != "application/pdf":
 
65
  with open(temp_filename, "wb") as buffer:
66
  shutil.copyfileobj(file.file, buffer)
67
 
68
+ text = await async_extract_text(temp_filename)
69
  return JSONResponse(content={"markdown_text": text})
70
 
71
  except Exception as e: