AyoubChLin committed on
Commit
948351c
·
verified ·
1 Parent(s): 7c1f442

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -30
app.py CHANGED
@@ -4,61 +4,54 @@ from fastapi.middleware.cors import CORSMiddleware
4
  import shutil
5
  import os
6
  from uuid import uuid4
 
7
  from marker.converters.pdf import PdfConverter
8
  from marker.models import create_model_dict
9
  from marker.output import text_from_rendered
10
- from threading import Lock
11
 
12
  app = FastAPI()
13
 
14
- # Enable CORS for all origins
15
  app.add_middleware(
16
  CORSMiddleware,
17
- allow_origins=["*"], # Allow all origins
18
  allow_credentials=True,
19
- allow_methods=["*"], # Allow all HTTP methods
20
- allow_headers=["*"], # Allow all headers
21
  )
22
 
23
- # Singleton class for PdfConverter
24
class PdfConverterSingleton:
    """Holder for one shared ``PdfConverter`` instance.

    Every ``PdfConverterSingleton()`` call returns the same object. The
    converter is built the first time the class is instantiated.
    """

    _instance = None
    _lock = Lock()

    def __new__(cls):
        # Fast path: once the shared object exists, hand it back without locking.
        if cls._instance is not None:
            return cls._instance

        with cls._lock:
            # Re-check under the lock; another thread may have created it
            # between the first check and acquiring the lock.
            if cls._instance is None:
                created = super().__new__(cls)
                created._initialize()
                cls._instance = created
        return cls._instance

    def _initialize(self):
        """Build the converter from a freshly created model dictionary."""
        self.converter = PdfConverter(artifact_dict=create_model_dict())

    def get_text(self, pdf_path: str) -> str:
        """Run the converter on ``pdf_path`` and return the text as ``str``."""
        text, _, _ = text_from_rendered(self.converter(pdf_path))
        return str(text)
44
 
45
- # API function to call converter
46
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text extracted from the PDF at ``pdf_path``.

    Delegates to the shared ``PdfConverterSingleton`` instance.
    """
    shared = PdfConverterSingleton()
    return shared.get_text(pdf_path)
48
-
49
- # Endpoint to upload a file and extract markdown text
50
  @app.post("/extract-pdf-text")
51
  async def extract_pdf_text(file: UploadFile = File(...)):
52
  if file.content_type != "application/pdf":
53
  raise HTTPException(status_code=400, detail="Only PDF files are supported.")
54
 
55
  temp_filename = f"/tmp/{uuid4().hex}.pdf"
 
56
  try:
57
  with open(temp_filename, "wb") as buffer:
58
  shutil.copyfileobj(file.file, buffer)
59
 
60
- text = extract_text_from_pdf(temp_filename)
61
- return JSONResponse(content={"markdown_text": text})
62
 
63
  except Exception as e:
64
  raise HTTPException(status_code=500, detail=str(e))
 
4
  import shutil
5
  import os
6
  from uuid import uuid4
7
+ from concurrent.futures import ThreadPoolExecutor
8
  from marker.converters.pdf import PdfConverter
9
  from marker.models import create_model_dict
10
  from marker.output import text_from_rendered
11
+ import asyncio
12
 
13
  app = FastAPI()
14
 
15
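+ # CORS: allow every origin, method and header. NOTE(review): the CORS spec does not permit a wildcard origin on credentialed requests, so confirm that allow_credentials=True is intended.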
16
  app.add_middleware(
17
  CORSMiddleware,
18
+ allow_origins=["*"],
19
  allow_credentials=True,
20
+ allow_methods=["*"],
21
+ allow_headers=["*"],
22
  )
23
 
24
+ # Initialize converter artifact once
25
+ ARTIFACT_DICT = create_model_dict()
 
 
26
 
27
+ # ThreadPoolExecutor for the blocking PDF conversions: fixed at 8 workers here (rule of thumb: about 2 * number of vCPUs for I/O-bound tasks)
28
+ EXECUTOR = ThreadPoolExecutor(max_workers=8) # Tune based on PDF size
 
 
 
 
 
 
29
 
30
def _extract_text_sync(pdf_path: str) -> str:
    """Convert the PDF at ``pdf_path`` and return its extracted text (blocking).

    A new ``PdfConverter`` is built on every call from the module-level
    ``ARTIFACT_DICT``; the rendered result is passed to
    ``text_from_rendered`` and the first of its three return values is
    returned as ``str``.
    """
    pdf_converter = PdfConverter(artifact_dict=ARTIFACT_DICT)  # one per call
    extracted, _, _ = text_from_rendered(pdf_converter(pdf_path))
    return str(extracted)
36
 
37
async def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from the PDF at ``pdf_path`` without blocking the event loop.

    Submits ``_extract_text_sync`` to the module-level ``EXECUTOR`` through
    the running loop's ``run_in_executor`` and awaits its result.
    """
    pending = asyncio.get_running_loop().run_in_executor(
        EXECUTOR, _extract_text_sync, pdf_path
    )
    return await pending
41
 
 
 
 
 
 
42
  @app.post("/extract-pdf-text")
43
  async def extract_pdf_text(file: UploadFile = File(...)):
44
  if file.content_type != "application/pdf":
45
  raise HTTPException(status_code=400, detail="Only PDF files are supported.")
46
 
47
  temp_filename = f"/tmp/{uuid4().hex}.pdf"
48
+
49
  try:
50
  with open(temp_filename, "wb") as buffer:
51
  shutil.copyfileobj(file.file, buffer)
52
 
53
+ markdown_text = await extract_text_from_pdf(temp_filename)
54
+ return JSONResponse(content={"markdown_text": markdown_text})
55
 
56
  except Exception as e:
57
  raise HTTPException(status_code=500, detail=str(e))