AyoubChLin committed on
Commit
3e56329
·
verified ·
1 Parent(s): 948351c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -24
app.py CHANGED
@@ -4,58 +4,65 @@ from fastapi.middleware.cors import CORSMiddleware
4
  import shutil
5
  import os
6
  from uuid import uuid4
7
- from concurrent.futures import ThreadPoolExecutor
8
  from marker.converters.pdf import PdfConverter
9
  from marker.models import create_model_dict
10
  from marker.output import text_from_rendered
11
- import asyncio
12
 
13
  app = FastAPI()
14
 
15
- # CORS
16
  app.add_middleware(
17
  CORSMiddleware,
18
- allow_origins=["*"],
19
  allow_credentials=True,
20
- allow_methods=["*"],
21
- allow_headers=["*"],
22
  )
23
 
24
- # Initialize converter artifact once
25
- ARTIFACT_DICT = create_model_dict()
 
 
26
 
27
- # ThreadPoolExecutor: use 2 * number of vCPUs for I/O-bound tasks
28
- EXECUTOR = ThreadPoolExecutor(max_workers=8) # Tune based on PDF size
 
 
 
 
 
 
29
 
30
def _extract_text_sync(pdf_path: str) -> str:
    """Convert the PDF at ``pdf_path`` and return its rendered text.

    This call is synchronous. It builds a new ``PdfConverter`` around the
    module-level ``ARTIFACT_DICT``, runs it on the file, and returns the
    first item produced by ``text_from_rendered`` coerced to ``str``.
    """
    pdf_converter = PdfConverter(artifact_dict=ARTIFACT_DICT)
    # text_from_rendered yields three values; only the first one is used.
    extracted, _, _ = text_from_rendered(pdf_converter(pdf_path))
    return str(extracted)
36
 
37
async def extract_text_from_pdf(pdf_path: str) -> str:
    """Run the blocking extraction on ``EXECUTOR`` and await its result.

    The conversion itself happens in ``_extract_text_sync``; this coroutine
    only schedules it on the executor so the caller can ``await`` the text.
    """
    pending = asyncio.get_running_loop().run_in_executor(
        EXECUTOR, _extract_text_sync, pdf_path
    )
    return await pending
41
 
 
 
 
 
 
42
@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Accept an uploaded PDF and respond with its extracted text.

    The upload is rejected with HTTP 400 unless its content type is
    ``application/pdf``. Otherwise it is written to a uniquely named file
    under ``/tmp``, converted, and the text is returned as
    ``{"markdown_text": ...}``. Any exception raised while saving or
    converting is reported as HTTP 500 carrying the exception message.
    The temporary file is removed on every path.
    """
    is_pdf = file.content_type == "application/pdf"
    if not is_pdf:
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    scratch_path = f"/tmp/{uuid4().hex}.pdf"
    try:
        # Persist the upload so the converter can read it from disk.
        with open(scratch_path, "wb") as sink:
            shutil.copyfileobj(file.file, sink)

        payload = {"markdown_text": await extract_text_from_pdf(scratch_path)}
        return JSONResponse(content=payload)
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err))
    finally:
        # Clean up whether the request succeeded or failed.
        if os.path.exists(scratch_path):
            os.remove(scratch_path)
 
4
  import shutil
5
  import os
6
  from uuid import uuid4
 
7
  from marker.converters.pdf import PdfConverter
8
  from marker.models import create_model_dict
9
  from marker.output import text_from_rendered
10
+ from threading import Lock
11
 
12
  app = FastAPI()
13
 
14
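+ # Enable CORS for all origins.
+ # NOTE(review): FastAPI's CORS docs state that allow_origins/allow_methods/allow_headers must not be ["*"] when allow_credentials=True; list explicit origins if credentialed requests are required.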
15
  app.add_middleware(
16
  CORSMiddleware,
17
+ allow_origins=["*"], # Allow all origins
18
  allow_credentials=True,
19
+ allow_methods=["*"], # Allow all HTTP methods
20
+ allow_headers=["*"], # Allow all headers
21
  )
22
 
23
+ # Singleton class for PdfConverter
24
class PdfConverterSingleton:
    """Holder for one shared ``PdfConverter`` per process.

    Calling ``PdfConverterSingleton()`` always returns the same object. The
    first call builds the converter (via ``_initialize``); later calls reuse
    it.
    """

    # The shared instance; stays ``None`` until the first construction.
    _instance = None
    # Guards first-time construction so only one thread builds the converter.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, creating it on first use."""
        existing = cls._instance
        if existing is not None:
            return existing

        with cls._lock:
            # Check again under the lock: another thread may have finished
            # building the instance while this one was waiting.
            if cls._instance is None:
                created = super().__new__(cls)
                created._initialize()
                cls._instance = created
        return cls._instance

    def _initialize(self):
        """Build the converter from a freshly created model dictionary."""
        models = create_model_dict()
        self.converter = PdfConverter(artifact_dict=models)

    def get_text(self, pdf_path: str) -> str:
        """Convert the PDF at ``pdf_path`` and return its text as ``str``."""
        # text_from_rendered yields three values; only the first one is used.
        extracted, _, _ = text_from_rendered(self.converter(pdf_path))
        return str(extracted)
44
 
45
+ # API function to call converter
46
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of the PDF at ``pdf_path`` using the shared converter."""
    shared = PdfConverterSingleton()
    return shared.get_text(pdf_path)
48
+
49
+ # Endpoint to upload a file and extract markdown text
50
# Serializes conversions on the shared converter. The handler used to run the
# conversion inline in the coroutine, so at most one ran at a time; the lock
# keeps that property now that the work happens on worker threads.
_CONVERSION_LOCK = Lock()


def _save_and_convert(upload_stream, pdf_path: str) -> str:
    """Blocking helper: write the upload to ``pdf_path`` and convert it."""
    with open(pdf_path, "wb") as buffer:
        shutil.copyfileobj(upload_stream, buffer)
    with _CONVERSION_LOCK:
        return extract_text_from_pdf(pdf_path)


@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Accept an uploaded PDF and respond with its extracted text.

    Args:
        file: The uploaded file; its content type must be
            ``application/pdf``.

    Returns:
        A ``JSONResponse`` of the form ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 when the upload is not declared as a PDF; 500
            (carrying the underlying error message) when saving or
            converting the file fails.
    """
    # Imported here so the block does not depend on a module-level import.
    import asyncio

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    temp_filename = f"/tmp/{uuid4().hex}.pdf"
    try:
        # The file copy and the conversion are blocking calls. Running them
        # directly in this coroutine would stall the event loop (and every
        # other request) until the conversion finished, so hand them to the
        # loop's default thread pool and await the result instead.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(
            None, _save_and_convert, file.file, temp_filename
        )
        return JSONResponse(content={"markdown_text": text})
    except Exception as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Remove the temporary file on success and on failure alike.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)