AyoubChLin committed on
Commit
19907be
·
verified ·
1 Parent(s): f3672be
Files changed (3) hide show
  1. Dockerfile +16 -0
  2. app.py +58 -0
  3. requirements.txt +2 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.10
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ import shutil
4
+ import os
5
+ from uuid import uuid4
6
+ from marker.converters.pdf import PdfConverter
7
+ from marker.models import create_model_dict
8
+ from marker.output import text_from_rendered
9
+ from threading import Lock
10
+
11
+ app = FastAPI()
12
+
13
+ # Singleton class for PdfConverter
14
class PdfConverterSingleton:
    """Process-wide holder for a single, lazily built ``PdfConverter``.

    Calling ``PdfConverterSingleton()`` always yields the same object. The
    converter is constructed on the first call only, and construction is
    guarded by a class-level lock so that simultaneous first calls from
    several threads build it once.
    """

    # The one shared instance; stays None until the first construction.
    _instance = None
    # Guards the first-time construction of ``_instance``.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, building it on first use."""
        existing = cls._instance
        if existing is not None:
            # Fast path: already built, no locking needed.
            return existing

        with cls._lock:
            # Re-check under the lock: another thread may have finished
            # building the instance while this one was waiting.
            if cls._instance is None:
                created = super().__new__(cls)
                created._initialize()
                # Publish only after initialisation has completed.
                cls._instance = created
            return cls._instance

    def _initialize(self):
        """Build the converter from the dictionary returned by ``create_model_dict()``."""
        artifacts = create_model_dict()
        self.converter = PdfConverter(artifact_dict=artifacts)

    def get_text(self, pdf_path: str) -> str:
        """Convert the PDF at ``pdf_path`` and return the extracted text.

        Args:
            pdf_path: Filesystem path of the PDF handed to the converter.

        Returns:
            The first element of ``text_from_rendered``'s result, coerced
            to ``str``.
        """
        rendered_output = self.converter(pdf_path)
        # text_from_rendered yields a 3-tuple; only its first item is used.
        extracted = text_from_rendered(rendered_output)[0]
        return str(extracted)
34
+
35
+ # API function to call converter
36
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text extracted from the PDF stored at ``pdf_path``.

    Delegates to the shared ``PdfConverterSingleton`` instance, which is
    created on the first call.
    """
    shared_converter = PdfConverterSingleton()
    return shared_converter.get_text(pdf_path)
38
+
39
+ # Endpoint to upload a file and extract markdown text
40
# Serialises conversions. The handler used to run them inline on the event
# loop, i.e. strictly one at a time; whether the shared converter tolerates
# concurrent calls is not established in this file, so that ordering is kept
# now that the work runs on worker threads.
_CONVERSION_LOCK = Lock()


def _convert_upload_to_text(upload_stream) -> str:
    """Spool an uploaded PDF stream to a temporary file and extract its text.

    Runs entirely with blocking calls and is meant to be executed on a
    worker thread, never directly on the event loop.

    Args:
        upload_stream: Binary file-like object holding the uploaded PDF.

    Returns:
        The text produced by ``extract_text_from_pdf`` for the spooled file.
    """
    temp_filename = f"/tmp/{uuid4().hex}.pdf"
    try:
        with open(temp_filename, "wb") as buffer:
            shutil.copyfileobj(upload_stream, buffer)
        with _CONVERSION_LOCK:
            return extract_text_from_pdf(temp_filename)
    finally:
        # EAFP: remove directly rather than exists()+remove(), which can
        # race; the file is absent only if open() itself failed.
        try:
            os.remove(temp_filename)
        except FileNotFoundError:
            pass


@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Accept a PDF upload and return its extracted text as JSON.

    Args:
        file: The uploaded file; its declared content type must be exactly
            ``application/pdf``.

    Returns:
        A ``JSONResponse`` of the form ``{"markdown_text": <text>}``.

    Raises:
        HTTPException: 400 when the upload is not declared as a PDF; 500
            (with the error message as detail) when spooling or conversion
            fails.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    try:
        # The copy and the conversion are blocking and can be long-running;
        # executing them on a worker thread keeps the event loop free to
        # serve other requests in the meantime.
        text = await asyncio.to_thread(_convert_upload_to_text, file.file)
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e

    return JSONResponse(content={"markdown_text": text})
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fastapi[standard]
2
+ marker-pdf