Update Dockerfile
Browse files- Dockerfile +21 -68
Dockerfile
CHANGED
@@ -6,97 +6,50 @@ ENV PYTHONUNBUFFERED=1 \
|
|
6 |
MDR_DEVICE=cpu \
|
7 |
MDR_TABLE_FORMAT=MARKDOWN \
|
8 |
LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
|
9 |
-
#
|
10 |
-
HF_HOME=/app/.cache/huggingface \
|
11 |
-
# Cache/config dirs for libraries
|
12 |
MPLCONFIGDIR=/app/.cache/matplotlib \
|
13 |
YOLO_CONFIG_DIR=/app/.config/Ultralytics
|
|
|
14 |
|
15 |
# Set the working directory in the container
|
16 |
WORKDIR /app
|
17 |
|
18 |
-
# Install system dependencies
|
19 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
20 |
libgl1-mesa-glx \
|
21 |
libglib2.0-0 \
|
22 |
-
libsm6 \
|
23 |
libxext6 \
|
24 |
libxrender-dev \
|
25 |
libfreetype6-dev \
|
26 |
-
|
27 |
-
|
28 |
&& rm -rf /var/lib/apt/lists/*
|
29 |
|
30 |
-
# Copy requirements
|
31 |
COPY requirements.txt .
|
32 |
|
33 |
-
# Install Python dependencies
|
34 |
-
|
35 |
RUN pip install --no-cache-dir --upgrade pip && \
|
36 |
-
pip install --no-cache-dir huggingface-hub && \
|
37 |
pip install --no-cache-dir -r requirements.txt
|
38 |
|
39 |
-
#
|
40 |
-
# Create directories needed for download AND runtime
|
41 |
-
RUN mkdir -p ${MDR_MODEL_DIR}/onnx_ocr \
|
42 |
-
${MDR_MODEL_DIR}/struct_eqtable \
|
43 |
-
${MDR_MODEL_DIR}/latex \
|
44 |
-
${MDR_MODEL_DIR}/layoutreader \
|
45 |
-
${MDR_MODEL_DIR}/yolo_hf_cache \
|
46 |
-
/app/temp_uploads \
|
47 |
-
${HF_HOME} \
|
48 |
-
${MPLCONFIGDIR} \
|
49 |
-
${YOLO_CONFIG_DIR} && \
|
50 |
-
# Set permissions broadly - adjust if needed, but often required on Spaces
|
51 |
-
chmod -R 777 ${MDR_MODEL_DIR} /app
|
52 |
-
|
53 |
-
# Download ONNX OCR Models (using requests/curl/wget or a helper script)
|
54 |
-
# Option 1: Direct download (if URLs are stable)
|
55 |
-
RUN curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/det/det.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/det.onnx" && \
|
56 |
-
curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/cls/cls.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/cls.onnx" && \
|
57 |
-
curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/rec/rec.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/rec.onnx" && \
|
58 |
-
curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ch_ppocr_server_v2.0/ppocr_keys_v1.txt" -o "${MDR_MODEL_DIR}/onnx_ocr/ppocr_keys_v1.txt"
|
59 |
-
|
60 |
-
# Download YOLO Model (using huggingface-cli)
|
61 |
-
RUN huggingface-cli download \
|
62 |
-
juliozhao/DocLayout-YOLO-DocStructBench \
|
63 |
-
doclayout_yolo_docstructbench_imgsz1024.pt \
|
64 |
-
--local-dir ${MDR_MODEL_DIR}/yolo_hf_cache \
|
65 |
-
--local-dir-use-symlinks False
|
66 |
-
|
67 |
-
# Download LaTeX Models (using requests/curl/wget or helper script)
|
68 |
-
RUN curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/weights.pth" -o "${MDR_MODEL_DIR}/latex/weights.pth" && \
|
69 |
-
curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/image_resizer.pth" -o "${MDR_MODEL_DIR}/latex/image_resizer.pth"
|
70 |
-
# Note: config.yaml for LaTeX might need to be created or copied if required by the library
|
71 |
-
|
72 |
-
# Download LayoutReader Model (using huggingface-cli)
|
73 |
-
# Assuming LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", ...)
|
74 |
-
# will handle caching correctly if HF_HOME is set and writable.
|
75 |
-
# If it uses a different mechanism or specific files are needed, download them explicitly.
|
76 |
-
# For LayoutReader using Hantian/layoutreader:
|
77 |
-
RUN huggingface-cli download \
|
78 |
-
Hantian/layoutreader \
|
79 |
-
--local-dir ${MDR_MODEL_DIR}/layoutreader \
|
80 |
-
--local-dir-use-symlinks False
|
81 |
-
|
82 |
-
# Download StructTable Model (using huggingface-cli)
|
83 |
-
# Assuming build_model(model_ckpt="U4R/StructTable-InternVL2-1B", ...)
|
84 |
-
RUN huggingface-cli download \
|
85 |
-
U4R/StructTable-InternVL2-1B \
|
86 |
-
--local-dir ${MDR_MODEL_DIR}/struct_eqtable \
|
87 |
-
--local-dir-use-symlinks False
|
88 |
-
|
89 |
-
# --- END PRE-DOWNLOAD MODELS ---
|
90 |
-
|
91 |
-
# Copy the application code AFTER downloads
|
92 |
COPY mdr_pdf_parser.py .
|
93 |
COPY main.py .
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
# Expose the port the app runs on
|
96 |
EXPOSE 8000
|
97 |
|
98 |
-
|
99 |
-
|
100 |
|
101 |
-
#
|
102 |
-
|
|
|
6 |
MDR_DEVICE=cpu \
|
7 |
MDR_TABLE_FORMAT=MARKDOWN \
|
8 |
LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
|
9 |
+
# --- ADDED: Point config/cache dirs to writable locations within /app ---
|
|
|
|
|
10 |
MPLCONFIGDIR=/app/.cache/matplotlib \
|
11 |
YOLO_CONFIG_DIR=/app/.config/Ultralytics
|
12 |
+
# --- END ADDED ---
|
13 |
|
14 |
# Set the working directory in the container
|
15 |
WORKDIR /app
|
16 |
|
17 |
+
# Install system libraries required by OpenCV and possibly by other dependencies
|
18 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
19 |
libgl1-mesa-glx \
|
20 |
libglib2.0-0 \
|
|
|
21 |
libxext6 \
|
22 |
libxrender-dev \
|
23 |
libfreetype6-dev \
|
24 |
+
|
25 |
+
|
26 |
&& rm -rf /var/lib/apt/lists/*
|
27 |
|
28 |
+
# Copy the requirements file into the container
|
29 |
COPY requirements.txt .
|
30 |
|
31 |
+
# Install Python dependencies
|
32 |
+
|
33 |
RUN pip install --no-cache-dir --upgrade pip && \
|
|
|
34 |
pip install --no-cache-dir -r requirements.txt
|
35 |
|
36 |
+
# Copy the application code into the container
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
COPY mdr_pdf_parser.py .
|
38 |
COPY main.py .
|
39 |
|
40 |
+
# Create the default model directory, temp directory, AND the config/cache dirs
|
41 |
+
# --- MODIFIED: Added creation of the new cache/config dirs ---
|
42 |
+
RUN mkdir -p ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache/matplotlib /app/.config/Ultralytics && \
|
43 |
+
chmod -R 777 ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache /app/.config
|
44 |
+
# Note: chmod 777 is very permissive, but it is often the simplest way to make directories writable in restricted environments like HF Spaces.
|
45 |
+
# It ensures the directories are writable by the user running the application.
|
46 |
+
# --- END MODIFIED ---
|
47 |
+
|
48 |
# Expose the port the app runs on
|
49 |
EXPOSE 8000
|
50 |
|
51 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
52 |
+
|
53 |
|
54 |
+
# Declare the model directory as a volume so a host directory or named volume can be mounted there to persist models
|
55 |
+
VOLUME ${MDR_MODEL_DIR}
|