rodrigomasini committed on
Commit
e3523c8
·
verified ·
1 Parent(s): a0949d8

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +21 -68
Dockerfile CHANGED
@@ -6,97 +6,50 @@ ENV PYTHONUNBUFFERED=1 \
6
  MDR_DEVICE=cpu \
7
  MDR_TABLE_FORMAT=MARKDOWN \
8
  LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
9
- # HF Hub cache - IMPORTANT for hf_hub_download used during build
10
- HF_HOME=/app/.cache/huggingface \
11
- # Cache/config dirs for libraries
12
  MPLCONFIGDIR=/app/.cache/matplotlib \
13
  YOLO_CONFIG_DIR=/app/.config/Ultralytics
 
14
 
15
  # Set the working directory in the container
16
  WORKDIR /app
17
 
18
- # Install system dependencies
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
  libgl1-mesa-glx \
21
  libglib2.0-0 \
22
- libsm6 \
23
  libxext6 \
24
  libxrender-dev \
25
  libfreetype6-dev \
26
- git \
27
- # Add curl/wget if needed for direct downloads below
28
  && rm -rf /var/lib/apt/lists/*
29
 
30
- # Copy requirements first for layer caching
31
  COPY requirements.txt .
32
 
33
- # Install Python dependencies (including huggingface_hub CLI)
34
- # Ensure huggingface-hub is in requirements.txt or install it here
35
  RUN pip install --no-cache-dir --upgrade pip && \
36
- pip install --no-cache-dir huggingface-hub && \
37
  pip install --no-cache-dir -r requirements.txt
38
 
39
- # --- PRE-DOWNLOAD MODELS ---
40
- # Create directories needed for download AND runtime
41
- RUN mkdir -p ${MDR_MODEL_DIR}/onnx_ocr \
42
- ${MDR_MODEL_DIR}/struct_eqtable \
43
- ${MDR_MODEL_DIR}/latex \
44
- ${MDR_MODEL_DIR}/layoutreader \
45
- ${MDR_MODEL_DIR}/yolo_hf_cache \
46
- /app/temp_uploads \
47
- ${HF_HOME} \
48
- ${MPLCONFIGDIR} \
49
- ${YOLO_CONFIG_DIR} && \
50
- # Set permissions broadly - adjust if needed, but often required on Spaces
51
- chmod -R 777 ${MDR_MODEL_DIR} /app
52
-
53
- # Download ONNX OCR Models (using requests/curl/wget or a helper script)
54
- # Option 1: Direct download (if URLs are stable)
55
- RUN curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/det/det.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/det.onnx" && \
56
- curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/cls/cls.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/cls.onnx" && \
57
- curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/rec/rec.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/rec.onnx" && \
58
- curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ch_ppocr_server_v2.0/ppocr_keys_v1.txt" -o "${MDR_MODEL_DIR}/onnx_ocr/ppocr_keys_v1.txt"
59
-
60
- # Download YOLO Model (using huggingface-cli)
61
- RUN huggingface-cli download \
62
- juliozhao/DocLayout-YOLO-DocStructBench \
63
- doclayout_yolo_docstructbench_imgsz1024.pt \
64
- --local-dir ${MDR_MODEL_DIR}/yolo_hf_cache \
65
- --local-dir-use-symlinks False
66
-
67
- # Download LaTeX Models (using requests/curl/wget or helper script)
68
- RUN curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/weights.pth" -o "${MDR_MODEL_DIR}/latex/weights.pth" && \
69
- curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/image_resizer.pth" -o "${MDR_MODEL_DIR}/latex/image_resizer.pth"
70
- # Note: config.yaml for LaTeX might need to be created or copied if required by the library
71
-
72
- # Download LayoutReader Model (using huggingface-cli)
73
- # Assuming LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", ...)
74
- # will handle caching correctly if HF_HOME is set and writable.
75
- # If it uses a different mechanism or specific files are needed, download them explicitly.
76
- # For LayoutReader using Hantian/layoutreader:
77
- RUN huggingface-cli download \
78
- Hantian/layoutreader \
79
- --local-dir ${MDR_MODEL_DIR}/layoutreader \
80
- --local-dir-use-symlinks False
81
-
82
- # Download StructTable Model (using huggingface-cli)
83
- # Assuming build_model(model_ckpt="U4R/StructTable-InternVL2-1B", ...)
84
- RUN huggingface-cli download \
85
- U4R/StructTable-InternVL2-1B \
86
- --local-dir ${MDR_MODEL_DIR}/struct_eqtable \
87
- --local-dir-use-symlinks False
88
-
89
- # --- END PRE-DOWNLOAD MODELS ---
90
-
91
- # Copy the application code AFTER downloads
92
  COPY mdr_pdf_parser.py .
93
  COPY main.py .
94
 
 
 
 
 
 
 
 
 
95
  # Expose the port the app runs on
96
  EXPOSE 8000
97
 
98
- # Remove VOLUME instruction - less relevant for Spaces default usage
99
- # VOLUME ${MDR_MODEL_DIR}
100
 
101
- # Start the application
102
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
 
6
  MDR_DEVICE=cpu \
7
  MDR_TABLE_FORMAT=MARKDOWN \
8
  LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
9
+ # --- ADDED: Point config/cache dirs to writable locations within /app ---
 
 
10
  MPLCONFIGDIR=/app/.cache/matplotlib \
11
  YOLO_CONFIG_DIR=/app/.config/Ultralytics
12
+ # --- END ADDED ---
13
 
14
  # Set the working directory in the container
15
  WORKDIR /app
16
 
17
+ # Install system dependencies required by OpenCV and potentially others
18
  RUN apt-get update && apt-get install -y --no-install-recommends \
19
  libgl1-mesa-glx \
20
  libglib2.0-0 \
 
21
  libxext6 \
22
  libxrender-dev \
23
  libfreetype6-dev \
24
+
25
+
26
  && rm -rf /var/lib/apt/lists/*
27
 
28
+ # Copy the requirements file into the container
29
  COPY requirements.txt .
30
 
31
+ # Install Python dependencies
32
+
33
  RUN pip install --no-cache-dir --upgrade pip && \
 
34
  pip install --no-cache-dir -r requirements.txt
35
 
36
+ # Copy the application code into the container
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  COPY mdr_pdf_parser.py .
38
  COPY main.py .
39
 
40
+ # Create the default model directory, temp directory, AND the config/cache dirs
41
+ # --- MODIFIED: Added creation of the new cache/config dirs ---
42
+ RUN mkdir -p ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache/matplotlib /app/.config/Ultralytics && \
43
+ chmod -R 777 ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache /app/.config
44
+ # Note: chmod 777 is very permissive, but often necessary/easiest in restricted environments like HF Spaces.
45
+ # It ensures the directories are writable by the user running the application.
46
+ # --- END MODIFIED ---
47
+
48
  # Expose the port the app runs on
49
  EXPOSE 8000
50
 
51
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
52
+
53
 
54
+ # This allows mounting a host directory for persistent models
55
+ VOLUME ${MDR_MODEL_DIR}