rodrigomasini committed on
Commit
a0949d8
·
verified ·
1 Parent(s): 27b394b

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +67 -17
Dockerfile CHANGED
@@ -6,15 +6,16 @@ ENV PYTHONUNBUFFERED=1 \
6
  MDR_DEVICE=cpu \
7
  MDR_TABLE_FORMAT=MARKDOWN \
8
  LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
9
- # --- ADDED: Point config/cache dirs to writable locations within /app ---
 
 
10
  MPLCONFIGDIR=/app/.cache/matplotlib \
11
  YOLO_CONFIG_DIR=/app/.config/Ultralytics
12
- # --- END ADDED ---
13
 
14
  # Set the working directory in the container
15
  WORKDIR /app
16
 
17
- # Install system dependencies required by OpenCV and potentially others
18
  RUN apt-get update && apt-get install -y --no-install-recommends \
19
  libgl1-mesa-glx \
20
  libglib2.0-0 \
@@ -22,31 +23,80 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
22
  libxext6 \
23
  libxrender-dev \
24
  libfreetype6-dev \
 
 
25
  && rm -rf /var/lib/apt/lists/*
26
 
27
- # Copy the requirements file into the container
28
  COPY requirements.txt .
29
 
30
- # Install Python dependencies
 
31
  RUN pip install --no-cache-dir --upgrade pip && \
 
32
  pip install --no-cache-dir -r requirements.txt
33
 
34
- # Copy the application code into the container
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  COPY mdr_pdf_parser.py .
36
  COPY main.py .
37
 
38
- # Create the default model directory, temp directory, AND the config/cache dirs
39
- # --- MODIFIED: Added creation of the new cache/config dirs ---
40
- RUN mkdir -p ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache/matplotlib /app/.config/Ultralytics && \
41
- chmod -R 777 ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache /app/.config
42
- # Note: chmod 777 is very permissive, but often necessary/easiest in restricted environments like HF Spaces.
43
- # It ensures the directories are writable by the user running the application.
44
- # --- END MODIFIED ---
45
-
46
  # Expose the port the app runs on
47
  EXPOSE 8000
48
 
49
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
 
50
 
51
- # This allows mounting a host directory for persistent models
52
- VOLUME ${MDR_MODEL_DIR}
 
6
  MDR_DEVICE=cpu \
7
  MDR_TABLE_FORMAT=MARKDOWN \
8
  LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
9
+ # HF Hub cache - IMPORTANT for hf_hub_download used during build
10
+ HF_HOME=/app/.cache/huggingface \
11
+ # Cache/config dirs for libraries
12
  MPLCONFIGDIR=/app/.cache/matplotlib \
13
  YOLO_CONFIG_DIR=/app/.config/Ultralytics
 
14
 
15
  # Set the working directory in the container
16
  WORKDIR /app
17
 
18
+ # Install system dependencies (curl and ca-certificates serve the HTTPS model downloads run with curl below)
19
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
  libgl1-mesa-glx \
21
  libglib2.0-0 \
 
23
  libxext6 \
24
  libxrender-dev \
25
  libfreetype6-dev \
26
+ git \
27
+ curl ca-certificates \
28
  && rm -rf /var/lib/apt/lists/*
29
 
30
+ # Copy requirements first for layer caching
31
  COPY requirements.txt .
32
 
33
+ # Install Python dependencies (including huggingface_hub CLI)
34
+ # huggingface-hub is installed explicitly so the huggingface-cli download steps below work even if requirements.txt does not list it
35
  RUN pip install --no-cache-dir --upgrade pip && \
36
+ pip install --no-cache-dir huggingface-hub && \
37
  pip install --no-cache-dir -r requirements.txt
38
 
39
+ # --- PRE-DOWNLOAD MODELS ---
40
+ # Create directories needed for download AND runtime
41
+ RUN mkdir -p ${MDR_MODEL_DIR}/onnx_ocr \
42
+ ${MDR_MODEL_DIR}/struct_eqtable \
43
+ ${MDR_MODEL_DIR}/latex \
44
+ ${MDR_MODEL_DIR}/layoutreader \
45
+ ${MDR_MODEL_DIR}/yolo_hf_cache \
46
+ /app/temp_uploads \
47
+ ${HF_HOME} \
48
+ ${MPLCONFIGDIR} \
49
+ ${YOLO_CONFIG_DIR} && \
50
+ # Set permissions broadly - often required on Spaces; this only covers paths that exist at this point, not files created by later build steps
51
+ chmod -R 777 ${MDR_MODEL_DIR} /app
52
+
53
+ # Download the ONNX OCR files (det.onnx, cls.onnx, rec.onnx, ppocr_keys_v1.txt) by direct URL (assumes the URLs stay stable)
54
+ # -f makes an HTTP error abort the build instead of saving the error page as a model file; --retry covers transient failures
55
+ RUN curl -fL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/det/det.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/det.onnx" && \
56
+ curl -fL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/cls/cls.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/cls.onnx" && \
57
+ curl -fL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/rec/rec.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/rec.onnx" && \
58
+ curl -fL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ch_ppocr_server_v2.0/ppocr_keys_v1.txt" -o "${MDR_MODEL_DIR}/onnx_ocr/ppocr_keys_v1.txt"
59
+
60
+ # Download YOLO Model (using huggingface-cli)
61
+ RUN huggingface-cli download \
62
+ juliozhao/DocLayout-YOLO-DocStructBench \
63
+ doclayout_yolo_docstructbench_imgsz1024.pt \
64
+ --local-dir ${MDR_MODEL_DIR}/yolo_hf_cache \
65
+ --local-dir-use-symlinks False
66
+
67
+ # Download the LaTeX-OCR weights with curl; -f makes an HTTP error abort the build instead of saving the error page as a .pth file
68
+ RUN curl -fL --retry 3 "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/weights.pth" -o "${MDR_MODEL_DIR}/latex/weights.pth" && \
69
+ curl -fL --retry 3 "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/image_resizer.pth" -o "${MDR_MODEL_DIR}/latex/image_resizer.pth"
70
+ # Note: config.yaml for LaTeX might need to be created or copied if required by the library
71
+
72
+ # Download LayoutReader Model (using huggingface-cli)
73
+ # Assuming LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", ...)
74
+ # will handle caching correctly if HF_HOME is set and writable.
75
+ # If it uses a different mechanism or specific files are needed, download them explicitly.
76
+ # For LayoutReader using Hantian/layoutreader:
77
+ RUN huggingface-cli download \
78
+ Hantian/layoutreader \
79
+ --local-dir ${MDR_MODEL_DIR}/layoutreader \
80
+ --local-dir-use-symlinks False
81
+
82
+ # Download StructTable Model (using huggingface-cli)
83
+ # Assuming build_model(model_ckpt="U4R/StructTable-InternVL2-1B", ...)
84
+ RUN huggingface-cli download \
85
+ U4R/StructTable-InternVL2-1B \
86
+ --local-dir ${MDR_MODEL_DIR}/struct_eqtable \
87
+ --local-dir-use-symlinks False
88
+
89
+ # --- END PRE-DOWNLOAD MODELS ---
90
+
91
+ # Copy the application code AFTER downloads
92
  COPY mdr_pdf_parser.py .
93
  COPY main.py .
94
 
 
 
 
 
 
 
 
 
95
  # Expose the port the app runs on
96
  EXPOSE 8000
97
 
98
+ # Remove VOLUME instruction - less relevant for Spaces default usage
99
+ # VOLUME ${MDR_MODEL_DIR}
100
 
101
+ # Start the application
102
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]