Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Running

App Files Files Community

rodrigomasini committed on Apr 29

Commit

a0949d8

verified ·

1 Parent(s): 27b394b

Update Dockerfile

Browse files

Files changed (1) hide show

Dockerfile +67 -17

Dockerfile CHANGED Viewed

@@ -6,15 +6,16 @@ ENV PYTHONUNBUFFERED=1 \
     MDR_DEVICE=cpu \
     MDR_TABLE_FORMAT=MARKDOWN \
     LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
-    # --- ADDED: Point config/cache dirs to writable locations within /app ---
     MPLCONFIGDIR=/app/.cache/matplotlib \
     YOLO_CONFIG_DIR=/app/.config/Ultralytics
-    # --- END ADDED ---
 # Set the working directory in the container
 WORKDIR /app
-# Install system dependencies required by OpenCV and potentially others
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libgl1-mesa-glx \
     libglib2.0-0 \
@@ -22,31 +23,80 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libxext6 \
     libxrender-dev \
     libfreetype6-dev \
     && rm -rf /var/lib/apt/lists/*
-# Copy the requirements file into the container
 COPY requirements.txt .
-# Install Python dependencies
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
-# Copy the application code into the container
 COPY mdr_pdf_parser.py .
 COPY main.py .
-# Create the default model directory, temp directory, AND the config/cache dirs
-# --- MODIFIED: Added creation of the new cache/config dirs ---
-RUN mkdir -p ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache/matplotlib /app/.config/Ultralytics && \
-    chmod -R 777 ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache /app/.config
-# Note: chmod 777 is very permissive, but often necessary/easiest in restricted environments like HF Spaces.
-# It ensures the directories are writable by the user running the application.
-# --- END MODIFIED ---
 # Expose the port the app runs on
 EXPOSE 8000
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
-# This allows mounting a host directory for persistent models
-VOLUME ${MDR_MODEL_DIR}

     MDR_DEVICE=cpu \
     MDR_TABLE_FORMAT=MARKDOWN \
     LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
+    # HF Hub cache - IMPORTANT for hf_hub_download used during build
+    HF_HOME=/app/.cache/huggingface \
+    # Cache/config dirs for libraries
     MPLCONFIGDIR=/app/.cache/matplotlib \
     YOLO_CONFIG_DIR=/app/.config/Ultralytics
 # Set the working directory in the container; relative paths in the
 # instructions that follow (COPY destinations such as ".") resolve against /app.
 WORKDIR /app
+# Install system dependencies.
+# curl is invoked by the model download steps further down in this file, so it
+# must be installed here. ca-certificates lets curl and git verify HTTPS
+# endpoints; with --no-install-recommends it is not pulled in automatically.
+# The apt package lists are removed in the same layer to keep the image small.
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libgl1-mesa-glx \
     libglib2.0-0 \
     libxext6 \
     libxrender-dev \
     libfreetype6-dev \
+    ca-certificates \
+    curl \
+    git \
     && rm -rf /var/lib/apt/lists/*
+# Bring in only the dependency manifest at this point, so the install layer
+# below is rebuilt only when requirements.txt itself changes.
 COPY requirements.txt .
+# Upgrade pip, then install huggingface-hub (it supplies the huggingface-cli
+# command used by the model download steps) followed by the project
+# requirements. --no-cache-dir keeps pip's download cache out of the layer.
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir huggingface-hub \
+    && pip install --no-cache-dir -r requirements.txt
+# --- PRE-DOWNLOAD MODELS ---
+# Prepare the directories used by the build-time downloads and at runtime:
+# one sub-directory per model family under MDR_MODEL_DIR, the upload scratch
+# directory, and the Hugging Face / matplotlib / Ultralytics cache and config
+# directories named by the ENV variables above.
+# Permissions are then opened recursively (mode 777) on the model directory and
+# /app; this is broad, but often required on Spaces - tighten if possible.
+# NOTE(review): files written by later build steps are created after this chmod
+# and keep their default permissions - confirm the runtime user never needs to
+# write into them.
+RUN mkdir -p \
+        ${MDR_MODEL_DIR}/onnx_ocr \
+        ${MDR_MODEL_DIR}/struct_eqtable \
+        ${MDR_MODEL_DIR}/latex \
+        ${MDR_MODEL_DIR}/layoutreader \
+        ${MDR_MODEL_DIR}/yolo_hf_cache \
+        /app/temp_uploads \
+        ${HF_HOME} \
+        ${MPLCONFIGDIR} \
+        ${YOLO_CONFIG_DIR} \
+    && chmod -R 777 ${MDR_MODEL_DIR} /app
+# Download the ONNX OCR files (det/cls/rec models plus the ppocr_keys_v1.txt
+# key file) straight from their Hugging Face "resolve" URLs.
+# -f (--fail) makes curl exit non-zero on an HTTP error instead of saving the
+# server's error page as the model file, so a moved or broken URL fails the
+# build rather than producing a corrupt image. -sS hides the progress meter but
+# still prints errors; --retry 3 absorbs transient network failures.
+RUN curl -fsSL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/det/det.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/det.onnx" && \
+    curl -fsSL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/cls/cls.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/cls.onnx" && \
+    curl -fsSL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/rec/rec.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/rec.onnx" && \
+    curl -fsSL --retry 3 "https://huggingface.co/moskize/OnnxOCR/resolve/main/ch_ppocr_server_v2.0/ppocr_keys_v1.txt" -o "${MDR_MODEL_DIR}/onnx_ocr/ppocr_keys_v1.txt"
+# Fetch the single DocLayout-YOLO checkpoint file from the Hub with
+# huggingface-cli, placing it in the yolo_hf_cache model directory as a
+# regular file (symlinks into the HF cache are disabled).
+RUN huggingface-cli download juliozhao/DocLayout-YOLO-DocStructBench \
+        doclayout_yolo_docstructbench_imgsz1024.pt \
+        --local-dir ${MDR_MODEL_DIR}/yolo_hf_cache --local-dir-use-symlinks False
+# Download the LaTeX-OCR weights and image-resizer checkpoints from the
+# project's v0.0.1 GitHub release.
+# -f (--fail) makes curl exit non-zero on an HTTP error instead of writing the
+# error page to the .pth file; -sS hides the progress meter but still prints
+# errors; --retry 3 absorbs transient network failures.
+RUN curl -fsSL --retry 3 "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/weights.pth" -o "${MDR_MODEL_DIR}/latex/weights.pth" && \
+    curl -fsSL --retry 3 "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/image_resizer.pth" -o "${MDR_MODEL_DIR}/latex/image_resizer.pth"
+# Note: a config.yaml for LaTeX may need to be created or copied if the library requires one
+# Download the LayoutReader model (Hantian/layoutreader) with huggingface-cli.
+# The whole repository is fetched into the layoutreader model directory as
+# regular files (symlinks disabled).
+# Assumption to verify: if the application instead loads it through
+# LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", ...),
+# that call is expected to cache correctly as long as HF_HOME is set and
+# writable. If a different mechanism or specific files are needed, download
+# them explicitly.
+RUN huggingface-cli download Hantian/layoutreader \
+        --local-dir ${MDR_MODEL_DIR}/layoutreader --local-dir-use-symlinks False
+# Download the StructTable model (U4R/StructTable-InternVL2-1B) with
+# huggingface-cli into the struct_eqtable model directory as regular files.
+# Assumption to verify: the application builds it via
+# build_model(model_ckpt="U4R/StructTable-InternVL2-1B", ...).
+RUN huggingface-cli download U4R/StructTable-InternVL2-1B \
+        --local-dir ${MDR_MODEL_DIR}/struct_eqtable --local-dir-use-symlinks False
+# --- END PRE-DOWNLOAD MODELS ---
+# Copy the application code AFTER the downloads, so that editing these two
+# files does not invalidate the cached model-download layers above.
 COPY mdr_pdf_parser.py .
 COPY main.py .
 # Expose the port the app runs on (documentation only; matches --port below)
 EXPOSE 8000
+# The VOLUME instruction was removed - less relevant for Spaces default usage.
+# Kept here, commented out, for reference:
+# VOLUME ${MDR_MODEL_DIR}
+# Start the application: serve the `app` object from main.py with uvicorn,
+# listening on all interfaces on port 8000 (exec form, no shell wrapper).
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]