Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Running

App Files Community

rodrigomasini committed on Apr 29

Commit

e3523c8

verified ·

1 Parent(s): a0949d8

Update Dockerfile

Browse files

Files changed (1) hide show

Dockerfile +21 -68

Dockerfile CHANGED Viewed

@@ -6,97 +6,50 @@ ENV PYTHONUNBUFFERED=1 \
     MDR_DEVICE=cpu \
     MDR_TABLE_FORMAT=MARKDOWN \
     LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
-    # HF Hub cache - IMPORTANT for hf_hub_download used during build
-    HF_HOME=/app/.cache/huggingface \
-    # Cache/config dirs for libraries
     MPLCONFIGDIR=/app/.cache/matplotlib \
     YOLO_CONFIG_DIR=/app/.config/Ultralytics
 # Set the working directory in the container
 WORKDIR /app
-# Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libgl1-mesa-glx \
     libglib2.0-0 \
-    libsm6 \
     libxext6 \
     libxrender-dev \
     libfreetype6-dev \
-    git \
-    # Add curl/wget if needed for direct downloads below
     && rm -rf /var/lib/apt/lists/*
-# Copy requirements first for layer caching
 COPY requirements.txt .
-# Install Python dependencies (including huggingface_hub CLI)
-# Ensure huggingface-hub is in requirements.txt or install it here
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir huggingface-hub && \
     pip install --no-cache-dir -r requirements.txt
-# --- PRE-DOWNLOAD MODELS ---
-# Create directories needed for download AND runtime
-RUN mkdir -p ${MDR_MODEL_DIR}/onnx_ocr \
-               ${MDR_MODEL_DIR}/struct_eqtable \
-               ${MDR_MODEL_DIR}/latex \
-               ${MDR_MODEL_DIR}/layoutreader \
-               ${MDR_MODEL_DIR}/yolo_hf_cache \
-               /app/temp_uploads \
-               ${HF_HOME} \
-               ${MPLCONFIGDIR} \
-               ${YOLO_CONFIG_DIR} && \
-    # Set permissions broadly - adjust if needed, but often required on Spaces
-    chmod -R 777 ${MDR_MODEL_DIR} /app
-# Download ONNX OCR Models (using requests/curl/wget or a helper script)
-# Option 1: Direct download (if URLs are stable)
-RUN curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/det/det.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/det.onnx" && \
-    curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/cls/cls.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/cls.onnx" && \
-    curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/rec/rec.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/rec.onnx" && \
-    curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ch_ppocr_server_v2.0/ppocr_keys_v1.txt" -o "${MDR_MODEL_DIR}/onnx_ocr/ppocr_keys_v1.txt"
-# Download YOLO Model (using huggingface-cli)
-RUN huggingface-cli download \
-    juliozhao/DocLayout-YOLO-DocStructBench \
-    doclayout_yolo_docstructbench_imgsz1024.pt \
-    --local-dir ${MDR_MODEL_DIR}/yolo_hf_cache \
-    --local-dir-use-symlinks False
-# Download LaTeX Models (using requests/curl/wget or helper script)
-RUN curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/weights.pth" -o "${MDR_MODEL_DIR}/latex/weights.pth" && \
-    curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/image_resizer.pth" -o "${MDR_MODEL_DIR}/latex/image_resizer.pth"
-# Note: config.yaml for LaTeX might need to be created or copied if required by the library
-# Download LayoutReader Model (using huggingface-cli)
-# Assuming LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", ...)
-# will handle caching correctly if HF_HOME is set and writable.
-# If it uses a different mechanism or specific files are needed, download them explicitly.
-# For LayoutReader using Hantian/layoutreader:
-RUN huggingface-cli download \
-    Hantian/layoutreader \
-    --local-dir ${MDR_MODEL_DIR}/layoutreader \
-    --local-dir-use-symlinks False
-# Download StructTable Model (using huggingface-cli)
-# Assuming build_model(model_ckpt="U4R/StructTable-InternVL2-1B", ...)
-RUN huggingface-cli download \
-    U4R/StructTable-InternVL2-1B \
-    --local-dir ${MDR_MODEL_DIR}/struct_eqtable \
-    --local-dir-use-symlinks False
-# --- END PRE-DOWNLOAD MODELS ---
-# Copy the application code AFTER downloads
 COPY mdr_pdf_parser.py .
 COPY main.py .
 # Expose the port the app runs on
 EXPOSE 8000
-# Remove VOLUME instruction - less relevant for Spaces default usage
-# VOLUME ${MDR_MODEL_DIR}
-# Start the application
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

     MDR_DEVICE=cpu \
     MDR_TABLE_FORMAT=MARKDOWN \
     LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
+    # --- ADDED: Point config/cache dirs to writable locations within /app ---
     MPLCONFIGDIR=/app/.cache/matplotlib \
     YOLO_CONFIG_DIR=/app/.config/Ultralytics
+    # --- END ADDED ---
 # Set the working directory in the container
 WORKDIR /app
+# Install system dependencies required by OpenCV and potentially others
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libgl1-mesa-glx \
     libglib2.0-0 \
     libxext6 \
     libxrender-dev \
     libfreetype6-dev \
     && rm -rf /var/lib/apt/lists/*
+# Copy the requirements file into the container
 COPY requirements.txt .
+# Install Python dependencies
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
+# Copy the application code into the container
 COPY mdr_pdf_parser.py .
 COPY main.py .
+# Create the default model directory, temp directory, AND the config/cache dirs
+# --- MODIFIED: Added creation of the new cache/config dirs ---
+RUN mkdir -p ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache/matplotlib /app/.config/Ultralytics && \
+    chmod -R 777 ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache /app/.config
+# Note: chmod 777 is very permissive, but often necessary/easiest in restricted environments like HF Spaces.
+# It ensures the directories are writable by the user running the application.
+# --- END MODIFIED ---
 # Expose the port the app runs on
 EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
+# This allows mounting a host directory for persistent models
+VOLUME ${MDR_MODEL_DIR}