# Image for the application launched by `python app.py`.
# Build stages, in order: system packages -> Python requirements -> app
# sources -> NLTK data. Each instruction must sit on its own line; a line
# that starts with `#` is treated as a comment in its entirety.

# Base image
FROM python:3.10-slim

# System dependencies for OCR, PDF, DOCX, fonts, and more.
# The apt package lists are deleted in the same RUN so they are not kept
# in the resulting layer.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    libglib2.0-0 \
    libgl1 \
    libpoppler-cpp-dev \
    poppler-utils \
    libsm6 \
    libxext6 \
    libxrender-dev \
    ffmpeg \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Ensure tesseract is in PATH and test it; the build fails here if the
# binary cannot be found or executed. `command -v` is a shell builtin, so
# the check does not depend on an external `which` binary.
RUN command -v tesseract && tesseract --version

# Set environment variables.
#   PIP_NO_CACHE_DIR                 - pip does not keep a download cache.
#   PYTHONUNBUFFERED                 - Python stdout/stderr are unbuffered.
#   HF_HUB_DISABLE_SYMLINKS_WARNING  - presumably read by huggingface_hub;
#                                      confirm against requirements.txt.
#   TESSERACT_PATH                   - presumably read by the app to locate
#                                      the tesseract binary; confirm in app.py.
# NOTE(review): PATH is rewritten with /usr/bin ahead of the inherited
# entries; confirm this ordering is intended.
ENV PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1 \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    TESSERACT_PATH=/usr/bin/tesseract \
    PATH="/usr/bin:$PATH"

# Set working directory
WORKDIR /home/user/app

# Copy requirements and install. requirements.txt is copied on its own,
# before the rest of the sources, so this layer can be reused from the
# build cache when only application files change.
COPY requirements.txt .
RUN pip install --upgrade pip && pip install -r requirements.txt

# Copy app files
COPY . .

# Download NLTK data (the `punkt` resource) at build time.
RUN python -m nltk.downloader punkt

# Start the app
CMD ["python", "app.py"]