# Container image for the Python application started via `python app.py`.
#
# Build stages, in order:
#   1. start from the slim Python 3.10 base image
#   2. install OS packages with apt
#   3. install Python requirements with pip
#   4. copy the application source and fetch the NLTK "punkt" tokenizer data
#   5. declare port 7860, a curl-based health check, and the start command

# Use a lightweight Python base image
FROM python:3.10-slim

# Install system-level dependencies.
# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update && apt-get install -y \
        tesseract-ocr \
        libglib2.0-0 \
        libgl1 \
        libsm6 \
        libxext6 \
        libxrender-dev \
        poppler-utils \
        ffmpeg \
        git \
        build-essential \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Verify Tesseract installation.
# `-f` keeps the step from failing if /usr/local/bin/tesseract already exists;
# `tesseract --version` fails the build early when the binary is not usable.
RUN ln -sf /usr/bin/tesseract /usr/local/bin/tesseract && \
    tesseract --version

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
# TESSERACT_PATH is the path of the tesseract executable itself. It is
# deliberately NOT prepended to PATH: PATH entries must be directories, and
# the binary is already reachable through /usr/bin and /usr/local/bin.
ENV TESSERACT_PATH=/usr/bin/tesseract

# Set working directory
WORKDIR /home/user/app

# Install Python dependencies.
# requirements.txt is copied on its own, ahead of the application code, so
# this layer is rebuilt only when the requirements file changes.
COPY requirements.txt .
RUN pip install --upgrade pip && pip install -r requirements.txt

# Copy application code
COPY . .

# Download NLTK tokenizer
RUN python -m nltk.downloader punkt

# Expose port for Gradio
EXPOSE 7860

# Optional: container health check.
# curl --fail returns non-zero on HTTP error responses, which marks the
# container as unhealthy.
HEALTHCHECK CMD curl --fail http://localhost:7860 || exit 1

# Start the application
CMD ["python", "app.py"]