damoojeje commited on
Commit
43b8a1d
·
verified ·
1 Parent(s): d476ac9

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +11 -18
Dockerfile CHANGED
@@ -1,43 +1,36 @@
1
- # Base image
2
  FROM python:3.10-slim
3
 
4
- # System dependencies for OCR, PDF, DOCX, fonts, and more
5
  RUN apt-get update && apt-get install -y \
6
  tesseract-ocr \
7
  libglib2.0-0 \
8
  libgl1 \
9
- libpoppler-cpp-dev \
10
- poppler-utils \
11
  libsm6 \
12
  libxext6 \
13
  libxrender-dev \
 
14
  ffmpeg \
15
  build-essential \
16
  git \
17
- && rm -rf /var/lib/apt/lists/*
18
-
19
- # Ensure tesseract is in PATH and test it
20
- RUN which tesseract && tesseract --version
21
 
22
- # Set environment variables
23
- ENV PIP_NO_CACHE_DIR=1 \
24
- PYTHONUNBUFFERED=1 \
25
- HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
26
- TESSERACT_PATH=/usr/bin/tesseract \
27
- PATH="/usr/bin:$PATH"
28
 
29
- # Set working directory
30
  WORKDIR /home/user/app
31
 
32
- # Copy requirements and install
33
  COPY requirements.txt .
34
  RUN pip install --upgrade pip && pip install -r requirements.txt
35
 
36
  # Copy app files
37
  COPY . .
38
 
39
- # Download NLTK data
40
  RUN python -m nltk.downloader punkt
41
 
42
- # Start the app
43
  CMD ["python", "app.py"]
 
1
+ # Use the official slim Python 3.10 image as the base (system packages are added below)
2
  FROM python:3.10-slim
3
 
4
+ # Install OS-level dependencies: Tesseract OCR, Poppler utilities, FFmpeg,
+ # GL/X11 runtime libraries, plus build tooling and git.
+ # --no-install-recommends skips optional packages to keep the image small, and
+ # the apt lists are deleted in this same layer so they never persist in the image.
5
  RUN apt-get update && apt-get install -y --no-install-recommends \
6
  tesseract-ocr \
7
  libglib2.0-0 \
8
  libgl1 \
 
 
9
  libsm6 \
10
  libxext6 \
11
  libxrender-dev \
12
+ poppler-utils \
13
  ffmpeg \
14
  build-essential \
15
  git \
16
+ && rm -rf /var/lib/apt/lists/*
 
 
 
17
 
18
+ # TESSERACT_PATH records the location of the Tesseract executable
+ # (presumably read by app.py -- confirm against the application code).
+ # PATH entries must be directories, so prepend the binary's directory
+ # (/usr/bin) rather than the path of the executable itself.
19
+ ENV TESSERACT_PATH=/usr/bin/tesseract
20
+ ENV PATH="/usr/bin:${PATH}"
 
 
 
21
 
22
+ # Work directory
23
  WORKDIR /home/user/app
24
 
25
+ # Install Python dependencies before copying the rest of the source so this
+ # layer stays cached until requirements.txt changes.
+ # --no-cache-dir keeps pip's download cache out of the image layer.
26
  COPY requirements.txt .
27
  RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
28
 
29
  # Copy app files
30
  COPY . .
31
 
32
+ # Download NLTK tokenizer data
33
  RUN python -m nltk.downloader punkt
34
 
35
+ # Run the app
36
  CMD ["python", "app.py"]