Upload 4 files
- Dockerfile +60 -0
- main.py +62 -0
- requirements.txt +9 -0
- utils.py +164 -0
Dockerfile
ADDED
@@ -0,0 +1,60 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Install necessary system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    poppler-utils \
    cmake \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgl1-mesa-glx \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set the CC environment variable to ensure TorchInductor uses the correct compiler
ENV CC=gcc

# Copy the requirements file and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create cache and config directories with appropriate permissions
RUN mkdir -p /app/cache && chmod 777 /app/cache
RUN mkdir -p /app/config && chmod 777 /app/config
RUN mkdir -p /app/triton_cache && chmod 777 /app/triton_cache
RUN mkdir -p /app/torchinductor_cache && chmod 777 /app/torchinductor_cache
RUN mkdir -p /mnt/data && chmod 777 /mnt/data
RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache

# Create directories for Matplotlib and Fontconfig with appropriate permissions
RUN mkdir -p /app/matplotlib && chmod 777 /app/matplotlib
RUN mkdir -p /app/fontconfig && chmod 777 /app/fontconfig

# Set environment variables for Hugging Face cache, config, Triton, and TorchInductor directories
ENV HF_HOME=/app/cache
ENV XDG_CACHE_HOME=/app/.cache
ENV XDG_CONFIG_HOME=/app/config
ENV TRITON_CACHE_DIR=/app/triton_cache
ENV TORCHINDUCTOR_CACHE_DIR=/app/torchinductor_cache
ENV MPLCONFIGDIR=/app/matplotlib
ENV FONTCONFIG_PATH=/app/fontconfig
ENV TORCH_HOME=/app/torchinductor_cache
ENV TRITON_CACHE=/app/triton_cache
ENV TOKENIZERS_PARALLELISM=false

# Copy the application code
COPY main.py .
COPY utils.py ./
COPY models /app/models
COPY fonts /app/fonts

# Expose the port FastAPI will run on
EXPOSE 7860

# Command to run the FastAPI app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
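For local testing (outside Spaces, where the container is built automatically from this Dockerfile), the usual Docker workflow applies, e.g. `docker build -t ocr-app .` followed by `docker run -p 7860:7860 ocr-app`; the image tag `ocr-app` is illustrative. Note that `COPY models /app/models` and `COPY fonts /app/fonts` assume those directories exist in the build context.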
main.py
ADDED
@@ -0,0 +1,62 @@
# ! pip uninstall -y tensorflow
# ! pip install "python-doctr[torch,viz]"

import os
import tempfile

from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from utils import dev_number, roman_number, dev_letter, roman_letter

app = FastAPI()


async def _run_ocr(image: UploadFile, predictor):
    # Save the uploaded image temporarily
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        tmp.write(await image.read())
        tmp_path = tmp.name
    try:
        # Predict the image
        predicted_str = predictor(tmp_path)
    finally:
        os.remove(tmp_path)  # don't leak temp files across requests
    # Return result as JSON
    return JSONResponse(content={"predicted_str": predicted_str})


@app.post("/ocr_dev_number/")
async def extract_dev_number(image: UploadFile = File(...)):
    return await _run_ocr(image, dev_number)


@app.post("/ocr_roman_number/")
async def extract_roman_number(image: UploadFile = File(...)):
    return await _run_ocr(image, roman_number)


@app.post("/ocr_dev_letter/")
async def extract_dev_letter(image: UploadFile = File(...)):
    return await _run_ocr(image, dev_letter)


@app.post("/ocr_roman_letter/")
async def extract_roman_letter(image: UploadFile = File(...)):
    return await _run_ocr(image, roman_letter)
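For reference, a minimal client sketch for these endpoints (assumptions: the server runs locally on port 7860, `requests` is installed on the client, and `sample.png` is a placeholder path):

import requests

# Send an image to one of the OCR endpoints; each returns {"predicted_str": ...}
with open("sample.png", "rb") as f:  # hypothetical sample image
    resp = requests.post(
        "http://localhost:7860/ocr_dev_number/",
        files={"image": ("sample.png", f, "image/png")},
    )
print(resp.json()["predicted_str"])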
requirements.txt
ADDED
@@ -0,0 +1,9 @@
python-doctr[torch,viz]
torch
torchvision
numpy
matplotlib
pillow
fastapi
uvicorn
pydantic
python-multipart  # required by FastAPI to parse UploadFile/form data
utils.py
ADDED
@@ -0,0 +1,164 @@
import torch
import torch.nn as nn
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from doctr.io import DocumentFile
from doctr.models import recognition_predictor

character_num = "0123456789-"
character_letter = ''' "()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^'''

model_dev_digits_path = "models/devnagri_digits_20k_v2.pth"
model_roman_digits_path = "models/roman_digits_20k_v5.pth"
dev_letter_path = "models/small_devnagari_letter.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the CRNN model
class CRNN(nn.Module):
    def __init__(self, num_classes, input_size=(1, 64, 256)):
        super(CRNN, self).__init__()

        self.conv_block = nn.Sequential(
            nn.Conv2d(input_size[0], 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 64x256 -> 32x128

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 16x64

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # 8x32

            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)  # 4x16
        )

        # Dimensions after conv: batch x 512 x 4 x 16
        feature_height = input_size[1] // 16  # four 2x2 pools: 64 / 2^4 = 4

        self.rnn = nn.LSTM(
            input_size=512 * feature_height,  # 512 * 4 = 2048
            hidden_size=128,
            num_layers=1,
            bidirectional=True,
            dropout=0.3,  # note: has no effect with num_layers=1
            batch_first=True
        )

        self.fc = nn.Linear(256, num_classes)  # 128 * 2 (bidirectional) = 256

    def forward(self, x):
        x = self.conv_block(x)  # (B, 512, H=4, W=16)
        b, c, h, w = x.size()
        x = x.permute(0, 3, 1, 2)  # (B, W, C, H)
        x = x.contiguous().view(b, w, c * h)  # (B, seq_len, features)

        x, _ = self.rnn(x)  # (B, seq_len, 256)
        x = self.fc(x)  # (B, seq_len, num_classes)
        return x

# Initialize the model
def model_init(character, model_path):
    # Initialize the model with the number of classes
    model = CRNN(num_classes=len(character))
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()  # inference mode so BatchNorm uses running statistics
    return model

def predict_image(image_path, character, model_path):
    image = Image.open(image_path).convert('L')

    # Binarize: if value < 128, set to 0, else set to 255
    if model_path != dev_letter_path:
        image = image.point(lambda x: 0 if x < 128 else 255, 'L')
    image = image.resize((256, 64))  # Resize to match the input size of the model
    image = np.array(image)
    image = Image.fromarray(image).convert('L')  # back to a PIL image

    if model_path == dev_letter_path:
        image = Image.eval(image, lambda x: 255 - x)  # invert letter images

    # plt.imshow(image, cmap='gray')
    # plt.axis('off')
    # plt.show()
    transform = transforms.Compose([
        transforms.Resize((64, 256)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    image = transform(image).unsqueeze(0).to(device)  # Add batch dimension and move to device
    # Load the model weights (note: the model is re-loaded on every call)
    model = model_init(character, model_path)
    # Map class ids back to characters
    id_to_char = {i: c for i, c in enumerate(character)}

    def get_string_from_token(token):
        """
        Convert a list of character IDs back to the corresponding string.
        """
        return ''.join([id_to_char[i] for i in token])

    with torch.no_grad():
        output = model(image)
        output = output.permute(1, 0, 2)  # (seq_len, batch_size, num_classes)
        _, predicted = output.max(2)  # greedy decoding: argmax per time step
        predicted = predicted.permute(1, 0)  # (batch_size, seq_len)
        predicted_str = get_string_from_token(predicted[0].cpu().numpy())
    return predicted_str

def dev_number(image):
    # Devanagari digit model
    return predict_image(image, character_num, model_dev_digits_path)

def roman_number(image):
    # Roman digit model
    return predict_image(image, character_num, model_roman_digits_path)

def dev_letter(image):
    # Devanagari letter model
    return predict_image(image, character_letter, dev_letter_path)

# roman_letter: load the doctr OCR model once at startup
model = recognition_predictor(pretrained=True)

def roman_letter(image):
    # Load image using doctr
    img = DocumentFile.from_images(image)
    # Perform OCR
    result = model(img)
    # Return the recognition result (text/confidence predictions)
    return result
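A quick smoke test for the recognizers (a sketch; `sample_digits.png` is a hypothetical image, and the .pth weights under models/ must be present):

if __name__ == "__main__":
    # Hypothetical sample crops; any small grayscale digit-string image works
    print("Devanagari digits:", dev_number("sample_digits.png"))
    print("Roman digits:", roman_number("sample_digits.png"))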