AnkitShrestha committed on
Commit 76e8a07 · 1 Parent(s): 415c25f

Add citizenship OCR endpoint

Files changed (3)
  1. main.py +11 -1
  2. requirements.txt +3 -1
  3. utils.py +139 -168
main.py CHANGED
@@ -71,7 +71,7 @@ from pydantic import BaseModel
 import shutil
 
 # Import from optimized utils
-from utils import dev_number, roman_number, dev_letter, roman_letter, predict_ne
+from utils import dev_number, roman_number, dev_letter, roman_letter, predict_ne, ocr_citizenship_utils
 
 app = FastAPI(
     title="OCR API",
@@ -193,6 +193,16 @@ async def classify_ne(image: UploadFile = File(...)):
 
     # Implement the logic as per your requirements
     return JSONResponse(content={"predicted": prediction})
+
+@app.post("/ocr_citizenship/")
+async def ocr_citizenship(image: UploadFile = File(...)):
+    """OCR the provided Nepali Citizenship card"""
+    image_path = await save_upload_file_tmp(image)
+    prediction = ocr_citizenship_utils(
+        image_path=image_path,
+    )
+
+    return JSONResponse(content=prediction)
 # Health check endpoint
 @app.get("/health")
 async def health_check():
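A minimal client-side sketch of how the new endpoint could be called once the app is running; the base URL, the port, and the sample file name are assumptions, not part of this commit:

# Hypothetical client call for the new /ocr_citizenship/ endpoint.
# Assumes the FastAPI app is served at http://localhost:8000 and that
# "citizenship_sample.jpg" is a local test image.
import requests

with open("citizenship_sample.jpg", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/ocr_citizenship/",
        files={"image": ("citizenship_sample.jpg", f, "image/jpeg")},
    )
resp.raise_for_status()
print(resp.json())  # nested list: one list of text segments per card line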
requirements.txt CHANGED
@@ -8,4 +8,6 @@ fastapi
 uvicorn
 pydantic
 python-multipart
-scikit-learn==1.6.1
+scikit-learn==1.6.1
+opencv-python-headless
+surya-ocr
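Both new dependencies serve the citizenship OCR path: opencv-python-headless provides cv2 for the preprocessing in utils.py, and surya-ocr provides the recognition and detection predictors. A quick sanity-check sketch, assuming the environment above is installed:

# Verify the new dependencies import cleanly; cv2 comes from
# opencv-python-headless, and surya-ocr also pulls in torch.
import cv2
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor

print(cv2.__version__)
# Instantiating the predictors downloads model weights on first use,
# so this check deliberately stops at the imports.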
utils.py CHANGED
@@ -1,175 +1,21 @@
-# import torch
-# import torch.nn as nn
-# from PIL import Image
-# import numpy as np
-# import matplotlib.pyplot as plt
-# import torchvision.transforms as transforms
-# from doctr.io import DocumentFile
-# from doctr.models import recognition_predictor
-
-# character_num = "0123456789-"
-# character_letter = ''' "()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^''' #"()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^"
-
-# model_dev_digits_path = "models/devnagri_digits_20k_v2.pth"
-# model_roman_digits_path = "models/roman_digits_20k_v5.pth"
-# dev_letter_path = "models/small_devnagari_letter.pth"
-
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# # Define the CRNN model
-# class CRNN(nn.Module):
-#     def __init__(self, num_classes, input_size=(1, 64, 256)):
-#         super(CRNN, self).__init__()
-
-#         self.conv_block = nn.Sequential(
-#             nn.Conv2d(input_size[0], 64, kernel_size=3, stride=1, padding=1),
-#             nn.BatchNorm2d(64),
-#             nn.ReLU(),
-#             nn.MaxPool2d(kernel_size=2, stride=2), # 64x128
-
-#             nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
-#             nn.BatchNorm2d(128),
-#             nn.ReLU(),
-#             nn.MaxPool2d(kernel_size=2, stride=2), # 32x64
-
-#             nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
-#             nn.BatchNorm2d(256),
-#             nn.ReLU(),
-#             nn.MaxPool2d(kernel_size=2, stride=2), # 16x32
-
-#             nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
-#             nn.BatchNorm2d(512),
-#             nn.ReLU(),
-#             nn.MaxPool2d(kernel_size=2, stride=2) # 8x16
-#         )
-
-#         # Dimensions after conv: batch x 512 x 8 x 16
-#         feature_height = input_size[1] // 16 # 64 -> 4 pools → 64/2^4 = 4
-
-#         self.rnn = nn.LSTM(
-#             input_size=512 * feature_height, # 512 * 4 = 2048
-#             hidden_size=128,
-#             num_layers=1,
-#             bidirectional=True,
-#             dropout=0.3,
-#             batch_first=True
-#         )
-
-#         self.fc = nn.Linear(256, num_classes) # 256*2 = 512
-
-#     def forward(self, x):
-#         x = self.conv_block(x) # (B, 512, H=4, W=16)
-#         b, c, h, w = x.size()
-#         x = x.permute(0, 3, 1, 2) # (B, W, C, H)
-#         x = x.contiguous().view(b, w, c * h) # (B, seq_len, input_size)
-
-#         x, _ = self.rnn(x) # (B, seq_len, 512)
-#         x = self.fc(x) # (B, seq_len, num_classes)
-#         return x
-
-# # Initialize the model
-# def model_init(character, model_path):
-#     # Initialize the model with the number of classes
-#     model = CRNN(num_classes=len(character))
-#     model.load_state_dict(torch.load(model_path, map_location=device))
-#     model = model.to(device)
-#     return model
-
-# def predict_image(image_path, character, model_path):
-#     image = Image.open(image_path).convert('L')
-
-#     # if value < 128, set to 0, else set to 255
-#     if model_path != dev_letter_path:
-#         image = image.point(lambda x: 0 if x < 128 else 255, 'L')
-#     image = image.resize((256, 64)) # Resize to match the input size of the model
-#     image = np.array(image)
-#     image = np.expand_dims(image, axis=0)[0] # Add channel dimension
-#     # to pil image
-#     # print(image)
-#     image = Image.fromarray(image).convert('L')
-
-#     if model_path == dev_letter_path:
-#         image = Image.eval(image, lambda x: 255 - x)
-
-#     # plt.imshow(image, cmap='gray')
-#     # plt.axis('off')
-#     # plt.show()
-#     transform = transforms.Compose([
-#         transforms.Resize((64, 256)),
-#         transforms.ToTensor(),
-#         transforms.Normalize((0.5,), (0.5,))
-#     ])
-#     image = transform(image).unsqueeze(0).to(device) # Add batch dimension and move to GPU
-#     # Load the model weights
-#     model = model_init(character, model_path)
-#     # token to string
-#     # tokens to ids
-#     id_to_char = {i: c for i, c in enumerate(character)}
-
-#     def get_string_from_token(token):
-#         """
-#         Convert a list of character IDs back to the corresponding string.
-#         """
-#         return ''.join([id_to_char[i] for i in token])
-
-#     with torch.no_grad():
-#         output = model(image)
-#         output = output.permute(1, 0, 2) # (seq_len, batch_size, num_classes)
-#         _, predicted = output.max(2)
-#         predicted = predicted.permute(1, 0) # (batch_size, seq_len)
-#         predicted_str = get_string_from_token(predicted[0].cpu().numpy())
-#     return predicted_str
-
-# def dev_number(image):
-#     # Load the model
-#     model_path = model_dev_digits_path
-#     character = character_num
-#     # Predict the image
-#     predicted_str = predict_image(image, character, model_path)
-#     return predicted_str
-
-# def roman_number(image):
-#     # Load the model
-#     model_path = model_roman_digits_path
-#     character = character_num
-#     # Predict the image
-#     predicted_str = predict_image(image, character, model_path)
-#     return predicted_str
-
-# def dev_letter(image):
-#     # Load the model
-#     model_path = dev_letter_path
-#     character = character_letter
-#     # Predict the image
-#     predicted_str = predict_image(image, character, model_path)
-#     return predicted_str
-
-
-# # roman_letter
-# # Load OCR model once at startup
-# model = recognition_predictor(pretrained=True)
-
-# def roman_letter(image):
-#     # Load image using doctr
-#     img = DocumentFile.from_images(image)
-#     # Perform OCR
-#     result = model(img)
-#     # Return result as JSON
-#     return result
-
-
+from doctr.models import detection_predictor, recognition_predictor
+from doctr.io import DocumentFile
+from surya.recognition import RecognitionPredictor
+from surya.detection import DetectionPredictor
+from PIL import Image
+# from functools import lru_cache
+from torchvision import models
+from typing import List
+import torchvision.transforms as transforms
 import torch
 import torch.nn as nn
-from PIL import Image
 import numpy as np
-import torchvision.transforms as transforms
-from doctr.io import DocumentFile
-from torchvision import models
-from doctr.models import recognition_predictor
-import os
-from functools import lru_cache
+import cv2
+import regex as re
+# import os
 import pickle
 
+
 # Character sets
 CHARACTER_NUM = "0123456789-"
 CHARACTER_LETTER = ''' "()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^''' #"()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^"
@@ -397,4 +243,129 @@ def predict_ne(image_path, device="cpu"):
     with torch.no_grad():
         output = model(image_tensor)
         _, predicted = torch.max(output, 1)
-    return le.inverse_transform([predicted.item()])[0]
+    return le.inverse_transform([predicted.item()])[0]
+
+# Lazily-initialized singletons: the doctr text detector plus the surya
+# recognition/detection predictors. Named *_surya so they do not shadow the
+# doctr detection_predictor/recognition_predictor imports above, and so they
+# match the call sites in ocr_citizenship_utils.
+doctr_detector = None
+recognition_predictor_surya = None
+detection_predictor_surya = None
+
+def initialize_detector():
+    global doctr_detector, recognition_predictor_surya, detection_predictor_surya
+    if doctr_detector is None:
+        doctr_detector = detection_predictor('db_mobilenet_v3_large', pretrained=True, assume_straight_pages=True, preserve_aspect_ratio=True)
+    if recognition_predictor_surya is None:
+        recognition_predictor_surya = RecognitionPredictor()
+    if detection_predictor_surya is None:
+        detection_predictor_surya = DetectionPredictor()
+    return doctr_detector, recognition_predictor_surya, detection_predictor_surya
+
+def get_cleaned_boxes(out, page):
+    """Scale doctr's normalized word boxes to pixels and drop noisy ones."""
+    h, w, _ = page.shape
+    cleaned_boxes = []
+    for box in out[0]['words']:
+        coords = np.array(box[:4])  # box corners (normalized)
+        coords *= np.array([w, h, w, h])
+        x1, y1, x2, y2 = coords
+        x_thresh = 0.7 * page.shape[1]
+        y_thresh = 0.3 * page.shape[0]
+        # Drop boxes in the top-right corner of the card
+        if x1 > x_thresh and y1 < y_thresh:
+            continue
+        # Drop boxes smaller than 100 px^2
+        if (x2 - x1) * (y2 - y1) < 100:
+            continue
+        cleaned_boxes.append(coords.astype('int'))
+    return cleaned_boxes
+
+# The most inefficient code in existence
+def merge_boxes_same_line(boxes, y_thresh=5, x_thresh=60):
+    # Sort boxes first by y and then by x
+    boxes = sorted(boxes, key=lambda b: (b[1], b[0]))
+    # Snap boxes within a threshold to the same y coordinate so that
+    # sorting groups them into rows
+    row_threshold = 15
+
+    aligned_boxes = []
+    current_row = []
+    current_y = boxes[0][1]
+
+    for box in boxes:
+        x1, y1, x2, y2 = box
+        if abs(y1 - current_y) <= row_threshold:
+            current_row.append(box)
+        else:
+            # Align all y1 and y2 in the row
+            avg_y1 = int(np.mean([b[1] for b in current_row]))
+            avg_y2 = int(np.mean([b[3] for b in current_row]))
+            aligned_boxes.extend([(b[0], avg_y1, b[2], avg_y2) for b in current_row])
+            current_row = [box]
+            current_y = y1
+
+    # Handle the last row
+    if current_row:
+        avg_y1 = int(np.mean([b[1] for b in current_row]))
+        avg_y2 = int(np.mean([b[3] for b in current_row]))
+        aligned_boxes.extend([(b[0], avg_y1, b[2], avg_y2) for b in current_row])
+    # After aligning all boxes on the y axis, re-sort them
+    aligned_boxes = sorted(aligned_boxes, key=lambda b: (b[1], b[0]))
+
+    # Merge horizontally adjacent boxes on the same row
+    merged = []
+    p_x1, p_y1, p_x2, p_y2 = aligned_boxes[0]
+    for i in range(1, len(aligned_boxes)):
+        x1, y1, x2, y2 = aligned_boxes[i]
+        if abs(p_y1 - y1) < y_thresh and abs(x1 - p_x2) < x_thresh:
+            p_x1 = min(p_x1, x1)
+            p_y1 = min(p_y1, y1)
+            p_x2 = max(p_x2, x2)
+            p_y2 = max(p_y2, y2)
+        else:
+            merged.append([p_x1, p_y1, p_x2, p_y2])
+            p_x1, p_y1, p_x2, p_y2 = x1, y1, x2, y2
+
+    merged.append([p_x1, p_y1, p_x2, p_y2])
+
+    return np.array(merged)
+
+def ocr_citizenship_utils(image_path: str) -> List[List[str]]:
+    doctr_detector, recognition_predictor_surya, detection_predictor_surya = initialize_detector()
+    page = cv2.imread(image_path)
+    page = cv2.convertScaleAbs(page, alpha=1.5, beta=0)  # boost contrast
+    page = cv2.resize(page, (720, 480))
+    out = doctr_detector([page])
+    cleaned_boxes = get_cleaned_boxes(out, page)
+    merged = merge_boxes_same_line(cleaned_boxes)
+    # Fuzzy match (up to 6 edits) for the card header "नेपाली नागरिकताको प्रमाणपत्र"
+    pattern = r'(नेपाली\s*नागरिकताको\s*प्रमाणपत्र){e<=6}'
+    prev_y = 0
+    start = False
+    first_start = True
+    y_thresh = 5
+    full_result = []
+    line_result = []
+
+    for boxes in merged[3:]:
+        x1, y1, x2, y2 = boxes[0], boxes[1], boxes[2], boxes[3]
+        crop = page[y1:y2, x1:x2]
+        pil_image = Image.fromarray(crop)
+
+        # OCR PART
+        langs = ["en", 'ne']
+        predictions = recognition_predictor_surya([pil_image], [langs], detection_predictor_surya)
+        text_combo = ''
+        for text_line in predictions[0].text_lines:
+            text_combo = text_combo + " " + text_line.text.strip()
+        text_combo = text_combo.strip()
+        # OCR PART END
+
+        # Skip everything above the card header; collect text only after it
+        if not start:
+            match = re.search(pattern, text_combo)
+            if match:
+                start = True
+            continue
+        if first_start:
+            first_start = False
+            prev_y = boxes[1]
+        # A jump in y means a new physical line on the card
+        if y1 - prev_y > y_thresh:
+            full_result.append(line_result)
+            line_result = []
+        line_result.append(text_combo)
+        prev_y = boxes[1]
+
+    # Flush the last collected line
+    if line_result:
+        full_result.append(line_result)
+
+    return full_result
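Of the new helpers, merge_boxes_same_line is the easiest to sanity-check in isolation, since it is pure box geometry. A minimal sketch on synthetic boxes, assuming utils.py is importable in the current environment (its module-level imports pull in torch, doctr, and surya):

# Synthetic word boxes (x1, y1, x2, y2): two on one text line, one below.
from utils import merge_boxes_same_line

boxes = [
    (10, 100, 60, 120),   # first word
    (70, 103, 140, 123),  # same line: small y offset, x gap < x_thresh
    (10, 160, 90, 180),   # next line
]
print(merge_boxes_same_line(boxes))
# Expected: the first two boxes merge into one spanning x=10..140;
# the third box survives unchanged.

The full pipeline, ocr_citizenship_utils(image_path=...), additionally needs the doctr and surya model weights, which are downloaded on first use.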