import os

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image, ImageEnhance
from transformers import AutoModel, AutoTokenizer
from ultralytics import YOLO

from backPrompt import main as main_b
from frontPrompt import main as main_f

# Point the Transformers cache at a local folder (set before any downloads)
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
cache_folder = "./.cache"

model_path = "best.pt"
modelY = YOLO(model_path)

path = "OpenGVLab/InternVL2_5-2B"

# Load the Hugging Face model and tokenizer globally (downloaded only once).
# Flash attention is CUDA-only, so enable it only when a GPU is available;
# the model itself is kept on CPU here.
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    # load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=torch.cuda.is_available(),
    trust_remote_code=True,
).eval().cpu()

tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False,
)


def preprocessing(image):
    """Apply three enhancement filters without resizing or cropping."""
    # Ensure the image is a 3-channel PIL Image
    if not isinstance(image, Image.Image):
        image = Image.fromarray(np.array(image))
    image = image.convert("RGB")

    # Apply enhancements
    image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
    image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
    image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness

    # Convert to a float tensor in [0, 1] without resizing; shape: [C, H, W]
    image_tensor = torch.from_numpy(np.array(image)).permute(2, 0, 1).float() / 255.0
    return image_tensor


def imageRotation(image):
    """Placeholder: rotation correction is not implemented yet."""
    return image


def detect_document(image):
    """Detect the front and back of the document using YOLO."""
    image = np.array(image)

    # Sanitize *before* inference and drawing: YOLO and OpenCV expect a
    # uint8 array of shape (H, W, C)
    if image.dtype != np.uint8:
        image = (image * 255).clip(0, 255).astype(np.uint8)  # Convert float to uint8
    if image.ndim > 3:
        image = np.squeeze(image)  # Drop singleton batch dimensions

    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]
            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            # Store bounding box with class and confidence
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes


def crop_image(image, bounding_boxes):
    """Crop the detected bounding boxes out of the image."""
    cropped_images = {}
    image = np.array(image)
    if image.dtype != np.uint8:
        # PIL cannot build an image from a float array, so convert first
        image = (image * 255).clip(0, 255).astype(np.uint8)
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped = image[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)
    return cropped_images


def vision_ai_api(image, doc_type):
    """Run the side-specific extraction prompt; return None for unknown sides."""
    if doc_type == "front":
        return main_f(image, model, tokenizer)
    if doc_type == "back":
        return main_b(image, model, tokenizer)
    return None


def ensure_numpy(image):
    """Ensure the image is a valid (H, W, C) NumPy array."""
    if isinstance(image, torch.Tensor):
        # Convert a [C, H, W] PyTorch tensor to an (H, W, C) NumPy array
        image = image.permute(1, 2, 0).cpu().numpy()
    elif isinstance(image, Image.Image):
        # Convert a PIL image to a NumPy array
        image = np.array(image)
    if image.ndim == 2:
        # Convert grayscale to a 3-channel image
        image = np.stack([image] * 3, axis=-1)
    return image
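# ---------------------------------------------------------------------------
# Hedged sketch: `frontPrompt.main` and `backPrompt.main` live in separate
# modules that are not shown here. A minimal front-side version might look
# like the code below. It assumes InternVL's documented
# `model.chat(tokenizer, pixel_values, question, generation_config)`
# interface and uses simplified single-tile 448x448 preprocessing (the
# official examples use dynamic tiling); the prompt text is illustrative.
#
# import torchvision.transforms as T
# from torchvision.transforms.functional import InterpolationMode
#
# IMAGENET_MEAN = (0.485, 0.456, 0.406)
# IMAGENET_STD = (0.229, 0.224, 0.225)
#
# def main(image, model, tokenizer):
#     transform = T.Compose([
#         T.Lambda(lambda img: img.convert("RGB")),
#         T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
#         T.ToTensor(),
#         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
#     ])
#     pixel_values = transform(image).unsqueeze(0).to(model.dtype)
#     question = "<image>\nExtract the fields printed on this license front."
#     generation_config = dict(max_new_tokens=512, do_sample=False)
#     return model.chat(tokenizer, pixel_values, question, generation_config)
# ---------------------------------------------------------------------------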
def predict(image):
    """Pipeline: preprocess -> detect -> crop -> vision AI API."""
    processed_image = preprocessing(image)
    rotated_image = ensure_numpy(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Call the vision AI separately for the front and back, if detected
    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {
        "front": front_result,
        "back": back_result,
    }
    # Prefer the front crop, then the back crop, then the annotated image
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results


iface = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Image(label="Detected Document"),
        gr.Textbox(label="Labels"),
        gr.JSON(label="Extracted Fields"),
    ],
    title="License Field Detection (Front & Back Card)",
)
iface.launch()
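# ---------------------------------------------------------------------------
# Hedged example: exercising the pipeline without the Gradio UI. This is a
# sketch only; "sample_card.jpg" is a hypothetical local file, not part of
# this repo. Run it in place of `iface.launch()` above.
#
# sample = Image.open("sample_card.jpg")
# annotated, labels, results = predict(sample)
# print(labels)
# print(results)
# ---------------------------------------------------------------------------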