import gradio as gr
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from ultralytics import YOLO

model_path = "best.pt"
model = YOLO(model_path)


def preprocessing(image):
    """Sharpen, boost contrast, and dim the image, then resize to 800 px wide."""
    image = Image.fromarray(np.array(image))  # Accept either a PIL image or a numpy array

    # Apply the three enhancement filters
    image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
    image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
    image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness

    # Resize to an 800 px width while preserving the aspect ratio
    width = 800
    aspect_ratio = image.height / image.width
    height = int(width * aspect_ratio)
    return image.resize((width, height))


def imageRotation(image):
    """Placeholder for rotation correction; currently returns the image unchanged."""
    return image


def detect_document(image):
    """Detect the front and back of the document with YOLO and annotate the image."""
    image = np.array(image)  # np.array() copies, so the drawing below never touches the caller's image
    results = model(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            class_name = model.names[int(box.cls[0])]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            # Store the box together with its class and confidence for cropping later
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Report any side of the card that was not detected
    missing_classes = {"front", "back"} - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image), labels, bounding_boxes


def crop_image(image, bounding_boxes):
    """Crop each detected box from the clean (unannotated) image, keyed by class name.

    If a class is detected more than once, the later detection overwrites the earlier one.
    """
    cropped_images = {}
    image = np.array(image)
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped_images[class_name] = Image.fromarray(image[y1:y2, x1:x2])
    return cropped_images


def vision_ai_api(image, doc_type):
    """Dummy Vision AI call; returns a fake JSON response."""
    return {
        "document_type": doc_type,
        "extracted_text": "Dummy OCR result for " + doc_type,
        "confidence": 0.99,
    }


def predict(image):
    """Pipeline: preprocess -> rotate -> detect -> crop -> Vision AI API."""
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)  # Placeholder for rotation correction
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    # Call the Vision AI separately for each side that was detected
    front_result = vision_ai_api(cropped_images["front"], "front") if "front" in cropped_images else None
    back_result = vision_ai_api(cropped_images["back"], "back") if "back" in cropped_images else None

    # Combine both API results into a single response
    api_results = {"front": front_result, "back": back_result}

    # Prefer the front crop, then the back crop, then the annotated full image
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, "\n".join(labels), api_results  # Join labels so the Textbox shows one per line


iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)",
)

if __name__ == "__main__":
    iface.launch()
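# A minimal smoke-test sketch for exercising the pipeline without the Gradio UI,
# left commented out so running the script still just launches the interface.
# The file name "sample_card.jpg" is a hypothetical placeholder; point it at any
# local photo of a license:
#
#   img = Image.open("sample_card.jpg")
#   annotated, labels_text, api_results = predict(img)
#   print(labels_text)
#   print(api_results)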