|
import io
import json
import os
from pathlib import Path

import cv2
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import supervision as sv
import torch
from PIL import Image
from ultralytics import YOLO
|
|
|
|
|
# Ensure a local cache directory for the model weights exists.
os.makedirs("models", exist_ok=True)

# Download the fine-tuned DocLayNet weights on first run only.
model_path = "models/yolov11x_best.pt"
if not os.path.exists(model_path):
    url = "https://github.com/moured/YOLOv11-Document-Layout-Analysis/releases/download/doclaynet_weights/yolov11x_best.pt"
    print(f"Downloading model from {url}...")
    # Stream the download in chunks and fail fast on HTTP errors so an
    # error page is never silently written to disk as a bogus weights
    # file, and the multi-hundred-MB checkpoint is not buffered in RAM.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(model_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    print(f"Model downloaded to {model_path}")

# Load the YOLO model once at module import; reused by every request.
model = YOLO(model_path)
print("Model loaded successfully!")
|
|
|
|
|
# DocLayNet class names, ordered to match the model's integer class ids
# (index i corresponds to class id i in the detections).
CLASSES = ["Caption", "Footnote", "Formula", "List-item", "Page-footer", "Page-header",
           "Picture", "Section-header", "Table", "Text", "Title"]

# Subset of classes reported as "visual elements" in the table/JSON output.
VISUAL_ELEMENTS = ["Picture", "Caption", "Table", "Formula"]

# Color palette used to distinguish classes in the annotated image.
# NOTE(review): ColorPalette.default() is the pre-0.17 supervision API;
# newer releases expose ColorPalette.DEFAULT instead — confirm the pinned
# supervision version supports this call.
COLORS = sv.ColorPalette.default()

# Annotator that draws labeled bounding boxes onto a copy of the input image.
box_annotator = sv.BoxAnnotator(color=COLORS)
|
|
|
def predict_layout(image):
    """Run document-layout detection and extract visual-element boxes.

    Args:
        image: Input document page as a numpy array or PIL image, or
            None when the UI has no image loaded.

    Returns:
        Tuple of (annotated image as a numpy array, DataFrame of
        visual-element bounding boxes, JSON string of the same boxes),
        or (None, None, None) when no image is supplied.
    """
    if image is None:
        return None, None, None

    # Gradio may hand us either a numpy array or a PIL image.
    img = image if isinstance(image, np.ndarray) else np.array(image)

    # Run inference; a single input image yields a single result object.
    results = model(img)[0]
    detections = sv.Detections.from_ultralytics(results)

    # One "ClassName 0.97"-style label per detection for the overlay.
    labels = [
        f"{CLASSES[class_id]} {confidence:.2f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]

    # NOTE(review): passing labels= to BoxAnnotator.annotate is the
    # pre-0.16 supervision API — confirm against the pinned version.
    annotated_image = box_annotator.annotate(
        scene=img.copy(),
        detections=detections,
        labels=labels
    )

    # Keep only the classes considered visual elements
    # (pictures, tables, formulas, captions).
    boxes_data = []
    for class_id, xyxy, confidence in zip(detections.class_id, detections.xyxy, detections.confidence):
        class_name = CLASSES[class_id]
        if class_name not in VISUAL_ELEMENTS:
            continue
        x1, y1, x2, y2 = map(int, xyxy)
        boxes_data.append({
            "class": class_name,
            "confidence": float(confidence),
            "x1": x1,
            "y1": y1,
            "x2": x2,
            "y2": y2,
            "width": x2 - x1,
            "height": y2 - y1,
        })

    # Passing columns= explicitly keeps the column order stable and
    # yields a correctly-shaped empty frame when nothing was detected.
    columns = ["class", "confidence", "x1", "y1", "x2", "y2", "width", "height"]
    df = pd.DataFrame(boxes_data, columns=columns)

    json_data = json.dumps(boxes_data, indent=2)

    return annotated_image, df, json_data
|
|
|
|
|
def download_json(json_data):
    """Write the detection JSON to a file and return its path for gr.File.

    The wired ``gr.File`` output component expects a filesystem path;
    returning the raw JSON text (as the previous implementation did)
    produces no usable download.

    Args:
        json_data: JSON string from predict_layout, or the parsed
            object if the gr.JSON component hands back Python data.

    Returns:
        Path to a temporary .json file, or None when there is no data.
    """
    if not json_data:
        return None
    import tempfile
    # gr.JSON may return a parsed object rather than the original string.
    text = json_data if isinstance(json_data, str) else json.dumps(json_data, indent=2)
    fd, path = tempfile.mkstemp(prefix="layout_", suffix=".json")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(text)
    return path
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# YOLOv11x Document Layout Analysis for Visual Elements")
    gr.Markdown("Upload a document image to extract visual elements including diagrams, tables, formulas, and captions.")

    # Input column (image upload + trigger button) beside the annotated output.
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input Document")
            analyze_btn = gr.Button("Analyze Layout", variant="primary")

        with gr.Column():
            output_image = gr.Image(label="Detected Layout")

    # Tabular and raw-JSON views of the detections, plus a download widget.
    with gr.Row():
        with gr.Column():
            output_table = gr.DataFrame(label="Visual Elements Bounding Boxes")
            json_output = gr.JSON(label="JSON Output")
            download_btn = gr.Button("Download JSON")
            json_file = gr.File(label="Download JSON File", interactive=False)

    # Run detection and fan the results out to image/table/JSON widgets.
    analyze_btn.click(
        fn=predict_layout,
        inputs=input_image,
        outputs=[output_image, output_table, json_output]
    )

    # NOTE(review): gr.File expects a filesystem path from its callback;
    # confirm download_json returns a path on disk, not raw JSON text,
    # or this download will not work.
    download_btn.click(
        fn=download_json,
        inputs=[json_output],
        outputs=[json_file]
    )

    gr.Markdown("## Detected Visual Elements")
    gr.Markdown("""
    This application detects and extracts coordinates for the following visual elements:

    - **Pictures**: Diagrams, photos, illustrations, flowcharts, etc.
    - **Tables**: Structured data presented in rows and columns
    - **Formulas**: Mathematical equations and expressions
    - **Captions**: Text describing pictures or tables

    For each element, the system returns:
    - Element type (class)
    - Confidence score (0-1)
    - Coordinates (x1, y1, x2, y2)
    - Width and height in pixels
    """)

    gr.Markdown("## About")
    gr.Markdown("""
    This demo uses YOLOv11x for document layout analysis, with a focus on extracting visual elements.
    Model from [moured/YOLOv11-Document-Layout-Analysis](https://github.com/moured/YOLOv11-Document-Layout-Analysis)
    """)

    # Example document images fetched from the model repository.
    gr.Examples(
        examples=[
            "https://raw.githubusercontent.com/moured/YOLOv11-Document-Layout-Analysis/main/assets/sample1.png",
            "https://raw.githubusercontent.com/moured/YOLOv11-Document-Layout-Analysis/main/assets/sample2.png",
        ],
        inputs=input_image
    )

# Launch the app (blocking call; serves the UI on the default port).
demo.launch()