Spaces:

banao-tech
/

omniapi

Sleeping

File size: 5,373 Bytes

a1c932a
 
 
 
056fb25
 
a1c932a
d03c47c
ab332bc
a1c932a
ea2ade6
a1c932a
 
 
 
 
 
056fb25
ea2ade6
056fb25
 
 
9c616dc
056fb25
 
9c616dc
056fb25
 
 
 
 
 
ea2ade6
056fb25
beccd45
056fb25
 
 
beccd45
 
ea2ade6
 
 
056fb25
 
beccd45
 
ea2ade6
 
 
056fb25
a1c932a
056fb25
a1c932a
 
 
 
 
 
 
 
ab332bc
a1c932a
056fb25
ab332bc
 
 
 
 
 
 
 
a1c932a
712f1db
a1c932a
 
 
 
 
 
 
ab332bc
 
 
 
a1c932a
ab332bc
 
 
 
 
a1c932a
 
056fb25
ab332bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13c1ab1
ab332bc
13c1ab1
ab332bc
a1c932a
ab332bc
a1c932a
 
ab332bc
a1c932a
 
 
 
056fb25
a1c932a
 
 
 
 
 
 
 
 
 
 
 
056fb25
 
 
 
 
 
 
 
ab332bc
056fb25
 
 
 
 
 
 
 
a1c932a
056fb25
 
 
ab332bc

import asyncio  # Import asyncio for asynchronous operations
import base64
import io
import logging
import os
import tempfile

from fastapi import FastAPI, File, UploadFile, HTTPException
from PIL import Image
from pydantic import BaseModel
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

# Existing imports
from utils import (
    check_ocr_box,
    get_yolo_model,
    get_caption_model_processor,
    get_som_labeled_img,
)

# Configure logging
# Root logger runs at DEBUG so every processing step is visible in the logs.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Choose the compute device once: CUDA when torch reports it, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the YOLO model from weights/best.pt and place it on the chosen device.
yolo_model = get_yolo_model(model_path="weights/best.pt")
yolo_model = yolo_model.cuda() if device.type == "cuda" else yolo_model.cpu()

# Load caption model and processor.
#
# The processor is loaded outside the GPU try/except: a failure here has
# nothing to do with the GPU, and swallowing it would leave `processor`
# undefined and crash below with a NameError instead of the real error.
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True
)

model = None
if device.type == "cuda":
    # Only attempt GPU placement when CUDA is actually available; half
    # precision keeps the GPU memory footprint down.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            "weights/icon_caption_florence",
            torch_dtype=torch.float16,
            trust_remote_code=True,
        ).to(device)
    except Exception as e:
        logger.warning(f"Failed to load caption model on GPU: {e}. Falling back to CPU.")

if model is None:
    # CPU path (no CUDA, or GPU load failed). float32 is used here because
    # many CPU kernels do not support half precision.
    # NOTE(review): assumes the captioning code in utils feeds float32 inputs
    # when the model lives on the CPU — confirm against utils.
    model = AutoModelForCausalLM.from_pretrained(
        "weights/icon_caption_florence",
        torch_dtype=torch.float32,
        trust_remote_code=True,
    )

# Bundle passed to get_som_labeled_img as its caption model/processor argument.
caption_model_processor = {"processor": processor, "model": model}
logger.info("Finished loading models!!!")

# ASGI application; the route below is registered on it via @app.post.
app = FastAPI()

class ProcessResponse(BaseModel):
    """Response body returned by the /process_image endpoint.

    Every field is a plain string so the payload serializes without any
    custom encoders; structured values are flattened to text by `process`.
    """

    image: str  # Base64 encoded image (PNG bytes of the labeled image)
    parsed_content_list: str  # Parsed content entries joined with newlines
    label_coordinates: str  # str() rendering of the label coordinates object

async def process(image_input: Image.Image, box_threshold: float, iou_threshold: float) -> ProcessResponse:
    """Run OCR, YOLO box detection and captioning on one image.

    The image is written to a per-call file under ``imgs/`` because the
    helpers from ``utils`` take an image path. Blocking work (saving, OCR,
    labeling, PNG encoding) runs in the default executor so the event loop
    stays responsive.

    Args:
        image_input: Image to analyze.
        box_threshold: Forwarded to ``get_som_labeled_img`` as the box threshold.
        iou_threshold: Forwarded to ``get_som_labeled_img`` as the IoU threshold.

    Returns:
        A ``ProcessResponse`` holding the labeled image as base64-encoded PNG,
        the parsed content joined with newlines, and ``str(label_coordinates)``.

    Raises:
        Exception: Whatever the OCR or labeling helpers raise is propagated
            (labeling errors are logged with a traceback first).
    """
    image_dir = "imgs"
    os.makedirs(image_dir, exist_ok=True)

    # A unique file per call: a single shared path would let concurrent
    # requests overwrite each other's input while worker threads still read it.
    fd, image_save_path = tempfile.mkstemp(suffix=".png", dir=image_dir)
    os.close(fd)  # Only the path is needed; PIL reopens the file itself.

    loop = asyncio.get_running_loop()
    try:
        # Save the image without blocking the event loop.
        await loop.run_in_executor(None, image_input.save, image_save_path)
        logger.info(f"Saved image for processing: {image_save_path}")

        # Scale the overlay drawing parameters with the image width
        # (relative to a 3200 px reference). The width is read from the
        # in-memory image, so the saved file need not be reopened.
        box_overlay_ratio = image_input.size[0] / 3200
        draw_bbox_config = {
            "text_scale": 0.8 * box_overlay_ratio,
            "text_thickness": max(int(2 * box_overlay_ratio), 1),
            "text_padding": max(int(3 * box_overlay_ratio), 1),
            "thickness": max(int(3 * box_overlay_ratio), 1),
        }

        # OCR box processing (run in a thread pool to avoid blocking the event loop).
        # NOTE(review): arguments are positional; the labels below are the
        # intended parameter names — confirm against the signature in utils.
        ocr_bbox_rslt, _ = await loop.run_in_executor(
            None,
            check_ocr_box,
            image_save_path,
            False,  # display_img
            "xyxy",  # output_bb_format
            None,  # goal_filtering
            {"paragraph": False, "text_threshold": 0.9},  # easyocr_args
            True,  # use_paddleocr
        )
        text, ocr_bbox = ocr_bbox_rslt

        # Labeling and captioning (run in a thread pool).
        # NOTE(review): positional order must match get_som_labeled_img in
        # utils — confirm, since that signature is not visible here.
        try:
            dino_labled_img, label_coordinates, parsed_content_list = await loop.run_in_executor(
                None,
                get_som_labeled_img,
                image_save_path,
                yolo_model,
                box_threshold,  # BOX_TRESHOLD
                True,  # output_coord_in_ratio
                ocr_bbox,  # ocr_bbox
                draw_bbox_config,  # draw_bbox_config
                caption_model_processor,  # caption_model_processor
                text,  # ocr_text
                iou_threshold,  # iou_threshold
            )
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Error during labeling and captioning: {e}")
            raise
    finally:
        # Best-effort cleanup so per-request files do not accumulate.
        try:
            os.remove(image_save_path)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove temporary image {image_save_path}: {cleanup_error}")

    logger.info("Finished processing image with YOLO and captioning.")

    # The helper returns the labeled image as base64; decode it and re-encode
    # as PNG so the response format is fixed regardless of the helper's output.
    labeled_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
    # str() each entry so non-string items cannot break the join.
    parsed_content_list_str = "\n".join(str(item) for item in parsed_content_list)

    buffered = io.BytesIO()
    await loop.run_in_executor(None, labeled_image.save, buffered, "PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

    return ProcessResponse(
        image=img_str,
        parsed_content_list=parsed_content_list_str,
        label_coordinates=str(label_coordinates),
    )

@app.post("/process_image", response_model=ProcessResponse)
async def process_image(
    image_file: UploadFile = File(...),
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1,
):
    """Accept an uploaded image, run the labeling pipeline, and return the result.

    Args:
        image_file: Uploaded image file.
        box_threshold: Box threshold forwarded to ``process``.
        iou_threshold: IoU threshold forwarded to ``process``.

    Returns:
        The ``ProcessResponse`` produced by ``process``.

    Raises:
        HTTPException: 400 when the upload is empty or cannot be decoded as
            an image; 500 for any failure during processing.
    """
    try:
        contents = await image_file.read()
        if not contents:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # A bad upload is a client error, not a server fault. Pillow signals
        # undecodable data with OSError subclasses (and oversized images with
        # DecompressionBombError); a PIL image object is always truthy, so a
        # truthiness check on the decoded image would never catch these.
        try:
            image_input = Image.open(io.BytesIO(contents)).convert("RGB")
        except (OSError, ValueError, Image.DecompressionBombError) as e:
            logger.warning(f"Rejected upload {image_file.filename!r}: {e}")
            raise HTTPException(
                status_code=400, detail=f"Invalid image file: {e}"
            ) from e

        logger.info(f"Processing image: {image_file.filename}")
        logger.info(f"Image size: {image_input.size}")

        response = await process(image_input, box_threshold, iou_threshold)

        # Ensure the response contains an image
        if not response.image:
            raise ValueError("Empty image in response")

        logger.info("Processing complete, returning response.")
        return response

    except HTTPException:
        # Already a well-formed HTTP error (e.g. the 400s above); pass through.
        raise
    except Exception as e:
        # logger.exception records the traceback through the logging setup.
        logger.exception(f"Error processing image: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e