File size: 2,823 Bytes
7e327f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import modal
from transformers import AutoModelForObjectDetection, AutoImageProcessor
import torch
from smolagents import Tool

from .app import app
from .image import image


@app.cls(gpu="T4", image=image)
class RemoteObjectDetectionModalApp:
    """Modal class that runs a Hugging Face object-detection model remotely.

    The class is parameterized by ``model_name``. The model and its image
    processor are loaded once when a container starts (``load_model``) and
    reused by every subsequent ``forward`` call served by that container.
    """

    # Hugging Face model identifier, supplied when the class is instantiated.
    model_name: str = modal.parameter()

    @modal.enter()
    def load_model(self):
        """Load the processor and model once per container and pick a device.

        Loading here rather than inside ``forward`` avoids repeating
        ``from_pretrained`` for every image.
        """
        # The class requests a GPU; fall back to CPU so the code still runs
        # if CUDA is not visible for any reason.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.processor = AutoImageProcessor.from_pretrained(self.model_name)
        self.model = AutoModelForObjectDetection.from_pretrained(
            self.model_name
        )
        self.model.to(self.device)
        self.model.eval()

    @modal.method()
    def forward(self, image, threshold: float = 0.5):
        """Detect objects in ``image``.

        Args:
            image: A PIL image (its ``size`` is read as ``(width, height)``).
            threshold: Minimum confidence score for a detection to be kept.
                Defaults to 0.5.

        Returns:
            A list of dicts, one per detection, with keys ``"box"``
            (``[xmin, ymin, xmax, ymax]``), ``"score"`` (float) and
            ``"label"`` (class name looked up in the model config).
        """
        # Palette, grayscale or alpha images are normalized to three channels
        # before preprocessing.
        if image.mode != "RGB":
            image = image.convert("RGB")

        # Preprocess the image and place the tensors next to the model.
        inputs = self.processor(images=image, return_tensors="pt").to(
            self.device
        )
        with torch.no_grad():
            outputs = self.model(**inputs)

        # PIL reports (width, height); post-processing expects (height, width).
        target_sizes = torch.tensor([image.size[::-1]], device=self.device)
        results = self.processor.post_process_object_detection(
            outputs, target_sizes=target_sizes, threshold=threshold
        )[0]

        id2label = self.model.config.id2label
        return [
            {
                "box": box.tolist(),  # [xmin, ymin, xmax, ymax]
                "score": score.item(),
                "label": id2label[label.item()],
            }
            for score, label, box in zip(
                results["scores"], results["labels"], results["boxes"]
            )
        ]


class RemoteObjectDetectionTool(Tool):
    """smolagents tool that forwards object detection to the deployed Modal class.

    The tool looks up ``RemoteObjectDetectionModalApp`` by name on the app at
    construction time, and on each call instantiates it with the requested
    model name and invokes its ``forward`` method remotely.
    """

    name = "object_detection"
    description = """
        Given an image, detect objects and return bounding boxes.
        The image is a PIL image.
        The output is a list of dictionaries containing the bounding boxes with the following keys:
        - box: a list of 4 numbers [xmin, ymin, xmax, ymax]
        - score: a number between 0 and 1
        - label: a string
        The bounding boxes are in the format of [xmin, ymin, xmax, ymax].
        You need to provide the model name to use for object detection.
        The tool returns a list of bounding boxes for all the objects in the image.
    """

    # Argument schema exposed to the agent; keys match ``forward``'s parameters.
    inputs = {
        "image": {"type": "image", "description": "The image to detect objects in"},
        "model_name": {
            "type": "string",
            "description": "The name of the model to use for object detection",
        },
    }
    output_type = "object"

    def __init__(self):
        """Resolve a handle to the remote Modal class by app and class name."""
        super().__init__()
        remote_cls_name = RemoteObjectDetectionModalApp.__name__
        self.tool_class = modal.Cls.from_name(app.name, remote_cls_name)

    def forward(self, image, model_name: str):
        """Run remote detection on ``image`` with ``model_name``.

        Prints one line per detection and returns the list of detection
        dicts produced by the remote ``forward`` call.
        """
        # Bind the remote class to the requested model, then call it.
        self.tool = self.tool_class(model_name=model_name)
        detections = self.tool.forward.remote(image)

        for bbox in detections:
            print(
                f"Found {bbox['label']} with score: {bbox['score']} at box: {bbox['box']}"
            )

        return detections