File size: 6,171 Bytes
011bd45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import datetime
import json
import cv2
import httpx
from app.config import env
from app.utils import image_w_box, encode_image
from agents import RunContextWrapper, function_tool
from app.memory import Memory,Snapshot




def task(name, image):
    """
    Upload an encoded frame to the remote task endpoint and return its result.

    Args:
        name (str): Name of the remote model/task to invoke (e.g. ``env.model_loc``).
        image: Encoded image buffer (e.g. the ndarray produced by
            ``cv2.imencode``); its raw bytes are uploaded as a JPEG file.

    Returns:
        The ``result`` field of the endpoint's JSON response.

    Raises:
        httpx.HTTPStatusError: If the endpoint responds with an error status.
    """
    resp = httpx.post(
        env.end_task,  # plain attribute access; the f-string wrapper was redundant
        data={"name": name},
        files={"file": ("frame.jpg", image.tobytes(), "image/jpeg")},
        timeout=10,
        headers={"Authorization": env.api_key},
    )
    resp.raise_for_status()
    return resp.json()['result']

def completion(messages, model):
    """Run a chat completion for *messages* on *model* and return the reply text."""
    reply = env.client.chat.completions.create(model=model, messages=messages)
    return reply.choices[0].message.content


def completion_image(images, prompt, model):
    """
    Ask the multimodal model *prompt* about one or more images.

    Args:
        images: Iterable of raw frames accepted by ``encode_image``.
        prompt (str): Instruction or question sent alongside the images.
        model (str): Model identifier passed through to ``completion``.

    Returns:
        str: The model's text reply.
    """
    # Build ONE user message containing the prompt followed by every image,
    # instead of one message per image with the prompt duplicated each time.
    # (Identical payload for single-image callers; correct multi-image shape
    # for the video_* tools.)
    content = [{"type": "text", "text": prompt}]
    for b64, mime in map(encode_image, images):
        content.append(
            {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
        )
    messages = [{"role": "user", "content": content}]
    return completion(messages, model=model)

# ------------------------ Function Tools ------------------------
@function_tool
def caption(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Generate a descriptive caption for the most recent frame, record it as a
    snapshot, and return it.

    Returns:
        str:
            The generated caption for the current view (i.e., the latest frame).
    """
    memory = wrapper.context
    latest_frame = memory.frames[-1]
    text = completion_image(
        [latest_frame],
        "Describe the image with rich details but in a concise manner.",
        env.model_mllm,
    )
    memory.snapshots.append(Snapshot(sender='caption', data=text))
    return text

@function_tool
def ocr(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Perform OCR on the most recent frame, record it as a snapshot, and return
    the extracted text.

    Returns:
        str:
            The extracted text from the current view (i.e., the latest frame).
    """
    memory = wrapper.context
    latest_frame = memory.frames[-1]
    text = completion_image(
        [latest_frame],
        "Extract all text from image/payslip without miss anything.",
        env.model_mllm,
    )
    memory.snapshots.append(Snapshot(sender='ocr', data=text))
    return text

@function_tool
def qa(wrapper: RunContextWrapper[Memory], question: str) -> str:
    """
    Answer a question based on the most recent frame, record it as a snapshot,
    and return the answer.

    Args:
        question (str): The question to be answered.
    Returns:
        str:
            The answer to the question based on the current view (i.e., the
            latest frame).
    """
    memory = wrapper.context
    latest_frame = memory.frames[-1]
    answer = completion_image(
        [latest_frame],
        f"Answer the question based on the image. Question: {question}",
        env.model_mllm,
    )
    memory.snapshots.append(Snapshot(sender='qa', data=answer))
    return answer


@function_tool
def localize(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Localize all objects in the most recent frame, record an annotated
    snapshot, and return the detections.

    Returns:
        str:
            The localization result for the current view (i.e., the latest
            frame); the format is {name: list of bboxes}, JSON-encoded.

    Raises:
        RuntimeError: If the frame cannot be JPEG-encoded.
    """
    mem = wrapper.context
    frame = mem.frames[-1]
    # cv2.imencode signals failure via its first return value; the original
    # discarded it and would have uploaded a garbage buffer on failure.
    ok, img = cv2.imencode('.jpg', frame)
    if not ok:
        raise RuntimeError("Failed to JPEG-encode the current frame")
    objxbox = task(env.model_loc, img)
    mem.snapshots.append(Snapshot(sender='localize', data=image_w_box(frame, objxbox)))
    return json.dumps(objxbox, indent=2)


@function_tool
def time(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Get the current time, record it as a snapshot, and return the time.

    Returns:
        str:
            The current local time, formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    now_text = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    wrapper.context.snapshots.append(Snapshot(sender='time', data=now_text))
    return now_text

def sample_frames(mem: Memory, n: int) -> list:
    """
    Sample frames from the past n seconds of video.

    Frames are taken from the tail of ``mem.frames`` (assumed stored at
    ``env.fps`` frames per second) and thinned to roughly two per second.

    Args:
        mem (Memory): The memory context containing frames.
        n (int): Number of seconds to look back for video frames.
    Returns:
        list: Sampled frames from the video sequence (empty if none stored).
    """
    if not mem.frames:
        return []

    available_frames = min(n * env.fps, len(mem.frames))
    recent_frames = mem.frames[-available_frames:]
    # Guard the slice step: env.fps // 2 evaluates to 0 when fps < 2, which
    # would raise "ValueError: slice step cannot be zero".
    step = max(1, env.fps // 2)
    return recent_frames[::step]

@function_tool
def video_caption(wrapper: RunContextWrapper[Memory], n: int = 2) -> str:
    """
    Generate a descriptive caption for a video sequence from the past n
    seconds of frames, record it as a snapshot, and return it.

    Args:
        n (int): Number of seconds to look back for video frames
            (defaults to 2).
    Returns:
        str:
            The generated caption for the video sequence from the past n
            seconds, or a fallback message when no frames are available.
    """
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)

    if not sampled_frames:
        return "No frames available for video caption."

    prompt = "Describe this video sequence focusing on any changes or actions that occur over time."
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video caption', data=result))
    return result

@function_tool
def video_qa(wrapper: RunContextWrapper[Memory], question: str, n: int = 2) -> str:
    """
    Answer a question based on a video sequence from the past n seconds of
    frames, record the answer as a snapshot, and return it.

    Args:
        question (str): The question to be answered.
        n (int): Number of seconds to look back for video frames
            (defaults to 2).
    Returns:
        str:
            The answer to the question based on the video sequence from the
            past n seconds, or a fallback message when no frames are available.
    """
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)

    if not sampled_frames:
        return "No frames available for video Q&A."

    prompt = f"Answer the question based on this video sequence. Question: {question}"
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video qa', data=result))
    return result