File size: 2,145 Bytes
31af2b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests
from smolagents import tool
import openai
import base64

@tool
def analyse_audio(audio_url: str) -> str:
    """
    Analyse the provided audio file, and return a description or transcription of the contents.

    Args:
        audio_url (str): The URL of the audio file to be analysed. Usually with an audio extension like mp3, aac, etc.

    Returns:
        str: description or transcription of the contents of the provided audio, or a short error message when the URL is not whitelisted or the file cannot be downloaded
    """
    # Imported locally so this block does not require a new file-level import.
    from urllib.parse import urlsplit

    allowed_host = "agents-course-unit4-scoring.hf.space"
    refusal = "the requested URL is not whitelisted, refusing to fetch data"

    # some security: compare the parsed scheme and hostname exactly. A plain
    # substring test would also accept URLs that merely *contain* the
    # whitelisted prefix (in a query string, in the userinfo part, or as the
    # start of a longer attacker-controlled hostname).
    try:
        parts = urlsplit(audio_url)
        host = parts.hostname
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 netloc)
        return refusal
    if parts.scheme != "https" or host != allowed_host:
        return refusal

    # A timeout keeps the tool from hanging forever on an unresponsive server;
    # network failures are reported as text, like the non-200 case below.
    try:
        resp = requests.get(audio_url, timeout=30)
    except requests.RequestException as exc:
        return f"failed to fetch the requested audio file: {exc}"
    if resp.status_code != 200:
        return f"failed to fetch the requested audio file: (status={resp.status_code})\n{resp.text}"

    # Pick the format label from the Content-Type header (ignoring parameters
    # such as "; charset=..."), falling back to the URL extension. Only "mp3"
    # and "wav" are produced here; anything unrecognised keeps the previous
    # default of "mp3".
    # todo filter mimetypes for security and correctness
    mime = (resp.headers.get("content-type") or "").split(";", 1)[0].strip().lower()
    if mime in {"audio/mpeg", "audio/mp3"}:
        audio_format = "mp3"
    elif mime in {"audio/wav", "audio/x-wav", "audio/wave", "audio/vnd.wave"}:
        audio_format = "wav"
    elif parts.path.lower().endswith(".wav"):
        audio_format = "wav"
    else:
        audio_format = "mp3"

    # The API expects the audio payload as a base64-encoded string.
    audio_b64 = base64.b64encode(resp.content).decode("utf-8")

    # Create the message to the GPT-4o audio model. Errors raised by the
    # OpenAI client are not caught here and propagate to the caller.
    response = openai.chat.completions.create(
        model="gpt-4o-audio-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please analyze the contents of this audio file. Provide a short (two sentence) description of the contents, and then output your analysis. The analysis should be in the most appropriate format. e.g. if the audio is a conversation, a transcription (indicating who says what) is best, for a monologue, maybe a simple transcription is best. if it's nature noises, describe what they are, the likely locations, etc."},
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": audio_b64,  # Use the base64 string here
                            "format": audio_format,
                        },
                    },
                ],
            }
        ],
        max_tokens=500,
    )

    # message.content may be None; keep the declared ``str`` return type.
    content = response.choices[0].message.content
    if content is None:
        return "the model returned no text content for the provided audio"
    return content