Spaces:
Sleeping
Sleeping
File size: 2,145 Bytes
31af2b2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import requests
from smolagents import tool
import openai
import base64
@tool
def analyse_audio(audio_url: str) -> str:
    """
    Analyse the provided audio file, and return a description or transcription of the contents.

    The file is downloaded from the whitelisted scoring host, base64-encoded and sent to an
    OpenAI audio-capable chat model. Problems with the URL or the download are reported as a
    plain string rather than raised, so the calling agent can read the failure reason.

    Args:
        audio_url (str): The URL of the audio file to be analysed. Usually with an audio extension like mp3, aac, etc.

    Returns:
        str: description or transcription of the contents of the provided audio, or a short
            error message when the URL is refused or the download fails.
    """
    # Imported locally so the tool function stays self-contained.
    from urllib.parse import urlparse

    allowed_host = "agents-course-unit4-scoring.hf.space"
    refusal = "the requested URL is not whitelisted, refusing to fetch data"

    # Security: compare the parsed scheme and hostname exactly. A substring test
    # would also accept URLs that merely mention the whitelisted origin in a
    # query string, in the userinfo part, or as a prefix of a longer hostname.
    try:
        parsed = urlparse(audio_url)
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. broken IPv6 literals).
        return refusal
    if parsed.scheme != "https" or host != allowed_host:
        return refusal

    # A timeout keeps a stalled server from blocking the agent forever.
    try:
        resp = requests.get(audio_url, timeout=30)
    except requests.RequestException as exc:
        return f"failed to fetch the requested audio file: {exc}"
    if resp.status_code != 200:
        return f"failed to fetch the requested audio file: (status={resp.status_code})\n{resp.text}"

    # Derive the audio format from the Content-Type header, ignoring any
    # parameters such as "; charset=...". Anything that is not recognised as
    # WAV falls back to "mp3", which is what this tool always sent before.
    content_type = (resp.headers.get("content-type") or "").split(";", 1)[0].strip().lower()
    wav_types = {"audio/wav", "audio/x-wav", "audio/wave", "audio/vnd.wave"}
    audio_format = "wav" if content_type in wav_types else "mp3"

    # The API expects the audio payload as a base64 text string.
    audio_b64 = base64.b64encode(resp.content).decode("utf-8")

    # Build the request for the audio-capable GPT-4o model.
    response = openai.chat.completions.create(
        model="gpt-4o-audio-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please analyze the contents of this audio file. Provide a short (two sentence) description of the contents, and then output your analysis. The analysis should be in the most appropriate format. e.g. if the audio is a conversation, a transcription (indicating who says what) is best, for a monologue, maybe a simple transcription is best. if it's nature noises, describe what they are, the likely locations, etc."},
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": audio_b64,
                            "format": audio_format,
                        },
                    },
                ],
            }
        ],
        max_tokens=500,
    )
    return response.choices[0].message.content
|