import base64
import os
import re
import subprocess
from io import BytesIO

import gradio as gr
import pandas as pd
import requests
from groq import Groq
from huggingface_hub import InferenceClient
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools import Tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.utilities.tavily_search import TavilySearchAPIWrapper
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from pytube import YouTube
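
# Assumed third-party requirements (not pinned here): gradio, requests, pandas, pytube,
# groq, huggingface_hub, langchain, langchain-groq, langchain-community, langchain-core.
# The YouTube tool additionally requires the ffmpeg binary on PATH.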

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
TEMP_DIR = "/tmp"


def transcribe_audio_file(task_id: str) -> str:
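    """
    Download the audio file attached to the given task_id and transcribe it
    with Groq's Whisper endpoint. Returns the transcript text, or an error
    message string on failure.
    """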
print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}") |
|
try: |
|
file_url = f"{DEFAULT_API_URL}/files/{task_id}" |
|
audio_response = requests.get(file_url) |
|
audio_response.raise_for_status() |
|
audio_bytes = BytesIO(audio_response.content) |
|
audio_bytes.name = f"{task_id}.mp3" |
|
client = Groq(api_key=os.getenv("GROQ_API_KEY")) |
|
transcription = client.audio.transcriptions.create(file=audio_bytes, model="whisper-large-v3", response_format="text") |
|
return str(transcription) |
|
except Exception as e: |
|
return f"Error during audio file transcription: {e}" |
|
|
|
|
|
def transcribe_youtube_video(video_url: str) -> str:
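    """
    Download a YouTube video's audio track with pytube, convert it to MP3 with
    ffmpeg, and transcribe it with Groq's Whisper endpoint. Returns the
    transcript text, or an error message string on failure.
    """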
print(f"Tool 'transcribe_youtube_video' (ffmpeg) called with URL: {video_url}") |
|
video_path, audio_path = None, None |
|
try: |
|
os.makedirs(TEMP_DIR, exist_ok=True) |
|
yt = YouTube(video_url) |
|
stream = yt.streams.filter(only_audio=True).first() |
|
video_path = stream.download(output_path=TEMP_DIR) |
|
audio_path = os.path.join(TEMP_DIR, "output.mp3") |
|
command = ["ffmpeg", "-i", video_path, "-y", "-q:a", "0", "-map", "a", audio_path] |
|
subprocess.run(command, check=True, capture_output=True, text=True) |
|
client = Groq(api_key=os.getenv("GROQ_API_KEY")) |
|
with open(audio_path, "rb") as audio_file: |
|
transcription = client.audio.transcriptions.create(file=audio_file, model="whisper-large-v3", response_format="text") |
|
return str(transcription) |
|
except Exception as e: |
|
return f"Error during YouTube transcription: {e}" |
|
finally: |
|
if video_path and os.path.exists(video_path): os.remove(video_path) |
|
if audio_path and os.path.exists(audio_path): os.remove(audio_path) |
|
|
|
|
|
def analyze_image_from_task_id(task_id: str) -> str:
    """
    Downloads an image file for a given task_id and analyzes it using a
    vision-language model. Use this tool ONLY when a question explicitly
    mentions an image.
    """
    print(f"Tool 'analyze_image_from_task_id' called with task_id: {task_id}")
    try:
        file_url = f"{DEFAULT_API_URL}/files/{task_id}"
        print(f"Downloading image from: {file_url}")
        response = requests.get(file_url, timeout=30)
        response.raise_for_status()

        # Send the image as an OpenAI-style chat message with a base64 data URL.
        # This assumes llava-1.5-7b-hf is reachable through the chat-completion
        # task of the HF Inference API; substitute any hosted vision model if not.
        vlm_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf", token=os.getenv("HF_TOKEN"))
        print("Analyzing image with LLaVA...")
        mime = response.headers.get("Content-Type", "image/png")
        data_url = f"data:{mime};base64,{base64.b64encode(response.content).decode('utf-8')}"
        completion = vlm_client.chat_completion(
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": "Describe the image in detail."},
                ],
            }],
            max_tokens=512,
        )
        result = completion.choices[0].message.content
        print(f"Image analysis successful. Result: {result}")
        return result
    except Exception as e:
        return f"Error during image analysis: {e}"


class LangChainAgent:
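    """Tool-calling agent that routes each question to web search, audio, video, or vision tools."""
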
    def __init__(self, groq_api_key: str, tavily_api_key: str, hf_token: str):
        # hf_token is accepted for interface symmetry; the image tool reads HF_TOKEN from the environment.
        # If llama3-70b-8192 has been retired by Groq, swap in a current model ID.
        self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)

        self.tools = [
            # The Tavily key goes through the API wrapper rather than as a direct kwarg on the tool.
            TavilySearchResults(
                name="web_search",
                max_results=3,
                api_wrapper=TavilySearchAPIWrapper(tavily_api_key=tavily_api_key),
                description="A search engine for finding up-to-date information on the internet.",
            ),
            Tool(name="audio_file_transcriber", func=transcribe_audio_file, description="Use this for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id."),
            Tool(name="youtube_video_transcriber", func=transcribe_youtube_video, description="Use this for questions with a youtube.com URL. Input MUST be the URL."),
            Tool(name="image_analyzer", func=analyze_image_from_task_id, description="Use this for questions mentioning an image. Input MUST be the task_id."),
        ]

        prompt = ChatPromptTemplate.from_messages([
            ("system", (
                "You are a powerful problem-solving agent. Your goal is to answer the user's question accurately. "
                "You have access to a web search tool, an audio file transcriber, a YouTube video transcriber, and an image analyzer.\n\n"
                "**REASONING PROCESS:**\n"
                "1. **Analyze the question:** Determine if a tool is needed. Is it a general knowledge question, or does it mention a specific file type (audio, video, image) or URL?\n"
                "2. **Select ONE tool based on the question:**\n"
                "   - For general knowledge, facts, or current events: use `web_search`.\n"
                "   - For an audio file, .mp3, or voice memo: use `audio_file_transcriber` with the `task_id`.\n"
                "   - For a youtube.com URL: use `youtube_video_transcriber` with the URL.\n"
                "   - For an image: use `image_analyzer` with the `task_id`.\n"
                "   - For math or simple logic: answer directly.\n"
                "3. **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER."
            )),
            ("human", "Question: {input}\nTask ID: {task_id}"),
            # Intermediate tool calls and observations are injected here at runtime.
            ("placeholder", "{agent_scratchpad}"),
        ])

        agent = create_tool_calling_agent(self.llm, self.tools, prompt)
        self.agent_executor = AgentExecutor(agent=agent, tools=self.tools, verbose=True, handle_parsing_errors=True)

    def __call__(self, question: str, task_id: str) -> str:
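        """Run the agent on a single question and return its final answer as a string."""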
        urls = re.findall(r'https?://[^\s]+', question)
        input_for_agent = {"input": question, "task_id": task_id}
        if urls and "youtube.com" in urls[0]:
            # Surface the URL as an extra key; the prompt only references
            # {input} and {task_id}, so this is informational at best.
            input_for_agent["video_url"] = urls[0]
        try:
            response = self.agent_executor.invoke(input_for_agent)
            return response.get("output", "Agent failed to produce an answer.")
        except Exception as e:
            return f"Agent execution failed with an error: {e}"


def run_and_submit_all(profile: gr.OAuthProfile | None):
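    """
    Build the agent, run it on every question from the scoring server, submit
    the answers, and return a status message plus a DataFrame of results.
    """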
    space_id = os.getenv("SPACE_ID")
    if not profile:
        return "Please Login to Hugging Face with the button.", None
    username = profile.username

    # 1. Instantiate the agent, failing fast if any API key is missing.
    try:
        groq_api_key = os.getenv("GROQ_API_KEY")
        tavily_api_key = os.getenv("TAVILY_API_KEY")
        hf_token = os.getenv("HF_TOKEN")
        if not all([groq_api_key, tavily_api_key, hf_token]):
            raise ValueError("An API key (GROQ, TAVILY, or HF) is missing.")
        agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key, hf_token=hf_token)
    except Exception as e:
        return f"Error initializing agent: {e}", None

    # 2. Fetch the evaluation questions.
    questions_url = f"{DEFAULT_API_URL}/questions"
    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions_data = response.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # 3. Run the agent on every question.
    results_log, answers_payload = [], []
    for item in questions_data:
        task_id, q_text = item.get("task_id"), item.get("question")
        if not task_id or not q_text:
            continue
        answer = agent(question=q_text, task_id=task_id)
        answers_payload.append({"task_id": task_id, "submitted_answer": answer})
        results_log.append({"Task ID": task_id, "Question": q_text, "Submitted Answer": answer})

    # 4. Submit all answers for scoring.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    submission_data = {"username": username, "agent_code": agent_code, "answers": answers_payload}
    submit_url = f"{DEFAULT_API_URL}/submit"
    try:
        response = requests.post(submit_url, json=submission_data, timeout=300)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\nUser: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, pd.DataFrame(results_log)
    except Exception as e:
        return f"Submission Failed: {e}", pd.DataFrame(results_log)


with gr.Blocks() as demo:
    gr.Markdown("# Ultimate Agent Runner (Search, Audio, Video, Vision)")
    gr.Markdown("This agent can search, transcribe audio files, transcribe YouTube videos, and analyze images.")
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    # No inputs are declared: Gradio injects the logged-in user's profile via the gr.OAuthProfile type hint.
    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])


if __name__ == "__main__":
    print("\n" + "-" * 30 + " App Starting " + "-" * 30)
    for key in ["GROQ_API_KEY", "TAVILY_API_KEY", "HF_TOKEN"]:
        print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
    print("-" * (60 + len(" App Starting ")) + "\n")
    demo.launch(debug=True, share=False)