dalybuilds committed
Commit 3a1e7f5 · verified · 1 Parent(s): 991459e

Update app.py

Files changed (1)
  1. app.py +62 -53
app.py CHANGED
@@ -4,12 +4,14 @@ import requests
  import pandas as pd
  from io import BytesIO
  import re

- # --- Video & Audio Tool Imports ---
  from pytube import YouTube
- import moviepy.editor as mp

- # --- LangChain & Dependency Imports ---
  from groq import Groq
  from langchain_groq import ChatGroq
  from langchain.agents import AgentExecutor, create_tool_calling_agent
@@ -17,18 +19,13 @@ from langchain_tavily import TavilySearchResults
  from langchain_core.prompts import ChatPromptTemplate
  from langchain.tools import Tool

-
  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
  TEMP_DIR = "/tmp"

-
  # --- Tool Definition: Audio File Transcription ---
  def transcribe_audio_file(task_id: str) -> str:
-     """
-     Downloads an audio file (.mp3) for a given task_id, transcribes it, and returns the text.
-     Use this tool ONLY when a question explicitly mentions an audio file, .mp3, recording, or voice memo.
-     """
      print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}")
      try:
          file_url = f"{DEFAULT_API_URL}/files/{task_id}"
@@ -36,31 +33,25 @@ def transcribe_audio_file(task_id: str) -> str:
          audio_response.raise_for_status()
          audio_bytes = BytesIO(audio_response.content)
          audio_bytes.name = f"{task_id}.mp3"
-
          client = Groq(api_key=os.getenv("GROQ_API_KEY"))
          transcription = client.audio.transcriptions.create(file=audio_bytes, model="whisper-large-v3", response_format="text")
          return str(transcription)
      except Exception as e:
          return f"Error during audio file transcription: {e}"

- # --- Tool Definition: Video Transcription ---
  def transcribe_youtube_video(video_url: str) -> str:
-     """
-     Downloads a YouTube video from a URL, extracts its audio, and transcribes it to text.
-     Use this tool ONLY when a question provides a youtube.com URL.
-     """
-     print(f"Tool 'transcribe_youtube_video' called with URL: {video_url}")
      video_path, audio_path = None, None
      try:
          os.makedirs(TEMP_DIR, exist_ok=True)
          yt = YouTube(video_url)
-         stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
          video_path = stream.download(output_path=TEMP_DIR)
-
-         video_clip = mp.VideoFileClip(video_path)
-         audio_path = os.path.join(TEMP_DIR, "temp_audio.mp3")
-         video_clip.audio.write_audiofile(audio_path, codec='mp3', logger=None)
-
          client = Groq(api_key=os.getenv("GROQ_API_KEY"))
          with open(audio_path, "rb") as audio_file:
              transcription = client.audio.transcriptions.create(file=audio_file, model="whisper-large-v3", response_format="text")
@@ -71,43 +62,61 @@ def transcribe_youtube_video(video_url: str) -> str:
          if video_path and os.path.exists(video_path): os.remove(video_path)
          if audio_path and os.path.exists(audio_path): os.remove(audio_path)

  # --- Agent Definition ---
  class LangChainAgent:
-     def __init__(self, groq_api_key: str, tavily_api_key: str):
          self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)

          self.tools = [
-             TavilySearchResults(
-                 name="web_search",
-                 max_results=3,
-                 tavily_api_key=tavily_api_key,
-                 description="A search engine for finding up-to-date information, facts, and news on the internet."
-             ),
-             Tool(
-                 name="audio_file_transcriber",
-                 func=transcribe_audio_file,
-                 description="Use this ONLY for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id.",
-             ),
-             Tool(
-                 name="youtube_video_transcriber",
-                 func=transcribe_youtube_video,
-                 description="Use this ONLY for questions providing a youtube.com URL. Input MUST be the URL.",
-             ),
          ]

          prompt = ChatPromptTemplate.from_messages([
              ("system", (
                  "You are a powerful problem-solving agent. Your goal is to answer the user's question accurately. "
-                 "You have access to a web search tool, an audio file transcriber, and a YouTube video transcriber.\n\n"
                  "**REASONING PROCESS:**\n"
-                 "1. **Analyze the question:** Is it a general knowledge question, or does it mention a file/URL?\n"
-                 "2. **Select ONE tool:**\n"
-                 " - If the question requires current events, facts, or general knowledge, use `web_search`.\n"
-                 " - If the question *explicitly* mentions an audio file, .mp3, or voice memo, use `audio_file_transcriber` with the provided `task_id`.\n"
-                 " - If the question *explicitly* provides a `youtube.com` URL, use `youtube_video_transcriber` with that URL.\n"
-                 " - If no tool is needed (e.g., math, logic puzzles), answer directly.\n"
-                 "3. **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER. Do not explain your actions or apologize for errors."
              )),
              ("human", "Question: {input}\nTask ID: {task_id}"),
              ("placeholder", "{agent_scratchpad}"),
@@ -121,7 +130,6 @@ class LangChainAgent:
          input_for_agent = {"input": question, "task_id": task_id}
          if urls and "youtube.com" in urls[0]:
              input_for_agent['video_url'] = urls[0]
-
          try:
              response = self.agent_executor.invoke(input_for_agent)
              return response.get("output", "Agent failed to produce an answer.")
@@ -136,8 +144,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
      try:
          groq_api_key = os.getenv("GROQ_API_KEY")
          tavily_api_key = os.getenv("TAVILY_API_KEY")
-         if not all([groq_api_key, tavily_api_key]): raise ValueError("GROQ or TAVILY API key is missing.")
-         agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key)
      except Exception as e: return f"Error initializing agent: {e}", None

      questions_url = f"{DEFAULT_API_URL}/questions"
@@ -171,8 +180,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):

  # --- Gradio Interface ---
  with gr.Blocks() as demo:
-     gr.Markdown("# Ultimate Agent Runner (Search + Audio + Video)")
-     gr.Markdown("This agent can search, transcribe audio files, and transcribe YouTube videos.")
      gr.LoginButton()
      run_button = gr.Button("Run Evaluation & Submit All Answers")
      status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
@@ -181,7 +190,7 @@ with gr.Blocks() as demo:

  if __name__ == "__main__":
      print("\n" + "-"*30 + " App Starting " + "-"*30)
-     for key in ["GROQ_API_KEY", "TAVILY_API_KEY"]:
          print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
      print("-"*(60 + len(" App Starting ")) + "\n")
      demo.launch(debug=True, share=False)
 
  import pandas as pd
  from io import BytesIO
  import re
+ import subprocess
+ import base64

+ # --- Tool-specific Imports ---
  from pytube import YouTube
+ from langchain_huggingface import HuggingFaceInferenceAPI

+ # --- LangChain & Groq Imports ---
  from groq import Groq
  from langchain_groq import ChatGroq
  from langchain.agents import AgentExecutor, create_tool_calling_agent

  from langchain_core.prompts import ChatPromptTemplate
  from langchain.tools import Tool

  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
  TEMP_DIR = "/tmp"

  # --- Tool Definition: Audio File Transcription ---
  def transcribe_audio_file(task_id: str) -> str:
+     # (This function is complete and correct from the previous version)
      print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}")
      try:
          file_url = f"{DEFAULT_API_URL}/files/{task_id}"

          audio_response.raise_for_status()
          audio_bytes = BytesIO(audio_response.content)
          audio_bytes.name = f"{task_id}.mp3"
          client = Groq(api_key=os.getenv("GROQ_API_KEY"))
          transcription = client.audio.transcriptions.create(file=audio_bytes, model="whisper-large-v3", response_format="text")
          return str(transcription)
      except Exception as e:
          return f"Error during audio file transcription: {e}"

+ # --- Tool Definition: Video Transcription via FFmpeg ---
  def transcribe_youtube_video(video_url: str) -> str:
+     # (This function is complete and correct from the previous version)
+     print(f"Tool 'transcribe_youtube_video' (ffmpeg) called with URL: {video_url}")
      video_path, audio_path = None, None
      try:
          os.makedirs(TEMP_DIR, exist_ok=True)
          yt = YouTube(video_url)
+         stream = yt.streams.filter(only_audio=True).first()
          video_path = stream.download(output_path=TEMP_DIR)
+         audio_path = os.path.join(TEMP_DIR, "output.mp3")
+         command = ["ffmpeg", "-i", video_path, "-y", "-q:a", "0", "-map", "a", audio_path]
+         subprocess.run(command, check=True, capture_output=True, text=True)
          client = Groq(api_key=os.getenv("GROQ_API_KEY"))
          with open(audio_path, "rb") as audio_file:
              transcription = client.audio.transcriptions.create(file=audio_file, model="whisper-large-v3", response_format="text")

          if video_path and os.path.exists(video_path): os.remove(video_path)
          if audio_path and os.path.exists(audio_path): os.remove(audio_path)
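
A note on the new extraction path: instead of moviepy, the tool now downloads pytube's audio-only stream and shells out to ffmpeg, where "-q:a 0" requests the highest variable-bitrate audio quality and "-map a" keeps only the audio streams. A minimal standalone sketch of the same download-and-convert step (assuming ffmpeg is on PATH; the helper name is illustrative, not part of app.py):

    import os
    import subprocess
    from pytube import YouTube

    def download_audio_as_mp3(video_url: str, out_dir: str = "/tmp") -> str:
        # Hypothetical helper mirroring the tool above: grab the audio-only
        # stream, then re-encode it to mp3 so the Whisper endpoint accepts it.
        os.makedirs(out_dir, exist_ok=True)
        stream = YouTube(video_url).streams.filter(only_audio=True).first()
        source_path = stream.download(output_path=out_dir)
        mp3_path = os.path.join(out_dir, "output.mp3")
        subprocess.run(
            ["ffmpeg", "-i", source_path, "-y", "-q:a", "0", "-map", "a", mp3_path],
            check=True, capture_output=True, text=True,  # check=True raises on a failed conversion
        )
        return mp3_path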
 
+ # --- NEW TOOL Definition: Image Analysis ---
+ def analyze_image_from_task_id(task_id: str) -> str:
+     """
+     Downloads an image file for a given task_id and analyzes it using a Vision-Language Model.
+     Use this tool ONLY when a question explicitly mentions an image.
+     """
+     print(f"Tool 'analyze_image_from_task_id' called with task_id: {task_id}")
+     try:
+         file_url = f"{DEFAULT_API_URL}/files/{task_id}"
+         print(f"Downloading image from: {file_url}")
+         response = requests.get(file_url)
+         response.raise_for_status()
+
+         # Initialize the VLM client
+         vlm_client = HuggingFaceInferenceAPI(
+             model_id="llava-hf/llava-1.5-7b-hf",
+             token=os.getenv("HF_TOKEN")
+         )
+
+         print("Analyzing image with Llava...")
+         # The prompt for the VLM needs to be specific.
+         # We can just ask it to describe the image in detail.
+         text_prompt = "Describe the image in detail."
+         result = vlm_client.image_to_text(image=response.content, prompt=text_prompt)
+         print(f"Image analysis successful. Result: {result}")
+         return result
+
+     except Exception as e:
+         return f"Error during image analysis: {e}"
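
For comparison, the same caption-style lookup can be sketched directly against huggingface_hub's InferenceClient rather than a LangChain wrapper. This is only an illustrative alternative under assumptions: the helper name is hypothetical, and image_to_text returns a plain string in older huggingface_hub releases and an object with generated_text in newer ones.

    import os
    from huggingface_hub import InferenceClient

    def caption_image_bytes(image_bytes: bytes) -> str:
        # Hypothetical sketch: caption raw image bytes with a hosted VLM.
        client = InferenceClient(model="llava-hf/llava-1.5-7b-hf", token=os.getenv("HF_TOKEN"))
        result = client.image_to_text(image_bytes)
        return result if isinstance(result, str) else result.generated_text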
 
  # --- Agent Definition ---
  class LangChainAgent:
+     def __init__(self, groq_api_key: str, tavily_api_key: str, hf_token: str):
          self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)

          self.tools = [
+             TavilySearchResults(name="web_search", max_results=3, tavily_api_key=tavily_api_key, description="A search engine for finding up-to-date information on the internet."),
+             Tool(name="audio_file_transcriber", func=transcribe_audio_file, description="Use this for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id."),
+             Tool(name="youtube_video_transcriber", func=transcribe_youtube_video, description="Use this for questions with a youtube.com URL. Input MUST be the URL."),
+             Tool(name="image_analyzer", func=analyze_image_from_task_id, description="Use this for questions mentioning an image. Input MUST be the task_id."),
          ]

          prompt = ChatPromptTemplate.from_messages([
              ("system", (
                  "You are a powerful problem-solving agent. Your goal is to answer the user's question accurately. "
+                 "You have access to a web search tool, an audio file transcriber, a YouTube video transcriber, and an image analyzer.\n\n"
                  "**REASONING PROCESS:**\n"
+                 "1. **Analyze the question:** Determine if a tool is needed. Is it a general knowledge question, or does it mention a specific file type (audio, video, image) or URL?\n"
+                 "2. **Select ONE tool based on the question:**\n"
+                 " - For general knowledge, facts, or current events: use `web_search`.\n"
+                 " - For an audio file, .mp3, or voice memo: use `audio_file_transcriber` with the `task_id`.\n"
+                 " - For a youtube.com URL: use `youtube_video_transcriber` with the URL.\n"
+                 " - For an image: use `image_analyzer` with the `task_id`.\n"
+                 " - For math or simple logic: answer directly.\n"
+                 "3. **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER."
              )),
              ("human", "Question: {input}\nTask ID: {task_id}"),
              ("placeholder", "{agent_scratchpad}"),
 
          input_for_agent = {"input": question, "task_id": task_id}
          if urls and "youtube.com" in urls[0]:
              input_for_agent['video_url'] = urls[0]
          try:
              response = self.agent_executor.invoke(input_for_agent)
              return response.get("output", "Agent failed to produce an answer.")

      try:
          groq_api_key = os.getenv("GROQ_API_KEY")
          tavily_api_key = os.getenv("TAVILY_API_KEY")
+         hf_token = os.getenv("HF_TOKEN")
+         if not all([groq_api_key, tavily_api_key, hf_token]): raise ValueError("An API key (GROQ, TAVILY, or HF) is missing.")
+         agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key, hf_token=hf_token)
      except Exception as e: return f"Error initializing agent: {e}", None

      questions_url = f"{DEFAULT_API_URL}/questions"

  # --- Gradio Interface ---
  with gr.Blocks() as demo:
+     gr.Markdown("# Ultimate Agent Runner (Search, Audio, Video, Vision)")
+     gr.Markdown("This agent can search, transcribe audio files, transcribe YouTube videos, and analyze images.")
      gr.LoginButton()
      run_button = gr.Button("Run Evaluation & Submit All Answers")
      status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)

  if __name__ == "__main__":
      print("\n" + "-"*30 + " App Starting " + "-"*30)
+     for key in ["GROQ_API_KEY", "TAVILY_API_KEY", "HF_TOKEN"]:
          print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
      print("-"*(60 + len(" App Starting ")) + "\n")
      demo.launch(debug=True, share=False)