dalybuilds committed on
Commit
80e8087
·
verified ·
1 Parent(s): 6acbbf1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -51
app.py CHANGED
@@ -5,11 +5,9 @@ import pandas as pd
5
  from io import BytesIO
6
  import re
7
  import subprocess
8
- import base64
9
 
10
  # --- Tool-specific Imports ---
11
  from pytube import YouTube
12
- from langchain_huggingface import HuggingFaceInferenceAPI
13
 
14
  # --- LangChain & Groq Imports ---
15
  from groq import Groq
@@ -25,7 +23,10 @@ TEMP_DIR = "/tmp"
25
 
26
  # --- Tool Definition: Audio File Transcription ---
27
  def transcribe_audio_file(task_id: str) -> str:
28
- # (This function is complete and correct from the previous version)
 
 
 
29
  print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}")
30
  try:
31
  file_url = f"{DEFAULT_API_URL}/files/{task_id}"
@@ -39,9 +40,12 @@ def transcribe_audio_file(task_id: str) -> str:
39
  except Exception as e:
40
  return f"Error during audio file transcription: {e}"
41
 
42
- # --- Tool Definition: Video Transcription via FFmpeg ---
43
  def transcribe_youtube_video(video_url: str) -> str:
44
- # (This function is complete and correct from the previous version)
 
 
 
45
  print(f"Tool 'transcribe_youtube_video' (ffmpeg) called with URL: {video_url}")
46
  video_path, audio_path = None, None
47
  try:
@@ -62,66 +66,30 @@ def transcribe_youtube_video(video_url: str) -> str:
62
  if video_path and os.path.exists(video_path): os.remove(video_path)
63
  if audio_path and os.path.exists(audio_path): os.remove(audio_path)
64
 
65
- # --- NEW TOOL Definition: Image Analysis ---
66
- def analyze_image_from_task_id(task_id: str) -> str:
67
- """
68
- Downloads an image file for a given task_id and analyzes it using a Vision-Language Model.
69
- Use this tool ONLY when a question explicitly mentions an image.
70
- """
71
- print(f"Tool 'analyze_image_from_task_id' called with task_id: {task_id}")
72
- try:
73
- file_url = f"{DEFAULT_API_URL}/files/{task_id}"
74
- print(f"Downloading image from: {file_url}")
75
- response = requests.get(file_url)
76
- response.raise_for_status()
77
-
78
- # Initialize the VLM client
79
- vlm_client = HuggingFaceInferenceAPI(
80
- model_id="llava-hf/llava-1.5-7b-hf",
81
- token=os.getenv("HF_TOKEN")
82
- )
83
-
84
- print("Analyzing image with Llava...")
85
- # The prompt for the VLM needs to be specific.
86
- # We can just ask it to describe the image in detail.
87
- text_prompt = "Describe the image in detail."
88
- result = vlm_client.image_to_text(image=response.content, prompt=text_prompt)
89
- print(f"Image analysis successful. Result: {result}")
90
- return result
91
-
92
- except Exception as e:
93
- return f"Error during image analysis: {e}"
94
-
95
  # --- Agent Definition ---
96
  class LangChainAgent:
97
- def __init__(self, groq_api_key: str, tavily_api_key: str, hf_token: str):
98
  self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)
99
-
100
  self.tools = [
101
  TavilySearchResults(name="web_search", max_results=3, tavily_api_key=tavily_api_key, description="A search engine for finding up-to-date information on the internet."),
102
  Tool(name="audio_file_transcriber", func=transcribe_audio_file, description="Use this for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id."),
103
  Tool(name="youtube_video_transcriber", func=transcribe_youtube_video, description="Use this for questions with a youtube.com URL. Input MUST be the URL."),
104
- Tool(name="image_analyzer", func=analyze_image_from_task_id, description="Use this for questions mentioning an image. Input MUST be the task_id."),
105
  ]
106
-
107
  prompt = ChatPromptTemplate.from_messages([
108
  ("system", (
109
- "You are a powerful problem-solving agent. Your goal is to answer the user's question accurately. "
110
- "You have access to a web search tool, an audio file transcriber, a YouTube video transcriber, and an image analyzer.\n\n"
111
  "**REASONING PROCESS:**\n"
112
- "1. **Analyze the question:** Determine if a tool is needed. Is it a general knowledge question, or does it mention a specific file type (audio, video, image) or URL?\n"
113
  "2. **Select ONE tool based on the question:**\n"
114
  " - For general knowledge, facts, or current events: use `web_search`.\n"
115
  " - For an audio file, .mp3, or voice memo: use `audio_file_transcriber` with the `task_id`.\n"
116
  " - For a youtube.com URL: use `youtube_video_transcriber` with the URL.\n"
117
- " - For an image: use `image_analyzer` with the `task_id`.\n"
118
- " - For math or simple logic: answer directly.\n"
119
  "3. **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER."
120
  )),
121
  ("human", "Question: {input}\nTask ID: {task_id}"),
122
  ("placeholder", "{agent_scratchpad}"),
123
  ])
124
-
125
  agent = create_tool_calling_agent(self.llm, self.tools, prompt)
126
  self.agent_executor = AgentExecutor(agent=agent, tools=self.tools, verbose=True, handle_parsing_errors=True)
127
 
@@ -144,9 +112,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
144
  try:
145
  groq_api_key = os.getenv("GROQ_API_KEY")
146
  tavily_api_key = os.getenv("TAVILY_API_KEY")
147
- hf_token = os.getenv("HF_TOKEN")
148
- if not all([groq_api_key, tavily_api_key, hf_token]): raise ValueError("An API key (GROQ, TAVILY, or HF) is missing.")
149
- agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key, hf_token=hf_token)
150
  except Exception as e: return f"Error initializing agent: {e}", None
151
 
152
  questions_url = f"{DEFAULT_API_URL}/questions"
@@ -180,8 +147,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
180
 
181
  # --- Gradio Interface ---
182
  with gr.Blocks() as demo:
183
- gr.Markdown("# Ultimate Agent Runner (Search, Audio, Video, Vision)")
184
- gr.Markdown("This agent can search, transcribe audio files, transcribe YouTube videos, and analyze images.")
185
  gr.LoginButton()
186
  run_button = gr.Button("Run Evaluation & Submit All Answers")
187
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
@@ -190,7 +157,7 @@ with gr.Blocks() as demo:
190
 
191
  if __name__ == "__main__":
192
  print("\n" + "-"*30 + " App Starting " + "-"*30)
193
- for key in ["GROQ_API_KEY", "TAVILY_API_KEY", "HF_TOKEN"]:
194
  print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
195
  print("-"*(60 + len(" App Starting ")) + "\n")
196
  demo.launch(debug=True, share=False)
 
5
  from io import BytesIO
6
  import re
7
  import subprocess
 
8
 
9
  # --- Tool-specific Imports ---
10
  from pytube import YouTube
 
11
 
12
  # --- LangChain & Groq Imports ---
13
  from groq import Groq
 
23
 
24
  # --- Tool Definition: Audio File Transcription ---
25
  def transcribe_audio_file(task_id: str) -> str:
26
+ """
27
+ Downloads an audio file (.mp3) for a given task_id, transcribes it, and returns the text.
28
+ Use this tool ONLY when a question explicitly mentions an audio file, .mp3, recording, or voice memo.
29
+ """
30
  print(f"Tool 'transcribe_audio_file' called with task_id: {task_id}")
31
  try:
32
  file_url = f"{DEFAULT_API_URL}/files/{task_id}"
 
40
  except Exception as e:
41
  return f"Error during audio file transcription: {e}"
42
 
43
+ # --- Tool Definition: Video Transcription (using FFmpeg) ---
44
  def transcribe_youtube_video(video_url: str) -> str:
45
+ """
46
+ Downloads a YouTube video from a URL, extracts its audio using FFmpeg, and transcribes it.
47
+ Use this tool ONLY when a question provides a youtube.com URL.
48
+ """
49
  print(f"Tool 'transcribe_youtube_video' (ffmpeg) called with URL: {video_url}")
50
  video_path, audio_path = None, None
51
  try:
 
66
  if video_path and os.path.exists(video_path): os.remove(video_path)
67
  if audio_path and os.path.exists(audio_path): os.remove(audio_path)
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # --- Agent Definition ---
70
  class LangChainAgent:
71
+ def __init__(self, groq_api_key: str, tavily_api_key: str):
72
  self.llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=groq_api_key, temperature=0.0)
 
73
  self.tools = [
74
  TavilySearchResults(name="web_search", max_results=3, tavily_api_key=tavily_api_key, description="A search engine for finding up-to-date information on the internet."),
75
  Tool(name="audio_file_transcriber", func=transcribe_audio_file, description="Use this for questions mentioning an audio file (.mp3, recording). Input MUST be the task_id."),
76
  Tool(name="youtube_video_transcriber", func=transcribe_youtube_video, description="Use this for questions with a youtube.com URL. Input MUST be the URL."),
 
77
  ]
 
78
  prompt = ChatPromptTemplate.from_messages([
79
  ("system", (
80
+ "You are a powerful problem-solving agent. You have access to a web search tool, an audio file transcriber, and a YouTube video transcriber.\n\n"
 
81
  "**REASONING PROCESS:**\n"
82
+ "1. **Analyze the question:** Determine if a tool is needed. Is it a general knowledge question, or does it mention an audio file or a YouTube URL?\n"
83
  "2. **Select ONE tool based on the question:**\n"
84
  " - For general knowledge, facts, or current events: use `web_search`.\n"
85
  " - For an audio file, .mp3, or voice memo: use `audio_file_transcriber` with the `task_id`.\n"
86
  " - For a youtube.com URL: use `youtube_video_transcriber` with the URL.\n"
87
+ " - For anything else (like images, which you cannot see, or math), you must answer directly without using a tool.\n"
 
88
  "3. **Execute and Answer:** After using a tool, analyze the result and provide ONLY THE FINAL ANSWER."
89
  )),
90
  ("human", "Question: {input}\nTask ID: {task_id}"),
91
  ("placeholder", "{agent_scratchpad}"),
92
  ])
 
93
  agent = create_tool_calling_agent(self.llm, self.tools, prompt)
94
  self.agent_executor = AgentExecutor(agent=agent, tools=self.tools, verbose=True, handle_parsing_errors=True)
95
 
 
112
  try:
113
  groq_api_key = os.getenv("GROQ_API_KEY")
114
  tavily_api_key = os.getenv("TAVILY_API_KEY")
115
+ if not all([groq_api_key, tavily_api_key]): raise ValueError("GROQ or TAVILY API key is missing.")
116
+ agent = LangChainAgent(groq_api_key=groq_api_key, tavily_api_key=tavily_api_key)
 
117
  except Exception as e: return f"Error initializing agent: {e}", None
118
 
119
  questions_url = f"{DEFAULT_API_URL}/questions"
 
147
 
148
  # --- Gradio Interface ---
149
  with gr.Blocks() as demo:
150
+ gr.Markdown("# Ultimate Agent Runner (Search, Audio, Video)")
151
+ gr.Markdown("This agent can search, transcribe audio files, and transcribe YouTube videos.")
152
  gr.LoginButton()
153
  run_button = gr.Button("Run Evaluation & Submit All Answers")
154
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
157
 
158
  if __name__ == "__main__":
159
  print("\n" + "-"*30 + " App Starting " + "-"*30)
160
+ for key in ["GROQ_API_KEY", "TAVILY_API_KEY"]:
161
  print(f"✅ {key} secret is set." if os.getenv(key) else f"⚠️ WARNING: {key} secret is not set.")
162
  print("-"*(60 + len(" App Starting ")) + "\n")
163
  demo.launch(debug=True, share=False)