Abbasid committed on
Commit
6a09b39
·
verified ·
1 Parent(s): 2da7120

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -63
app.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
  app.py
3
  This script provides the Gradio web interface to run the evaluation.
4
- This version properly handles multimodal inputs including images, videos, and audio.
5
  """
6
 
7
  import os
@@ -10,6 +10,8 @@ import gradio as gr
10
  import requests
11
  import pandas as pd
12
  from urllib.parse import urlparse
 
 
13
 
14
  from agent import create_agent_executor
15
 
@@ -24,74 +26,130 @@ def parse_final_answer(agent_response: str) -> str:
24
  if lines: return lines[-1].strip()
25
  return "Could not parse a final answer."
26
 
27
- def detect_file_type(url: str) -> str:
28
- """Detect the type of file from URL."""
29
- if not url:
30
- return "unknown"
 
 
 
31
 
32
- url_lower = url.lower()
 
33
 
34
- # Image extensions
35
- if any(ext in url_lower for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg']):
36
- return "image"
 
 
 
 
 
 
 
37
 
38
- # Video extensions and YouTube
39
- if any(domain in url_lower for domain in ['youtube.com', 'youtu.be', 'vimeo.com']):
40
- return "youtube"
41
- if any(ext in url_lower for ext in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']):
42
- return "video"
43
 
44
- # Audio extensions
45
- if any(ext in url_lower for ext in ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a']):
46
- return "audio"
 
 
 
47
 
48
- # Try to detect from headers if possible
49
  try:
50
- response = requests.head(url, timeout=5)
 
51
  content_type = response.headers.get('content-type', '').lower()
 
 
52
 
53
- if 'image' in content_type:
54
- return "image"
55
- elif 'audio' in content_type:
56
- return "audio"
57
- elif 'video' in content_type:
58
- return "video"
59
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  pass
61
 
62
- return "unknown"
63
 
64
- def create_enhanced_prompt(question_text: str, file_url: str = None) -> str:
65
- """Create an enhanced prompt that guides the agent to use appropriate tools."""
66
-
 
67
  if not file_url:
68
- return question_text
69
-
70
- file_type = detect_file_type(file_url)
 
 
 
71
 
72
  if file_type == "image":
73
- return f"""{question_text}
74
 
75
- [IMAGE ATTACHMENT]: {file_url}
76
- INSTRUCTION: There is an image attached to this question. You MUST use the 'describe_image' tool to analyze this image before answering the question."""
 
 
 
77
 
78
- elif file_type == "youtube":
79
- return f"""{question_text}
 
 
80
 
81
- [YOUTUBE VIDEO]: {file_url}
82
- INSTRUCTION: There is a YouTube video attached to this question. You MUST use the 'process_youtube_video' tool to analyze this video before answering the question."""
83
 
84
- elif file_type == "audio":
85
- return f"""{question_text}
 
86
 
87
- [AUDIO FILE]: {file_url}
88
- INSTRUCTION: There is an audio file attached to this question. You MUST use the 'process_audio_file' tool to analyze this audio before answering the question."""
 
 
 
89
 
90
- else:
91
- return f"""{question_text}
 
 
92
 
93
- [ATTACHMENT]: {file_url}
94
- INSTRUCTION: There is a file attachment. Analyze the URL and use the appropriate tool to process this content before answering the question."""
95
 
96
  def run_and_submit_all(profile: gr.OAuthProfile | None):
97
  """
@@ -112,7 +170,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
112
  # 1. Instantiate Agent
113
  print("Initializing your custom agent...")
114
  try:
115
- agent_executor = create_agent_executor(provider="google") # Using Google for better multimodal support
116
  except Exception as e:
117
  return f"Fatal Error: Could not initialize agent. Check logs. Details: {e}", None
118
 
@@ -141,18 +199,22 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
141
  # Get file URL if it exists
142
  file_url = item.get("file_url")
143
 
144
- # Create enhanced prompt that instructs the agent to use appropriate tools
145
- full_question_text = create_enhanced_prompt(question_text, file_url)
146
 
147
  if file_url:
148
- file_type = detect_file_type(file_url)
149
- print(f"File detected: {file_url} (Type: {file_type})")
 
 
 
 
150
 
151
- print(f"Enhanced Prompt for Agent:\n{full_question_text}")
152
 
153
  try:
154
- # Pass the enhanced question to the agent
155
- result = agent_executor.invoke({"messages": [("user", full_question_text)]})
156
 
157
  raw_answer = result['messages'][-1].content
158
  submitted_answer = parse_final_answer(raw_answer)
@@ -165,7 +227,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
165
  "Task ID": task_id,
166
  "Question": question_text,
167
  "File URL": file_url or "None",
168
- "File Type": detect_file_type(file_url) if file_url else "None",
 
169
  "Submitted Answer": submitted_answer
170
  })
171
 
@@ -177,7 +240,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
177
  "Task ID": task_id,
178
  "Question": question_text,
179
  "File URL": file_url or "None",
180
- "File Type": detect_file_type(file_url) if file_url else "None",
 
181
  "Submitted Answer": error_msg
182
  })
183
 
@@ -201,9 +265,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
201
  return status_message, pd.DataFrame(results_log)
202
 
203
  # --- Gradio UI ---
204
- with gr.Blocks(title="Multimodal Agent Evaluation") as demo:
205
- gr.Markdown("# Multimodal Agent Evaluation Runner")
206
- gr.Markdown("This agent can process images, YouTube videos, audio files, and perform web searches.")
207
 
208
  gr.LoginButton()
209
  run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
@@ -212,11 +276,11 @@ with gr.Blocks(title="Multimodal Agent Evaluation") as demo:
212
  label="Questions and Agent Answers",
213
  wrap=True,
214
  row_count=10,
215
- column_widths=[80, 200, 150, 80, 200]
216
  )
217
 
218
  run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
219
 
220
  if __name__ == "__main__":
221
- print("\n" + "-"*30 + " Multimodal App Starting " + "-"*30)
222
  demo.launch()
 
1
  """
2
  app.py
3
  This script provides the Gradio web interface to run the evaluation.
4
+ This version focuses on robust image detection and processing.
5
  """
6
 
7
  import os
 
10
  import requests
11
  import pandas as pd
12
  from urllib.parse import urlparse
13
+ import mimetypes
14
+ from typing import Optional, Tuple
15
 
16
  from agent import create_agent_executor
17
 
 
26
  if lines: return lines[-1].strip()
27
  return "Could not parse a final answer."
28
 
29
+ def detect_file_type_robust(url: str) -> Tuple[str, dict]:
30
+ """
31
+ Robust file type detection with multiple validation methods.
32
+ Returns (file_type, metadata_dict)
33
+ """
34
+ if not url or not url.strip():
35
+ return "unknown", {"error": "Empty URL"}
36
 
37
+ url = url.strip()
38
+ metadata = {"original_url": url}
39
 
40
+ # Normalize URL
41
+ if not url.startswith(('http://', 'https://')):
42
+ return "unknown", {"error": "Invalid URL format - must start with http/https"}
43
+
44
+ try:
45
+ parsed = urlparse(url)
46
+ metadata["domain"] = parsed.netloc
47
+ metadata["path"] = parsed.path
48
+ except Exception as e:
49
+ return "unknown", {"error": f"URL parsing failed: {e}"}
50
 
51
+ # Method 1: File extension analysis
52
+ url_lower = url.lower()
53
+ image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.tiff', '.ico'}
 
 
54
 
55
+ # Check for image extensions
56
+ for ext in image_extensions:
57
+ if url_lower.endswith(ext) or ext in url_lower.split('?')[0]: # Handle query params
58
+ metadata["detection_method"] = "file_extension"
59
+ metadata["extension"] = ext
60
+ return "image", metadata
61
 
62
+ # Method 2: Content-Type header check
63
  try:
64
+ print(f"Checking content type for: {url}")
65
+ response = requests.head(url, timeout=10, allow_redirects=True)
66
  content_type = response.headers.get('content-type', '').lower()
67
+ metadata["content_type"] = content_type
68
+ metadata["status_code"] = response.status_code
69
 
70
+ if response.status_code == 200:
71
+ if any(img_type in content_type for img_type in ['image/', 'image/jpeg', 'image/png', 'image/gif', 'image/webp']):
72
+ metadata["detection_method"] = "content_type"
73
+ return "image", metadata
74
+ else:
75
+ metadata["error"] = f"HTTP {response.status_code}"
76
+
77
+ except requests.RequestException as e:
78
+ metadata["error"] = f"Network error: {e}"
79
+ print(f"Network error checking {url}: {e}")
80
+
81
+ # Method 3: Domain-based detection for common image hosts
82
+ image_domains = {
83
+ 'imgur.com', 'i.imgur.com',
84
+ 'cdn.discordapp.com', 'media.discordapp.net',
85
+ 'pbs.twimg.com', 'abs.twimg.com',
86
+ 'i.redd.it', 'preview.redd.it',
87
+ 'images.unsplash.com',
88
+ 'via.placeholder.com',
89
+ 'picsum.photos'
90
+ }
91
+
92
+ domain_lower = metadata.get("domain", "").lower()
93
+ if any(img_domain in domain_lower for img_domain in image_domains):
94
+ metadata["detection_method"] = "domain_based"
95
+ return "image", metadata
96
+
97
+ # Method 4: Guess from MIME types
98
+ try:
99
+ mime_type, _ = mimetypes.guess_type(url)
100
+ if mime_type and mime_type.startswith('image/'):
101
+ metadata["detection_method"] = "mime_guess"
102
+ metadata["mime_type"] = mime_type
103
+ return "image", metadata
104
+ except Exception:
105
  pass
106
 
107
+ return "unknown", metadata
108
 
109
+ def create_structured_prompt(question_text: str, file_url: str = None) -> str:
110
+ """
111
+ Create a structured prompt that provides clear task analysis for the agent.
112
+ """
113
  if not file_url:
114
+ return f"""TASK: {question_text}
115
+
116
+ ANALYSIS: This is a text-only question with no attachments.
117
+ APPROACH: Use available tools (web search, Wikipedia, etc.) as needed to answer accurately."""
118
+
119
+ file_type, metadata = detect_file_type_robust(file_url)
120
 
121
  if file_type == "image":
122
+ return f"""TASK: {question_text}
123
 
124
+ ATTACHMENT ANALYSIS:
125
+ - Type: Image file detected
126
+ - URL: {file_url}
127
+ - Detection method: {metadata.get('detection_method', 'unknown')}
128
+ - Metadata: {metadata}
129
 
130
+ REASONING REQUIRED:
131
+ 1. This question involves an image that needs to be analyzed
132
+ 2. You must examine the image content to answer the question
133
+ 3. The image URL should be processed directly by your vision capabilities
134
 
135
+ APPROACH: Process the image URL directly with your vision model, then provide a comprehensive answer based on what you see."""
 
136
 
137
+ else:
138
+ error_info = metadata.get('error', 'Unknown file type')
139
+ return f"""TASK: {question_text}
140
 
141
+ ATTACHMENT ANALYSIS:
142
+ - URL: {file_url}
143
+ - Type: Could not identify as supported file type
144
+ - Error: {error_info}
145
+ - Metadata: {metadata}
146
 
147
+ REASONING REQUIRED:
148
+ 1. There is an attachment but it's not a recognized image format
149
+ 2. You should attempt to process it as a regular web resource
150
+ 3. Use web search or other tools to gather information about the URL content
151
 
152
+ APPROACH: Use web search or other available tools to gather information about this resource."""
 
153
 
154
  def run_and_submit_all(profile: gr.OAuthProfile | None):
155
  """
 
170
  # 1. Instantiate Agent
171
  print("Initializing your custom agent...")
172
  try:
173
+ agent_executor = create_agent_executor(provider="groq")
174
  except Exception as e:
175
  return f"Fatal Error: Could not initialize agent. Check logs. Details: {e}", None
176
 
 
199
  # Get file URL if it exists
200
  file_url = item.get("file_url")
201
 
202
+ # Create structured prompt with robust file analysis
203
+ structured_prompt = create_structured_prompt(question_text, file_url)
204
 
205
  if file_url:
206
+ file_type, metadata = detect_file_type_robust(file_url)
207
+ print(f"File analysis: {file_url}")
208
+ print(f" - Type: {file_type}")
209
+ print(f" - Detection method: {metadata.get('detection_method', 'unknown')}")
210
+ if metadata.get('error'):
211
+ print(f" - Error: {metadata['error']}")
212
 
213
+ print(f"Structured Prompt for Agent:\n{structured_prompt}")
214
 
215
  try:
216
+ # Pass the structured prompt to the agent
217
+ result = agent_executor.invoke({"messages": [("user", structured_prompt)]})
218
 
219
  raw_answer = result['messages'][-1].content
220
  submitted_answer = parse_final_answer(raw_answer)
 
227
  "Task ID": task_id,
228
  "Question": question_text,
229
  "File URL": file_url or "None",
230
+ "File Type": detect_file_type_robust(file_url)[0] if file_url else "None",
231
+ "Detection Method": detect_file_type_robust(file_url)[1].get('detection_method', 'N/A') if file_url else "N/A",
232
  "Submitted Answer": submitted_answer
233
  })
234
 
 
240
  "Task ID": task_id,
241
  "Question": question_text,
242
  "File URL": file_url or "None",
243
+ "File Type": detect_file_type_robust(file_url)[0] if file_url else "None",
244
+ "Detection Method": "Error",
245
  "Submitted Answer": error_msg
246
  })
247
 
 
265
  return status_message, pd.DataFrame(results_log)
266
 
267
  # --- Gradio UI ---
268
+ with gr.Blocks(title="Image-Capable Agent Evaluation") as demo:
269
+ gr.Markdown("# Image-Capable Agent Evaluation Runner")
270
+ gr.Markdown("This agent can process images and perform web searches using Groq's vision-capable models.")
271
 
272
  gr.LoginButton()
273
  run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
 
276
  label="Questions and Agent Answers",
277
  wrap=True,
278
  row_count=10,
279
+ column_widths=[80, 200, 120, 100, 80, 200]
280
  )
281
 
282
  run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
283
 
284
  if __name__ == "__main__":
285
+ print("\n" + "-"*30 + " Image Agent App Starting " + "-"*30)
286
  demo.launch()