Abbasid committed
Commit 1376719 · verified · 1 Parent(s): d59e1c2

Update agent.py

Files changed (1):
  1. agent.py +186 -127
agent.py CHANGED
@@ -2,9 +2,7 @@
agent.py

This file defines the core logic for a sophisticated AI agent using LangGraph.
- ## MODIFICATION: This version introduces a 'multimodal_router' node.
- This node intelligently inspects user input to identify, classify (using HEAD requests),
- and pre-process URLs for images, audio, and video before the main LLM reasoning step.
+ This version includes proper multimodal support for images, YouTube videos, and audio files.
"""

# ----------------------------------------------------------
@@ -38,14 +36,14 @@ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.tools import Tool, tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
- from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint, ChatHuggingFace
from langgraph.graph import START, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode, tools_condition

from dotenv import load_dotenv
load_dotenv()

- # --- Configuration and Caching (remains the same) ---
+ # --- Configuration and Caching ---
JSONL_PATH, FAISS_CACHE, EMBED_MODEL = Path("metadata.jsonl"), Path("faiss_index.pkl"), "sentence-transformers/all-mpnet-base-v2"
RETRIEVER_K, CACHE_TTL = 5, 600
API_CACHE = TTLCache(maxsize=256, ttl=CACHE_TTL)
@@ -56,12 +54,11 @@ def cached_get(key: str, fetch_fn):
    return val

# ----------------------------------------------------------
- # Section 2: Standalone Tool Functions (remains the same)
+ # Section 2: Standalone Tool Functions
# ----------------------------------------------------------
@tool
def python_repl(code: str) -> str:
    """Executes a string of Python code and returns the stdout/stderr."""
-     # ... (implementation unchanged)
    code = textwrap.dedent(code).strip()
    try:
        result = subprocess.run(["python", "-c", code], capture_output=True, text=True, timeout=10, check=False)
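For readers who want to poke at this tool in isolation, a minimal sketch: `@tool` wraps the function in a LangChain structured tool, so it can be invoked directly with a dict of arguments. This assumes the file is importable as `agent` and is not something the commit itself adds.

```python
# Minimal smoke test for the python_repl tool (hypothetical usage, not in the diff).
from agent import python_repl  # assumes agent.py is on the import path

result = python_repl.invoke({"code": "print(2 + 2)"})
print(result)  # the returned string should report stdout containing "4"
```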
@@ -69,36 +66,99 @@ def python_repl(code: str) -> str:
        else: return f"Execution failed.\nSTDOUT:\n```\n{result.stdout}\n```\nSTDERR:\n```\n{result.stderr}\n```"
    except subprocess.TimeoutExpired: return "Execution timed out (>10s)."

+ def describe_image_func(image_source: str, vision_llm_instance) -> str:
+     """Describes an image from a local file path or a URL using a provided vision LLM."""
+     try:
+         print(f"Processing image: {image_source}")
+
+         # Download and process image
+         if image_source.startswith("http"):
+             response = requests.get(image_source, timeout=10)
+             response.raise_for_status()
+             img = Image.open(BytesIO(response.content))
+         else:
+             img = Image.open(image_source)
+
+         # Convert to base64
+         buffered = BytesIO()
+         img.convert("RGB").save(buffered, format="JPEG")
+         b64_string = base64.b64encode(buffered.getvalue()).decode()
+
+         # Create multimodal message
+         msg = HumanMessage(content=[
+             {"type": "text", "text": "Describe this image in detail. Include all objects, people, text, colors, setting, and any other relevant information you can see."},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_string}"}}
+         ])
+
+         result = vision_llm_instance.invoke([msg])
+         return f"Image description: {result.content}"
+
+     except Exception as e:
+         print(f"Error in describe_image_func: {e}")
+         return f"Error processing image: {e}"

@tool
def process_youtube_video(url: str) -> str:
    """Downloads and processes a YouTube video, extracting audio and converting to text."""
-     # ... (implementation unchanged)
    try:
        print(f"Processing YouTube video: {url}")
+
+         # Create temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
+             # Download audio from YouTube video
            ydl_opts = {
-                 'format': 'bestaudio/best', 'outtmpl': f'{temp_dir}/%(title)s.%(ext)s',
-                 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}],
+                 'format': 'bestaudio/best',
+                 'outtmpl': f'{temp_dir}/%(title)s.%(ext)s',
+                 'postprocessors': [{
+                     'key': 'FFmpegExtractAudio',
+                     'preferredcodec': 'wav',
+                 }],
            }
+
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
                title = info.get('title', 'Unknown')
+
+             # Find the downloaded audio file
            audio_files = list(Path(temp_dir).glob("*.wav"))
-             if not audio_files: return "Error: Could not download audio from YouTube video"
-             r, transcript_parts = sr.Recognizer(), []
-             audio = AudioSegment.from_wav(str(audio_files[0])).set_channels(1).set_frame_rate(16000)
-             chunks = [audio[i:i + 30000] for i in range(0, len(audio), 30000)]
-             for i, chunk in enumerate(chunks[:10]):
+             if not audio_files:
+                 return "Error: Could not download audio from YouTube video"
+
+             audio_file = audio_files[0]
+
+             # Convert audio to text using speech recognition
+             r = sr.Recognizer()
+
+             # Load audio file
+             audio = AudioSegment.from_wav(str(audio_file))
+
+             # Convert to mono and set sample rate
+             audio = audio.set_channels(1)
+             audio = audio.set_frame_rate(16000)
+
+             # Convert to smaller chunks for processing (30 seconds each)
+             chunk_length_ms = 30000
+             chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
+
+             transcript_parts = []
+             for i, chunk in enumerate(chunks[:10]):  # Limit to first 5 minutes
                chunk_file = Path(temp_dir) / f"chunk_{i}.wav"
                chunk.export(chunk_file, format="wav")
+
                try:
                    with sr.AudioFile(str(chunk_file)) as source:
-                         text = r.recognize_google(r.record(source))
+                         audio_data = r.record(source)
+                         text = r.recognize_google(audio_data)
                        transcript_parts.append(text)
-                 except (sr.UnknownValueError, sr.RequestError) as e:
-                     transcript_parts.append(f"[Speech recognition error or unintelligible audio: {e}]")
-             return f"YouTube Video: {title}\n\nTranscript (first 5 minutes):\n{' '.join(transcript_parts)}"
+                 except sr.UnknownValueError:
+                     transcript_parts.append("[Unintelligible audio]")
+                 except sr.RequestError as e:
+                     transcript_parts.append(f"[Speech recognition error: {e}]")
+
+             transcript = " ".join(transcript_parts)
+
+             return f"YouTube Video: {title}\n\nTranscript (first 5 minutes):\n{transcript}"
+
    except Exception as e:
        print(f"Error processing YouTube video: {e}")
        return f"Error processing YouTube video: {e}"
@@ -106,61 +166,110 @@ def process_youtube_video(url: str) -> str:
@tool
def process_audio_file(file_url: str) -> str:
    """Downloads and processes an audio file (MP3, WAV, etc.) and converts to text."""
-     # ... (implementation unchanged)
    try:
        print(f"Processing audio file: {file_url}")
+
        with tempfile.TemporaryDirectory() as temp_dir:
+             # Download audio file
            response = requests.get(file_url, timeout=30)
            response.raise_for_status()
-             ext = os.path.splitext(file_url)[1][1:] or 'mp3'
+
+             # Determine file extension from URL or content type
+             if file_url.lower().endswith('.mp3'):
+                 ext = 'mp3'
+             elif file_url.lower().endswith('.wav'):
+                 ext = 'wav'
+             else:
+                 content_type = response.headers.get('content-type', '')
+                 if 'mp3' in content_type:
+                     ext = 'mp3'
+                 elif 'wav' in content_type:
+                     ext = 'wav'
+                 else:
+                     ext = 'mp3'  # Default assumption
+
            audio_file = Path(temp_dir) / f"audio.{ext}"
-             with open(audio_file, 'wb') as f: f.write(response.content)
-             wav_file = Path(temp_dir) / "audio.wav"
-             AudioSegment.from_file(str(audio_file)).export(wav_file, format="wav")
-             r, transcript_parts = sr.Recognizer(), []
-             audio = AudioSegment.from_wav(str(wav_file)).set_channels(1).set_frame_rate(16000)
-             chunks = [audio[i:i + 30000] for i in range(0, len(audio), 30000)]
-             for i, chunk in enumerate(chunks[:20]):
+             with open(audio_file, 'wb') as f:
+                 f.write(response.content)
+
+             # Convert to WAV if necessary
+             if ext != 'wav':
+                 audio = AudioSegment.from_file(str(audio_file))
+                 wav_file = Path(temp_dir) / "audio.wav"
+                 audio.export(wav_file, format="wav")
+                 audio_file = wav_file
+
+             # Convert audio to text
+             r = sr.Recognizer()
+
+             # Load and process audio
+             audio = AudioSegment.from_wav(str(audio_file))
+             audio = audio.set_channels(1).set_frame_rate(16000)
+
+             # Process in chunks
+             chunk_length_ms = 30000
+             chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
+
+             transcript_parts = []
+             for i, chunk in enumerate(chunks[:20]):  # Limit to first 10 minutes
                chunk_file = Path(temp_dir) / f"chunk_{i}.wav"
                chunk.export(chunk_file, format="wav")
+
                try:
                    with sr.AudioFile(str(chunk_file)) as source:
-                         text = r.recognize_google(r.record(source))
+                         audio_data = r.record(source)
+                         text = r.recognize_google(audio_data)
                        transcript_parts.append(text)
-                 except (sr.UnknownValueError, sr.RequestError) as e:
-                     transcript_parts.append(f"[Speech recognition error or unintelligible audio: {e}]")
-             return f"Audio file transcript:\n{' '.join(transcript_parts)}"
+                 except sr.UnknownValueError:
+                     transcript_parts.append("[Unintelligible audio]")
+                 except sr.RequestError as e:
+                     transcript_parts.append(f"[Speech recognition error: {e}]")
+
+             transcript = " ".join(transcript_parts)
+             return f"Audio file transcript:\n{transcript}"
+
    except Exception as e:
        print(f"Error processing audio file: {e}")
        return f"Error processing audio file: {e}"

-
def web_search_func(query: str, cache_func) -> str:
    """Performs a web search using Tavily and returns a compilation of results."""
-     # ... (implementation unchanged)
    key = f"web:{query}"
    results = cache_func(key, lambda: TavilySearchResults(max_results=5).invoke(query))
    return "\n\n---\n\n".join([f"Source: {res['url']}\nContent: {res['content']}" for res in results])

def wiki_search_func(query: str, cache_func) -> str:
    """Searches Wikipedia and returns the top 2 results."""
-     # ... (implementation unchanged)
    key = f"wiki:{query}"
    docs = cache_func(key, lambda: WikipediaLoader(query=query, load_max_docs=2, doc_content_chars_max=2000).load())
    return "\n\n---\n\n".join([f"Source: {d.metadata['source']}\n\n{d.page_content}" for d in docs])

def arxiv_search_func(query: str, cache_func) -> str:
    """Searches Arxiv for scientific papers and returns the top 2 results."""
-     # ... (implementation unchanged)
    key = f"arxiv:{query}"
    docs = cache_func(key, lambda: ArxivLoader(query=query, load_max_docs=2).load())
    return "\n\n---\n\n".join([f"Source: {d.metadata['source']}\nPublished: {d.metadata['Published']}\nTitle: {d.metadata['Title']}\n\nSummary:\n{d.page_content}" for d in docs])

# ----------------------------------------------------------
- # Section 3: DYNAMIC SYSTEM PROMPT (remains the same)
+ # Section 3: DYNAMIC SYSTEM PROMPT
# ----------------------------------------------------------
SYSTEM_PROMPT_TEMPLATE = (
-     """You are an expert-level multimodal research assistant...""" # Unchanged
+     """You are an expert-level multimodal research assistant. Your goal is to answer the user's question accurately using all available tools.
+
+ **CRITICAL INSTRUCTIONS:**
+ 1. **USE YOUR TOOLS:** You have been given a set of tools to find information. You MUST use them when the answer is not immediately known to you. Do not make up answers.
+ 2. **MULTIMODAL PROCESSING:** When you encounter URLs or attachments:
+    - For image URLs (jpg, png, gif, etc.): Use the `describe_image` tool
+    - For YouTube URLs: Use the `process_youtube_video` tool
+    - For audio files (mp3, wav, etc.): Use the `process_audio_file` tool
+    - For other content: Use appropriate search tools
+ 3. **AVAILABLE TOOLS:** Here is the exact list of tools you have access to:
+ {tools}
+ 4. **REASONING:** Think step-by-step. First, analyze the user's question and any attachments. Second, decide which tools are appropriate. Third, call the tools with correct parameters. Finally, synthesize the results.
+ 5. **URL DETECTION:** Look for URLs in the user's message, especially in brackets like [Attachment URL: ...]. Process these appropriately.
+ 6. **FINAL ANSWER FORMAT:** Your final response MUST strictly follow this format:
+    `FINAL ANSWER: [Your comprehensive answer incorporating all tool results]`
+ """
)

# ----------------------------------------------------------
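Because the template above pins the model's output to a `FINAL ANSWER:` line, callers will usually want to parse that line back out of the agent's last message; a small sketch (the helper name is ours, not part of the commit):

```python
# Sketch: recover the answer text mandated by the FINAL ANSWER format above.
import re

def extract_final_answer(message_text: str) -> str | None:
    """Return the text after 'FINAL ANSWER:', or None if the marker is missing."""
    match = re.search(r"FINAL ANSWER:\s*(.*)", message_text, flags=re.DOTALL)
    return match.group(1).strip() if match else None

# extract_final_answer("...\nFINAL ANSWER: 42") -> "42"
```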
@@ -172,130 +281,80 @@ def create_agent_executor(provider: str = "groq"):
    """
    print(f"Initializing agent with provider: {provider}")

-     # Step 1: Build LLM (remains the same)
-     if provider == "groq":
-         llm = ChatGroq(model_name="llama-3.1-70b-vision-preview", temperature=0)
+     # Step 1: Build LLMs - Use Google for vision capabilities
+     if provider == "google":
+         main_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0)
+         vision_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0)
+     elif provider == "groq":
+         main_llm = ChatGroq(model_name="llama-3.2-90b-vision-preview", temperature=0)
+         # Use Google for vision since Groq's vision support may be limited
+         vision_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0)
+     elif provider == "huggingface":
+         main_llm = ChatHuggingFace(llm=HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", temperature=0.1))
+         vision_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0)
    else:
-         raise ValueError(f"Provider '{provider}' not currently configured for this router.")
-
-     # Step 2: Build Retriever (remains the same, but will be called inside the router)
+         raise ValueError("Invalid provider selected")
+
+     # Step 2: Build Retriever
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    if FAISS_CACHE.exists():
        with open(FAISS_CACHE, "rb") as f: vector_store = pickle.load(f)
    else:
-         # ... logic to build vector_store from JSONL or create empty ...
-         docs = []
        if JSONL_PATH.exists():
            docs = [Document(page_content=f"Question: {rec['Question']}\n\nFinal answer: {rec['Final answer']}", metadata={"source": rec["task_id"]}) for rec in (json.loads(line) for line in open(JSONL_PATH, "rt", encoding="utf-8"))]
-         if not docs:
+             vector_store = FAISS.from_documents(docs, embeddings)
+             with open(FAISS_CACHE, "wb") as f: pickle.dump(vector_store, f)
+         else:
+             # Create empty vector store if no metadata file exists
            docs = [Document(page_content="Sample document", metadata={"source": "sample"})]
-         vector_store = FAISS.from_documents(docs, embeddings)
-         with open(FAISS_CACHE, "wb") as f: pickle.dump(vector_store, f)
+             vector_store = FAISS.from_documents(docs, embeddings)
+
    retriever = vector_store.as_retriever(search_kwargs={"k": RETRIEVER_K})

-     # Step 3: Create the final list of tools (remains the same)
+     # Step 3: Create the final list of tools
    tools_list = [
-         python_repl, process_youtube_video, process_audio_file,
+         python_repl,
+         Tool(name="describe_image", func=functools.partial(describe_image_func, vision_llm_instance=vision_llm), description="Describes an image from a local file path or a URL. Use this for any image files or image URLs."),
+         process_youtube_video,
+         process_audio_file,
        Tool(name="web_search", func=functools.partial(web_search_func, cache_func=cached_get), description="Performs a web search using Tavily."),
        Tool(name="wiki_search", func=functools.partial(wiki_search_func, cache_func=cached_get), description="Searches Wikipedia."),
        Tool(name="arxiv_search", func=functools.partial(arxiv_search_func, cache_func=cached_get), description="Searches Arxiv for scientific papers."),
        create_retriever_tool(retriever=retriever, name="retrieve_examples", description="Retrieve solved questions similar to the user's query."),
    ]

-     # Step 4: Format prompt and bind tools (remains the same)
+     # Step 4: Format the tool list into a string for the prompt
    tool_definitions = "\n".join([f"- `{tool.name}`: {tool.description}" for tool in tools_list])
    final_system_prompt = SYSTEM_PROMPT_TEMPLATE.format(tools=tool_definitions)
-     llm_with_tools = llm.bind_tools(tools_list)

-     # Step 5: Define Graph Nodes
-
-     ## MODIFICATION: A new, powerful router node that replaces the previous pre-processing.
-     def multimodal_router(state: MessagesState):
-         """
-         Inspects the user's message, classifies URLs, and prepares the state for the LLM.
-         This node acts as a central dispatcher.
-         """
-         print("--- Entering Multimodal Router ---")
-         messages = state["messages"]
-         last_message = messages[-1]
-
-         # 1. Perform knowledge base retrieval first
-         # We consolidate this logic here from the old retriever_node
-         user_query_text = ""
-         if isinstance(last_message.content, str):
-             user_query_text = last_message.content
-         elif isinstance(last_message.content, list):  # For multimodal messages
-             user_query_text = " ".join(item['text'] for item in last_message.content if item['type'] == 'text')
+     llm_with_tools = main_llm.bind_tools(tools_list)

-         docs = retriever.invoke(user_query_text)
-         system_messages = [SystemMessage(content=final_system_prompt)]
+     # Step 5: Define Graph Nodes
+     def retriever_node(state: MessagesState):
+         user_query = state["messages"][-1].content
+         docs = retriever.invoke(user_query)
+         messages = [SystemMessage(content=final_system_prompt)]
        if docs:
            example_text = "\n\n---\n\n".join(d.page_content for d in docs)
-             system_messages.append(AIMessage(content=f"I have found {len(docs)} similar solved examples:\n\n{example_text}", name="ExampleRetriever"))
-
-         # 2. Extract and classify URLs
-         urls = re.findall(r'(https?://[^\s]+)', user_query_text)
-         image_processed = False
-
-         for url in urls:
-             try:
-                 print(f"Routing URL: {url}")
-                 # Simple classification first
-                 if "youtube.com" in url or "youtu.be" in url:
-                     system_messages.append(SystemMessage(content=f"[System Note: A YouTube URL has been detected. Use the 'process_youtube_video' tool if the user asks about it.]"))
-                     continue
-
-                 # Use a HEAD request for robust classification
-                 headers = requests.head(url, timeout=5, allow_redirects=True).headers
-                 content_type = headers.get('Content-Type', '')
-
-                 if 'image/' in content_type and not image_processed:
-                     print(f" -> Classified as Image. Processing for vision model.")
-                     response = requests.get(url, timeout=10)
-                     response.raise_for_status()
-                     img = Image.open(BytesIO(response.content))
-                     buffered = BytesIO()
-                     img.convert("RGB").save(buffered, format="JPEG")
-                     b64_string = base64.b64encode(buffered.getvalue()).decode()
-
-                     # Embed the image into the last message
-                     new_content = [
-                         {"type": "text", "text": user_query_text},
-                         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_string}"}}
-                     ]
-                     messages[-1] = HumanMessage(content=new_content)
-                     image_processed = True  # Process only the first image for now
-
-                 elif 'audio/' in content_type:
-                     print(f" -> Classified as Audio.")
-                     system_messages.append(SystemMessage(content=f"[System Note: An audio URL has been detected. Use the 'process_audio_file' tool if the user asks about it.]"))
-
-                 else:
-                     print(f" -> Classified as Web Page/Other.")
-
-             except Exception as e:
-                 print(f" -> Could not process URL {url}: {e}")
-
-         # Rebuild the final state
-         final_messages = system_messages + messages
-         return {"messages": final_messages}
+             messages.append(AIMessage(content=f"I have found {len(docs)} similar solved examples:\n\n{example_text}", name="ExampleRetriever"))
+         messages.extend(state["messages"])
+         return {"messages": messages}

    def assistant_node(state: MessagesState):
        result = llm_with_tools.invoke(state["messages"])
        return {"messages": [result]}

    # Step 6: Build Graph
-     ## MODIFICATION: The graph is now simpler and more robust.
    builder = StateGraph(MessagesState)
-     builder.add_node("multimodal_router", multimodal_router)  # The new, powerful starting node
+     builder.add_node("retriever", retriever_node)
    builder.add_node("assistant", assistant_node)
    builder.add_node("tools", ToolNode(tools_list))

-     builder.add_edge(START, "multimodal_router")
-     builder.add_edge("multimodal_router", "assistant")
+     builder.add_edge(START, "retriever")
+     builder.add_edge("retriever", "assistant")
    builder.add_conditional_edges("assistant", tools_condition, {"tools": "tools", "__end__": "__end__"})
    builder.add_edge("tools", "assistant")

    agent_executor = builder.compile()
-     print("Agent Executor with Multimodal Router created successfully.")
+     print("Agent Executor created successfully.")
    return agent_executor
 
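A minimal end-to-end usage sketch for the rebuilt executor (not part of the diff; it assumes the relevant API keys such as GROQ_API_KEY, GOOGLE_API_KEY and TAVILY_API_KEY are available via the environment or the .env file loaded at import time, and the attachment URL is a placeholder):

```python
# Build the graph and ask a single multimodal question.
from langchain_core.messages import HumanMessage
from agent import create_agent_executor

agent = create_agent_executor(provider="groq")  # "google" and "huggingface" are also wired up
state = agent.invoke({"messages": [HumanMessage(content=(
    "What is shown in this image? [Attachment URL: https://example.com/chart.png]"
))]})
print(state["messages"][-1].content)  # expected to end with a "FINAL ANSWER: ..." line
```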