from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from sentence_transformers import SentenceTransformer, util
import pytesseract
from PIL import Image
import base64
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Debug: verify OPENAI_API_KEY is available before the tool is constructed
if not os.getenv("OPENAI_API_KEY"):
    print("Error: OPENAI_API_KEY not loaded in image_parser.py")


class ImageParserTool:
    def __init__(self):
        self.name = "image_parser"
        self.description = "Analyzes images to extract text, identify objects, or match descriptions."
        self.inputs = {
            "image_path": {"type": "string", "description": "Path to image file"},
            "task": {"type": "string", "description": "Task type (ocr, describe, match)"},
            "match_query": {"type": "string", "description": "Query for semantic matching (optional)"},
        }
        self.output_type = str

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        self.vlm = ChatOpenAI(model="gpt-4o", api_key=api_key)
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def aparse(self, image_path: str, task: str = "describe", match_query: str = "") -> str:
        try:
            # Read and base64-encode the image (assumed JPEG; adjust the MIME
            # type in the data URL below for other formats)
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            img = Image.open(image_path)
            image_block = {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
            }

            if task == "ocr":
                # Extract text with Tesseract
                text = pytesseract.image_to_string(img)
                return text if text.strip() else "No text found in image."

            elif task == "describe":
                # Describe the image with the vision model
                response = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "Describe objects in the image in detail."},
                        image_block,
                    ])
                ])
                return response.content

            elif task == "match" and match_query:
                # Semantic matching with sentence-transformers: ask the VLM for a
                # comma-separated object list, then pick the object closest to the query
                description = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "List the objects in the image as a comma-separated list."},
                        image_block,
                    ])
                ])
                objects = [obj.strip() for obj in description.content.split(",") if obj.strip()]
                if not objects:
                    return "No objects detected in image."
                query_embedding = self.embedder.encode(match_query, convert_to_tensor=True)
                object_embeddings = self.embedder.encode(objects, convert_to_tensor=True)
                similarities = util.cos_sim(query_embedding, object_embeddings)[0]
                best_match = objects[int(similarities.argmax())]
                return f"Best match for '{match_query}': {best_match}"

            else:
                return "Invalid task or missing match_query for matching."

        except Exception as e:
            return f"Error analyzing image: {str(e)}"


image_parser_tool = ImageParserTool()
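
# --- Example usage -----------------------------------------------------------
# A minimal sketch of how the tool might be called; "sample.jpg" is a
# placeholder path, not part of the tool itself. The "ocr" task runs locally
# through Tesseract, while "describe" and "match" call the OpenAI API, so
# OPENAI_API_KEY must be set for those.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        print(await image_parser_tool.aparse("sample.jpg", task="ocr"))
        print(await image_parser_tool.aparse("sample.jpg", task="describe"))
        print(await image_parser_tool.aparse("sample.jpg", task="match", match_query="coffee mug"))

    asyncio.run(_demo())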