# jarvis_gaia_agent/tools/image_parser.py
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from sentence_transformers import SentenceTransformer, util
import pytesseract
from PIL import Image
import base64
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Debug: Verify OPENAI_API_KEY
if not os.getenv("OPENAI_API_KEY"):
    print("Error: OPENAI_API_KEY not loaded in image_parser.py")
class ImageParserTool:
    def __init__(self):
        self.name = "image_parser"
        self.description = "Analyzes images to extract text, identify objects, or match descriptions."
        self.inputs = {
            "image_path": {"type": "string", "description": "Path to image file"},
            "task": {"type": "string", "description": "Task type (ocr, describe, match)"},
            "match_query": {"type": "string", "description": "Query for semantic matching (optional)"}
        }
        self.output_type = str
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        self.vlm = ChatOpenAI(model="gpt-4o", api_key=api_key)
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
    async def aparse(self, image_path: str, task: str = "describe", match_query: str = "") -> str:
        try:
            # Read and base64-encode the image for the vision model
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            if task == "ocr":
                # Extract text with Tesseract
                img = Image.open(image_path)
                text = pytesseract.image_to_string(img)
                return text if text.strip() else "No text found in image."
            elif task == "describe":
                # Describe image with the VLM; multimodal content parts must be
                # wrapped in a single HumanMessage, and image_url takes a dict
                response = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "Describe objects in the image in detail."},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                    ])
                ])
                return response.content
            elif task == "match" and match_query:
                # Ask the VLM for a comma-separated object list, then rank the
                # objects against the query with sentence-transformers embeddings
                description = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "List objects in the image, separated by commas."},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                    ])
                ])
                objects = [obj.strip() for obj in description.content.split(",") if obj.strip()]
                if not objects:
                    return "No objects detected in image."
                query_embedding = self.embedder.encode(match_query, convert_to_tensor=True)
                object_embeddings = self.embedder.encode(objects, convert_to_tensor=True)
                similarities = util.cos_sim(query_embedding, object_embeddings)[0]
                best_match = objects[similarities.argmax()]
                return f"Best match for '{match_query}': {best_match}"
            else:
                return "Invalid task or missing match_query for matching."
        except Exception as e:
            return f"Error analyzing image: {str(e)}"
image_parser_tool = ImageParserTool()
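
# A minimal usage sketch, assuming a local image file named "sample.jpg"
# (the filename is illustrative only) and a valid OPENAI_API_KEY in the
# environment. Run the module directly to try the three task modes.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # OCR: extract embedded text with Tesseract
        print(await image_parser_tool.aparse("sample.jpg", task="ocr"))
        # Describe: get a detailed object description from the VLM
        print(await image_parser_tool.aparse("sample.jpg", task="describe"))
        # Match: rank the VLM's object list against a free-text query
        print(await image_parser_tool.aparse("sample.jpg", task="match", match_query="a red bicycle"))

    asyncio.run(_demo())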