# jarvis_gaia_agent/tools/image_parser.py
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from sentence_transformers import SentenceTransformer, util
import pytesseract
from PIL import Image
import base64
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Debug: Verify OPENAI_API_KEY
if not os.getenv("OPENAI_API_KEY"):
    print("Error: OPENAI_API_KEY not loaded in image_parser.py")
class ImageParserTool:
    def __init__(self):
        self.name = "image_parser"
        self.description = "Analyzes images to extract text, identify objects, or match descriptions."
        self.inputs = {
            "image_path": {"type": "string", "description": "Path to image file"},
            "task": {"type": "string", "description": "Task type (ocr, describe, match)"},
            "match_query": {"type": "string", "description": "Query for semantic matching (optional)"}
        }
        self.output_type = str
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        self.vlm = ChatOpenAI(model="gpt-4o", api_key=api_key)
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
    async def aparse(self, image_path: str, task: str = "describe", match_query: str = "") -> str:
        try:
            # Read and base64-encode the image for the vision model
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            if task == "ocr":
                # Extract text with Tesseract
                img = Image.open(image_path)
                text = pytesseract.image_to_string(img)
                return text if text.strip() else "No text found in image."
            elif task == "describe":
                # Describe image with the VLM; multimodal content parts must be
                # wrapped in a single HumanMessage, and image_url takes a dict
                response = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "Describe objects in the image in detail."},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                    ])
                ])
                return response.content
            elif task == "match" and match_query:
                # Ask the VLM for a comma-separated object list, then rank the
                # objects against the query with sentence-transformers embeddings
                description = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "List objects in the image, separated by commas."},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                    ])
                ])
                objects = [obj.strip() for obj in description.content.split(",") if obj.strip()]
                if not objects:
                    return "No objects detected in image."
                query_embedding = self.embedder.encode(match_query, convert_to_tensor=True)
                object_embeddings = self.embedder.encode(objects, convert_to_tensor=True)
                similarities = util.cos_sim(query_embedding, object_embeddings)[0]
                best_match = objects[similarities.argmax()]
                return f"Best match for '{match_query}': {best_match}"
            else:
                return "Invalid task or missing match_query for matching."
        except Exception as e:
            return f"Error analyzing image: {str(e)}"
image_parser_tool = ImageParserTool()
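
# A minimal usage sketch, assuming a local image file named "sample.jpg"
# (the filename is illustrative only) and a valid OPENAI_API_KEY in the
# environment. Run the module directly to try the three task modes.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # OCR: extract embedded text with Tesseract
        print(await image_parser_tool.aparse("sample.jpg", task="ocr"))
        # Describe: get a detailed object description from the VLM
        print(await image_parser_tool.aparse("sample.jpg", task="describe"))
        # Match: rank the VLM's object list against a free-text query
        print(await image_parser_tool.aparse("sample.jpg", task="match", match_query="a red bicycle"))

    asyncio.run(_demo())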