from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from sentence_transformers import SentenceTransformer, util
import pytesseract
from PIL import Image
import base64
import mimetypes
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
# Debug: Verify OPENAI_API_KEY
if not os.getenv("OPENAI_API_KEY"):
    print("Error: OPENAI_API_KEY not loaded in image_parser.py")

class ImageParserTool:
    def __init__(self):
        self.name = "image_parser"
        self.description = "Analyzes images to extract text, identify objects, or match descriptions."
        self.inputs = {
            "image_path": {"type": "string", "description": "Path to image file"},
            "task": {"type": "string", "description": "Task type (ocr, describe, match)"},
            "match_query": {"type": "string", "description": "Query for semantic matching (optional)"}
        }
        self.output_type = str
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        self.vlm = ChatOpenAI(model="gpt-4o", api_key=api_key)
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def aparse(self, image_path: str, task: str = "describe", match_query: str = "") -> str:
        try:
            # Read and base64-encode the image; infer the MIME type from the file
            # extension so non-JPEG images are labeled correctly in the data URL
            mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            img = Image.open(image_path)

            if task == "ocr":
                # Extract text with Tesseract
                text = pytesseract.image_to_string(img)
                return text if text.strip() else "No text found in image."
            elif task == "describe":
                # Describe the image with the vision-language model; the prompt and
                # image must be packed into a single multimodal HumanMessage
                response = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "Describe objects in the image in detail."},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_data}"}},
                    ])
                ])
                return response.content
            elif task == "match" and match_query:
                # Semantic matching with sentence-transformers: ask the VLM for a
                # comma-separated object list, then embed and compare against the query
                description = await self.vlm.ainvoke([
                    HumanMessage(content=[
                        {"type": "text", "text": "List the objects in the image as a comma-separated list."},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_data}"}},
                    ])
                ])
                objects = [obj.strip() for obj in description.content.split(",") if obj.strip()]
                query_embedding = self.embedder.encode(match_query, convert_to_tensor=True)
                object_embeddings = self.embedder.encode(objects, convert_to_tensor=True)
                similarities = util.cos_sim(query_embedding, object_embeddings)[0]
                best_match = objects[similarities.argmax()]
                return f"Best match for '{match_query}': {best_match}"
            else:
                return "Invalid task or missing match_query for matching."
        except Exception as e:
            return f"Error analyzing image: {str(e)}"

image_parser_tool = ImageParserTool()
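
# Illustrative usage sketch (not part of the tool's public API): drives the async
# parser with asyncio. The path "sample.jpg" is a hypothetical placeholder; the
# "describe" and "match" tasks additionally require a valid OPENAI_API_KEY.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # OCR runs locally through Tesseract, so it is the cheapest task to smoke-test
        print(await image_parser_tool.aparse("sample.jpg", task="ocr"))
        # Semantic matching combines the VLM's object list with MiniLM embeddings
        print(await image_parser_tool.aparse("sample.jpg", task="match", match_query="a red bicycle"))

    asyncio.run(_demo())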