from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from sentence_transformers import SentenceTransformer, util
import pytesseract
from PIL import Image
import base64
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Debug: Verify OPENAI_API_KEY
if not os.getenv("OPENAI_API_KEY"):
    print("Error: OPENAI_API_KEY not loaded in image_parser.py")

class ImageParserTool:
    def __init__(self):
        self.name = "image_parser"
        self.description = "Analyzes images to extract text, identify objects, or match descriptions."
        self.inputs = {
            "image_path": {"type": "string", "description": "Path to image file"},
            "task": {"type": "string", "description": "Task type (ocr, describe, match)"},
            "match_query": {"type": "string", "description": "Query for semantic matching (optional)"}
        }
        self.output_type = str
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
        # Vision-language model for description and object listing
        self.vlm = ChatOpenAI(model="gpt-4o", api_key=api_key)
        # Embedding model for semantic matching
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def aparse(self, image_path: str, task: str = "describe", match_query: str = "") -> str:
        try:
            # Read and base64-encode the image for the vision model
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            img = Image.open(image_path)

            if task == "ocr":
                # Extract text with Tesseract
                text = pytesseract.image_to_string(img)
                return text if text.strip() else "No text found in image."
elif task == "describe": | |
# Describe image with VLM | |
response = await self.vlm.ainvoke([ | |
{"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_data}"}, | |
{"type": "text", "text": "Describe objects in the image in detail."} | |
]) | |
return response.content | |
elif task == "match" and match_query: | |
# Semantic matching with sentence-transformers | |
description = await self.vlm.ainvoke([ | |
{"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_data}"}, | |
{"type": "text", "text": "List objects in the image."} | |
]) | |
objects = description.content.split(", ") | |
query_embedding = self.embedder.encode(match_query, convert_to_tensor=True) | |
object_embeddings = self.embedder.encode(objects, convert_to_tensor=True) | |
similarities = util.cos_sim(query_embedding, object_embeddings)[0] | |
best_match = objects[similarities.argmax()] | |
return f"Best match for '{match_query}': {best_match}" | |
            else:
                return "Invalid task or missing match_query for matching."
        except Exception as e:
            return f"Error analyzing image: {str(e)}"

image_parser_tool = ImageParserTool()
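
# Example usage (a minimal sketch: "sample.jpg" and the match query below are
# illustrative assumptions, not files or values referenced elsewhere in this tool).
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Exercise all three task modes against a local image
        print(await image_parser_tool.aparse("sample.jpg", task="ocr"))
        print(await image_parser_tool.aparse("sample.jpg", task="describe"))
        print(await image_parser_tool.aparse("sample.jpg", task="match", match_query="a red bicycle"))

    asyncio.run(_demo())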