doctorecord / src /agents /index_agent.py
levalencia's picture
Update Dockerfile to use new app entry point and enhance requirements.txt with additional dependencies. Remove obsolete streamlit_app.py file.
0a40afa
raw
history blame
5.61 kB
"""Create a semantic index of document content using embeddings."""
from typing import Dict, Any, List, Tuple
import logging
import numpy as np
from .base_agent import BaseAgent
from services.embedding_client import EmbeddingClient
class IndexAgent(BaseAgent):
    """Build a semantic index (sentence-based chunks + embeddings) over document content."""

    def __init__(self):
        # NOTE(review): BaseAgent.__init__ is not called here (matches the
        # original code) — confirm the base class needs no initialization.
        self.logger = logging.getLogger(__name__)
        self.embedding_client = EmbeddingClient()
        self.logger.info("IndexAgent initialized")

    def execute(self, ctx: Dict[str, Any]) -> Dict[str, Any]:
        """Create a semantic index of document content.

        Reads ``ctx["text"]`` (from the PDF agent) and ``ctx["tables"]``
        (from the Table agent), chunks the combined content, embeds every
        chunk via the embedding client, and stores the resulting index
        under ``ctx["index"]``.

        Returns:
            The index dict with keys ``"chunks"``, ``"embeddings"`` and
            ``"text"``; an empty dict when there is no text or an error
            occurs (errors are logged, never raised).
        """
        try:
            self.logger.info("Starting index creation")

            # Get text from PDF agent
            text = ctx.get("text", "")
            if not text:
                self.logger.warning("No text content found in context")
                return {}
            self.logger.info(f"Found text content of length {len(text)}")

            # Get tables from Table agent
            tables = ctx.get("tables", [])
            self.logger.info(f"Found {len(tables)} tables in context")

            # Combine all content. BUGFIX: insert a newline between the main
            # text and the first table so they are not glued together
            # (original did `text + "\n".join(tables)` with no separator).
            all_content = text
            if tables:
                all_content += "\n" + "\n".join(tables)
            self.logger.info(f"Combined content length: {len(all_content)}")

            # Create chunks with metadata
            chunks = self._create_chunks(all_content)
            self.logger.info(f"Created {len(chunks)} content chunks")
            for i, chunk in enumerate(chunks):
                self.logger.debug(f"Chunk {i}: {chunk['text'][:100]}...")

            # Get embeddings for chunks
            chunk_texts = [chunk["text"] for chunk in chunks]
            self.logger.info(f"Getting embeddings for {len(chunk_texts)} chunks")
            embeddings = self.embedding_client.embed(chunk_texts)
            self.logger.info(f"Generated {len(embeddings)} embeddings")

            # Create semantic index
            index = {
                "chunks": chunks,
                "embeddings": embeddings,
                "text": all_content,  # Keep full text for non-semantic search
            }

            # Store in context so downstream agents can use it
            ctx["index"] = index
            self.logger.info(f"Created semantic index with {len(chunks)} chunks")
            return index
        except Exception as e:
            self.logger.error(f"Error in IndexAgent: {str(e)}", exc_info=True)
            return {}

    def _create_chunks(self, text: str, chunk_size: int = 1000) -> List[Dict[str, Any]]:
        """Split *text* into roughly ``chunk_size``-character chunks on sentence boundaries.

        Each chunk dict carries ``"text"``, ``"start"``/``"end"`` character
        offsets (relative to the re-joined, ". "-normalized text, not the
        original input) and ``"type" == "text"``.
        """
        self.logger.info(f"Creating chunks from text of length {len(text)}")
        chunks: List[Dict[str, Any]] = []
        sentences = text.split(". ")
        self.logger.info(f"Split into {len(sentences)} sentences")
        current_chunk: List[str] = []
        current_size = 0
        total_length = 0
        for sentence in sentences:
            # Re-append the ". " removed by split(); this also pads the
            # final sentence, so offsets refer to the normalized text.
            sentence = sentence.strip() + ". "
            sentence_size = len(sentence)
            # Flush the running chunk before it would exceed chunk_size
            # (never emit an empty chunk for an oversized first sentence).
            if current_size + sentence_size > chunk_size and current_chunk:
                chunk_text = "".join(current_chunk)
                chunks.append({
                    "text": chunk_text,
                    "start": total_length,
                    "end": total_length + len(chunk_text),
                    "type": "text",
                })
                total_length += len(chunk_text)
                self.logger.debug(f"Created chunk of size {len(chunk_text)}")
                current_chunk = []
                current_size = 0
            current_chunk.append(sentence)
            current_size += sentence_size
        # Add last chunk if any
        if current_chunk:
            chunk_text = "".join(current_chunk)
            chunks.append({
                "text": chunk_text,
                "start": total_length,
                "end": total_length + len(chunk_text),
                "type": "text",
            })
            self.logger.debug(f"Created final chunk of size {len(chunk_text)}")
        self.logger.info(f"Created {len(chunks)} total chunks")
        return chunks

    def find_similar_chunks(self, query: str, index: Dict[str, Any], top_k: int = 3) -> List[Dict[str, Any]]:
        """Return the ``top_k`` chunks from *index* most similar to *query*.

        Similarity is cosine similarity between the query embedding and each
        pre-computed chunk embedding. Errors are logged and yield ``[]``.
        """
        try:
            self.logger.info(f"Finding similar chunks for query: {query}")
            # Get query embedding
            query_embedding = self.embedding_client.embed([query])[0]
            # Calculate similarities
            similarities = []
            for chunk, embedding in zip(index["chunks"], index["embeddings"]):
                similarity = self._cosine_similarity(query_embedding, embedding)
                similarities.append((similarity, chunk))
                self.logger.debug(f"Chunk similarity: {similarity:.3f}")
            # BUGFIX: sort on the score only. The original tuple sort fell
            # back to comparing the chunk dicts on tied scores, raising
            # TypeError ('<' not supported between dict instances).
            similarities.sort(key=lambda pair: pair[0], reverse=True)
            results = [chunk for _, chunk in similarities[:top_k]]
            self.logger.info(f"Found {len(results)} similar chunks")
            return results
        except Exception as e:
            self.logger.error(f"Error finding similar chunks: {str(e)}", exc_info=True)
            return []

    def _cosine_similarity(self, a: List[float], b: List[float]) -> float:
        """Cosine similarity of vectors *a* and *b*; 0.0 if either has zero norm."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        # BUGFIX: guard against division by zero for zero-norm vectors.
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)