import os
from typing import List, Optional

import pandas as pd
import PyPDF2
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util

class DocumentRetrieverTool:
    def __init__(self):
        self.name = "document_retriever"
        self.description = "Retrieves relevant text from GAIA text-heavy files (CSV, TXT, PDF) using semantic search."
        self.inputs = {
            "task_id": {"type": "string", "description": "GAIA task ID for the file"},
            "query": {"type": "string", "description": "Question or query to search for"},
            "file_type": {"type": "string", "description": "File type (csv, txt, pdf, default: txt)"}
        }
        self.output_type = str
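        # all-MiniLM-L6-v2 is a small, fast sentence-embedding model; any
        # SentenceTransformer checkpoint could be substituted here.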
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
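        # 500-character chunks with 50 characters of overlap keep passages
        # small enough to embed cleanly while preserving context at the seams.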
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len
        )
        self.chunks: List[str] = []
        # Populated by aparse(); encode(..., convert_to_tensor=True) returns a
        # torch.Tensor, not a NumPy array.
        self.embeddings: Optional[torch.Tensor] = None

    async def aparse(self, task_id: str, query: str, file_type: str = "txt") -> str:
        """
        Loads a GAIA file, splits it into chunks, embeds them, and retrieves relevant text for the query.
        Supports CSV, TXT, and PDF files.
        """
        try:
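            # Assumes the GAIA file was downloaded beforehand and saved in the
            # working directory under the temp_<task_id>.<file_type> convention.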
            file_path = f"temp_{task_id}.{file_type}"
            if not os.path.exists(file_path):
                return f"File not found for task ID {task_id}"

            # Load and preprocess file
            text = ""
            if file_type == "csv":
                df = pd.read_csv(file_path)
                text = df.to_string()
            elif file_type == "txt":
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read()
            elif file_type == "pdf":
                with open(file_path, "rb") as f:
                    pdf = PyPDF2.PdfReader(f)
                    text = "".join(page.extract_text() or "" for page in pdf.pages)
            else:
                return f"Unsupported file type: {file_type}"

            # Check if text was extracted
            if not text.strip():
                return "No extractable text found in file."

            # Split text into chunks
            self.chunks = self.text_splitter.split_text(text)
            if not self.chunks:
                return "No content found in file."

            # Embed chunks and query
            self.embeddings = self.embedder.encode(self.chunks, convert_to_tensor=True)
            query_embedding = self.embedder.encode(query, convert_to_tensor=True)

            # Compute cosine similarities
            similarities = util.cos_sim(query_embedding, self.embeddings)[0]
            
            # Get top 3 most relevant chunks
            top_k = min(3, len(self.chunks))
            top_indices = similarities.argsort(descending=True)[:top_k]
            # Cast each tensor index to int before indexing the Python list
            relevant_chunks = [self.chunks[int(idx)] for idx in top_indices]
            
            # Combine results
            return "\n\n".join(relevant_chunks)
        except Exception as e:
            return f"Error retrieving documents: {str(e)}"

document_retriever_tool = DocumentRetrieverTool()
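
# Minimal usage sketch. The task ID and query below are illustrative only and
# assume a matching temp_example.txt file already exists in the working
# directory; adapt them to a real GAIA task before running.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        result = await document_retriever_tool.aparse(
            task_id="example",  # hypothetical task ID
            query="What does the document say about revenue?",
            file_type="txt",
        )
        print(result)

    asyncio.run(_demo())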