GaiaAgentEvaluator / utils /text_processing.py
davidgturner's picture
- changes for app.py
08e2c16
# Utility functions for GAIA Agent Evaluator
import re

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from config import GAIA_KNOWLEDGE, ANSWER_PREFIXES_TO_REMOVE, LLM_RESPONSE_MARKERS, LLM_END_MARKERS
def create_knowledge_documents():
    """Split GAIA_KNOWLEDGE into chunks and wrap each chunk in a Document.

    The text is split with a RecursiveCharacterTextSplitter configured with
    chunk_size=500 and chunk_overlap=50, trying paragraph breaks first and
    progressively finer separators afterwards.

    Returns:
        A list of Document objects, one per chunk, in splitter order.
    """
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 50,
        # Ordered from coarsest to finest separator.
        "separators": ["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return [
        Document(page_content=piece)
        for piece in splitter.split_text(GAIA_KNOWLEDGE)
    ]
def clean_llm_response(response, prompt, response_markers=None, end_markers=None):
    """Extract the model's answer from a raw LLM response, preserving its case.

    The prompt is removed when the response begins with it, everything up to
    and including the first occurrence of each response marker is discarded,
    and the text is truncated at the first occurrence of each end marker.
    Markers are matched case-insensitively, but the returned text keeps the
    capitalization of the original response.

    Args:
        response: Raw text returned by the LLM.
        prompt: Prompt that was sent to the LLM; stripped when it is echoed
            at the very start of ``response``.
        response_markers: Iterable of strings that introduce the answer.
            Defaults to ``LLM_RESPONSE_MARKERS`` from ``config``.
        end_markers: Iterable of strings that terminate the answer.
            Defaults to ``LLM_END_MARKERS`` from ``config``.

    Returns:
        The cleaned answer with surrounding whitespace removed; an empty
        string when ``response`` is empty or ``None``.
    """
    if not response:
        return ""

    if response_markers is None:
        response_markers = LLM_RESPONSE_MARKERS
    if end_markers is None:
        end_markers = LLM_END_MARKERS

    # Remove the prompt from the beginning if the model echoed it back.
    if prompt and response.startswith(prompt):
        response = response[len(prompt):]

    # Keep only what follows each response marker.  Matching is done with a
    # case-insensitive regex on the ORIGINAL text so that slicing never
    # lowercases the answer (lowercasing would break exact-match scoring).
    for marker in response_markers:
        if not marker:
            continue  # an empty marker would match everywhere
        found = re.search(re.escape(marker), response, flags=re.IGNORECASE)
        if found:
            response = response[found.end():].strip()

    # Cut the text at the first occurrence of each closing marker.
    for marker in end_markers:
        if not marker:
            continue
        found = re.search(re.escape(marker), response, flags=re.IGNORECASE)
        if found:
            response = response[:found.start()].strip()

    return response.strip()
def extract_final_answer(answer, prefixes=None):
    """Extract and clean the final answer for exact matching.

    Cleaning steps, applied in order:
      1. Trim surrounding whitespace.
      2. Remove each known lead-in prefix (case-insensitive) found at the
         start of the text.
      3. Remove one pair of matching quotes that wraps the whole answer.
      4. Remove a single trailing period.
      5. Collapse runs of whitespace into single spaces.

    Args:
        answer: Raw answer text.
        prefixes: Iterable of lead-in phrases to strip.  Defaults to
            ``ANSWER_PREFIXES_TO_REMOVE`` from ``config``.

    Returns:
        The normalized answer string.
    """
    if prefixes is None:
        prefixes = ANSWER_PREFIXES_TO_REMOVE

    cleaned = answer.strip()

    # Remove prefixes (case insensitive).
    for prefix in prefixes:
        if prefix and cleaned.lower().startswith(prefix.lower()):
            cleaned = cleaned[len(prefix):].strip()

    # Remove quotes if the entire answer is quoted.  The length check keeps a
    # lone quote character from being treated as an empty quoted string.
    if len(cleaned) >= 2 and cleaned[0] == cleaned[-1] and cleaned[0] in ('"', "'"):
        cleaned = cleaned[1:-1]

    # A period in final position is sentence punctuation, never a decimal
    # separator, so it is dropped for numeric answers too ("42." -> "42").
    # Decimal points inside a number ("3.14") are not at the end and are
    # therefore untouched.
    if cleaned.endswith('.'):
        cleaned = cleaned[:-1]

    # Clean up extra whitespace.
    return ' '.join(cleaned.split())
def format_prompt(question, context=""):
    """Build the LLM prompt for a question, optionally including context.

    Args:
        question: The question text to embed in the prompt.
        context: Optional supporting text.  When truthy, the prompt gains a
            "Context Information:" section and uses the context-specific
            instruction list; otherwise the knowledge-only variant is used.

    Returns:
        The prompt as a single newline-separated string ending with
        "Direct Answer:".
    """
    if not context:
        # Knowledge-only variant: no context section.
        lines = (
            'You are a precise AI assistant that answers questions accurately. '
            'Your answer will be evaluated with exact string matching, so provide '
            'only the specific answer requested without additional text.',
            f'Question: {question}',
            'Critical Instructions:',
            '- Provide ONLY the exact answer requested, nothing else',
            '- Do not include phrases like "The answer is", "Final answer", or explanations',
            '- For numerical answers, use the exact format that would be expected',
            '- For lists, use appropriate formatting (commas, spaces, etc.)',
            '- For names, use proper capitalization',
            '- Be concise and precise - extra words will cause evaluation failure',
            '- Answer based on your knowledge and reasoning',
            'Direct Answer:',
        )
    else:
        # Context-grounded variant.
        lines = (
            'You are a precise AI assistant that answers questions using available '
            'information. Your answer will be evaluated with exact string matching, '
            'so provide only the specific answer requested without additional text.',
            'Context Information:',
            f'{context}',
            f'Question: {question}',
            'Critical Instructions:',
            '- Provide ONLY the exact answer requested, nothing else',
            '- Do not include phrases like "The answer is", "Final answer", or "Based on the context"',
            '- For numerical answers, use the exact format requested (integers, decimals, etc.)',
            '- For lists, use the exact formatting specified in the question (commas, spaces, etc.)',
            '- For names, use proper capitalization as would appear in official sources',
            '- Be concise and precise - extra words will cause evaluation failure',
            '- If the question asks for multiple items, provide them in the exact format requested',
            'Direct Answer:',
        )
    return "\n".join(lines)