# Source: causal-agent/auto_causal/tools/input_parser_tool.py
# Author avatar: FireShadow
# Commit message: Initial clean commit
# Commit: 1721aea
"""
Tool for parsing causal inference queries.
This module provides a LangChain tool for parsing causal inference queries,
extracting key elements, and guiding the workflow to the next step.
"""
import logging
import re
from typing import Dict, Any, Optional
from langchain_core.tools import tool
from auto_causal.components.input_parser import parse_input
from auto_causal.config import get_llm_client
from auto_causal.components.state_manager import create_workflow_state_update
import json
# Module-level logger named after this module, so log records from the tool
# can be filtered/configured per module by the application's logging setup.
logger = logging.getLogger(__name__)
def _extract_labeled_fields(input_text: str):
    """Pull the labelled query, dataset path and description out of *input_text*.

    Looks for the literal labels ``My question is: ``,
    ``The dataset is located at: `` and ``Dataset Description: ``
    (case-insensitively).

    Returns:
        A ``(query, dataset_path, dataset_description)`` tuple; each element is
        the stripped text following its label, or ``None`` when the label is
        absent.
    """
    query = None
    dataset_path = None
    dataset_description = None

    # ``[^\n]*`` captures the rest of the line whether or not it is terminated
    # by a newline, so a label on the very last line of the input still matches
    # (the previous ``(.*?)\n`` pattern required a trailing newline).
    query_match = re.search(r"My question is: ([^\n]*)", input_text, re.IGNORECASE)
    if query_match:
        query = query_match.group(1).strip()

    path_match = re.search(
        r"The dataset is located at: ([^\n]*)", input_text, re.IGNORECASE
    )
    if path_match:
        dataset_path = path_match.group(1).strip()

    # With DOTALL the description spans multiple lines and runs to the end of
    # the input, i.e. everything after the label is treated as description.
    desc_match = re.search(
        r"Dataset Description: (.*)", input_text, re.DOTALL | re.IGNORECASE
    )
    if desc_match:
        dataset_description = desc_match.group(1).strip()

    return query, dataset_path, dataset_description


# NOTE: langchain's ``@tool`` exposes the function docstring as the tool
# description, so the docstring wording below is deliberately left unchanged.
@tool
def input_parser_tool(input_text: str) -> Dict[str, Any]:
    """
    Parse the user's initial input text to extract query, dataset path, and description.

    This tool uses regex to find structured information within the input text
    and then leverages an LLM for more complex NLP tasks on the extracted query.

    Args:
        input_text: The combined initial input string from the user/system.

    Returns:
        Dict containing parsed query information, path, description, and workflow state.
    """
    logger.info("Running input_parser_tool on input: '%s...'", input_text[:100])

    # --- Extract structured info using Regex ---
    query, dataset_path, dataset_description = _extract_labeled_fields(input_text)

    if not query:
        logger.warning(
            "Could not extract query from input_text using regex. "
            "Attempting full text as query."
        )
        # Fallback: This is risky if input_text contains boilerplate
        query = input_text

    logger.info(
        "Extracted - Query: '%s...', Path: '%s', Desc: '%s...'",
        query[:50],
        dataset_path,
        str(dataset_description)[:50],
    )

    # --- Get LLM and Parse Query ---
    # Failures are reported to the caller as an error dict (with an empty
    # workflow state) rather than raised.
    try:
        llm_instance = get_llm_client()
    except Exception as e:
        logger.error("Failed to initialize LLM for input_parser_tool: %s", e)
        return {"error": f"LLM Initialization failed: {e}", "workflow_state": {}}

    # Call the component function to parse the extracted query
    try:
        parsed_info = parse_input(
            query=query,
            dataset_path_arg=dataset_path,  # Use extracted path
            dataset_info=None,  # This arg seems unused by parse_input now
            llm=llm_instance,
        )
    except Exception as e:
        logger.error("Error during parse_input execution: %s", e, exc_info=True)
        return {"error": f"Input parsing failed: {e}", "workflow_state": {}}

    # Create workflow state update
    workflow_update = create_workflow_state_update(
        current_step="input_processing",
        step_completed_flag="query_parsed",
        next_tool="dataset_analyzer_tool",
        next_step_reason="Now that we understand the query, we need to analyze the dataset structure",
    )

    # Combine results with workflow state
    result = {
        # Fallback to regex query
        "original_query": parsed_info.get("original_query", query),
        # Use extracted path if the component missed it
        "dataset_path": parsed_info.get("dataset_path") or dataset_path,
        "query_type": parsed_info.get("query_type"),
        "extracted_variables": parsed_info.get("extracted_variables", {}),
        "constraints": parsed_info.get("constraints", []),
        # Pass dataset_description along
        "dataset_description": dataset_description,
    }
    # Debug-level logging instead of print(): keeps stdout clean for callers.
    logger.debug("before workflow: %s", result)

    # Add workflow state to the result
    result.update(workflow_update)
    logger.debug("after workflow: %s", result)

    logger.info("input_parser_tool finished successfully.")
    return result