""" Tool for parsing causal inference queries. This module provides a LangChain tool for parsing causal inference queries, extracting key elements, and guiding the workflow to the next step. """ import logging import re from typing import Dict, Any, Optional from langchain_core.tools import tool from auto_causal.components.input_parser import parse_input from auto_causal.config import get_llm_client from auto_causal.components.state_manager import create_workflow_state_update import json logger = logging.getLogger(__name__) @tool def input_parser_tool(input_text: str) -> Dict[str, Any]: """ Parse the user's initial input text to extract query, dataset path, and description. This tool uses regex to find structured information within the input text and then leverages an LLM for more complex NLP tasks on the extracted query. Args: input_text: The combined initial input string from the user/system. Returns: Dict containing parsed query information, path, description, and workflow state. """ logger.info(f"Running input_parser_tool on input: '{input_text[:100]}...'") # --- Extract structured info using Regex --- query = None dataset_path = None dataset_description = None query_match = re.search(r"My question is: (.*?)\n", input_text, re.IGNORECASE) if query_match: query = query_match.group(1).strip() path_match = re.search(r"The dataset is located at: (.*?)\n", input_text, re.IGNORECASE) if path_match: dataset_path = path_match.group(1).strip() # Use re.search to find the description potentially anywhere after its label desc_match = re.search(r"Dataset Description: (.*)", input_text, re.DOTALL | re.IGNORECASE) if desc_match: # Strip leading/trailing whitespace/newlines from the captured group dataset_description = desc_match.group(1).strip() if not query: logger.warning("Could not extract query from input_text using regex. 
Attempting full text as query.") # Fallback: This is risky if input_text contains boilerplate query = input_text logger.info(f"Extracted - Query: '{query[:50]}...', Path: '{dataset_path}', Desc: '{str(dataset_description)[:50]}...'") # --- Get LLM and Parse Query --- try: llm_instance = get_llm_client() except Exception as e: logger.error(f"Failed to initialize LLM for input_parser_tool: {e}") return {"error": f"LLM Initialization failed: {e}", "workflow_state": {}} # Call the component function to parse the extracted query try: parsed_info = parse_input( query=query, dataset_path_arg=dataset_path, # Use extracted path dataset_info=None, # This arg seems unused by parse_input now llm=llm_instance ) except Exception as e: logger.error(f"Error during parse_input execution: {e}", exc_info=True) return {"error": f"Input parsing failed: {e}", "workflow_state": {}} # Create workflow state update workflow_update = create_workflow_state_update( current_step="input_processing", step_completed_flag="query_parsed", next_tool="dataset_analyzer_tool", next_step_reason="Now that we understand the query, we need to analyze the dataset structure" ) # Combine results with workflow state result = { "original_query": parsed_info.get("original_query", query), # Fallback to regex query "dataset_path": parsed_info.get("dataset_path") or dataset_path, # Use extracted if component missed it "query_type": parsed_info.get("query_type"), "extracted_variables": parsed_info.get("extracted_variables", {}), "constraints": parsed_info.get("constraints", []), # Pass dataset_description along "dataset_description": dataset_description } print('before workflow: ', result) # Add workflow state to the result result.update(workflow_update) print('after workflow: ', result) logger.info("input_parser_tool finished successfully.") return result
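

# --- Illustrative usage ---
# A minimal sketch of how this tool might be invoked directly. In practice the
# surrounding agent framework calls it; the question, path, and description below
# are hypothetical values chosen to match the regex patterns above, and a working
# LLM client must be configured for get_llm_client() to succeed.
if __name__ == "__main__":
    sample_input = (
        "My question is: What is the effect of the training program on earnings?\n"
        "The dataset is located at: data/example.csv\n"
        "Dataset Description: Observational data on program participants."
    )
    # @tool-decorated functions are LangChain tools and are called via .invoke()
    # with a dict of their arguments.
    output = input_parser_tool.invoke({"input_text": sample_input})
    print(output.get("original_query"), output.get("dataset_path"))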