# Repository path: causal-agent / main / run_cais.py
# Hosting-page header: FireShadow · "Initial clean commit" · 1721aea · 4.7 kB
# (The page's "raw" / "history" / "blame" navigation links are omitted.)
## This file runs the CAIS pipeline for a list of queries provided in a CSV file
import os, re, io, time, json, logging, contextlib, textwrap
from typing import Dict, Any
import pandas as pd
import argparse
from auto_causal.agent import run_causal_analysis
# Constants
# Number of seconds main() sleeps (via time.sleep) between processing
# consecutive rows of the metadata CSV.
RATE_LIMIT_SECONDS = 2
def run_cais(desc, question, df):
    """
    Forward a single query to the CAIS causal-analysis pipeline.

    This is a thin adapter: it only renames the arguments to the keyword
    names expected by ``run_causal_analysis`` and returns its result as-is.

    Args:
        desc (str): Description of the dataset
        question (str): Natural language query associated with the dataset
        df (str): Path to the CSV file associated with the dataset
            (a file path, despite the parameter name)

    Returns:
        dict: Results from the CAIS pipeline
    """
    pipeline_kwargs = {
        "query": question,
        "dataset_path": df,
        "dataset_description": desc,
    }
    return run_causal_analysis(**pipeline_kwargs)
def parse_args(argv=None):
    """
    Build the command-line parser for the batch runner and parse arguments.

    Args:
        argv (list[str] | None): Argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so existing
            ``parse_args()`` callers behave exactly as before. Passing an
            explicit list allows programmatic use.

    Returns:
        argparse.Namespace: Namespace with the attributes ``metadata_path``,
        ``data_dir``, ``output_dir``, ``output_name`` and ``llm_name``.
    """
    parser = argparse.ArgumentParser(description="Run batch causal analysis.")
    parser.add_argument(
        "-m", "--metadata_path", type=str, required=True,
        help="Path to the CSV file with queries, descriptions, and file names etc",
    )
    parser.add_argument(
        "-d", "--data_dir", type=str, required=True,
        help="Path to the folder containing the data in CSV format",
    )
    parser.add_argument(
        "-o", "--output_dir", type=str, required=True,
        help="Path to the folder where the output is saved",
    )
    # Optional: the only argument with a default value.
    parser.add_argument(
        "-n", "--output_name", type=str, default="cais_results.json",
        help="Name of the output JSON file (default: %(default)s)",
    )
    parser.add_argument(
        "-l", "--llm_name", type=str, required=True,
        help="Name of the LLM to be used",
    )
    return parser.parse_args(argv)
def _json_default(value):
    """Fallback for ``json.dump``: convert values it cannot encode natively.

    Objects exposing a ``tolist`` method (e.g. numpy scalars and arrays) are
    converted through it; anything else is stored as its string form so that
    one unusual value cannot abort writing the results of the whole batch.
    """
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    return str(value)


def _format_result(row, res, data_path):
    """Merge one metadata row and its pipeline output into the output record."""
    record = {
        "query": row["natural_language_query"],
        "method": row["method"],
        "answer": row["answer"],
        "dataset_description": row["data_description"],
        "dataset_path": data_path,
        "keywords": row.get("keywords", "Causality, Average treatment effect"),
    }
    # Look the nested sections up once instead of once per field.
    estimates = res['results']['results']
    variables = res['results']['variables']
    record["final_result"] = {
        "method": estimates.get("method_used"),
        "causal_effect": estimates.get("effect_estimate"),
        "standard_deviation": estimates.get("standard_error"),
        "treatment_variable": variables.get("treatment_variable", None),
        "outcome_variable": variables.get("outcome_variable", None),
        "covariates": variables.get("covariates", []),
        "instrument_variable": variables.get("instrument_variable", None),
        "running_variable": variables.get("running_variable", None),
        "temporal_variable": variables.get("time_variable", None),
        "statistical_test_results": res.get("summary", ""),
        "explanation_for_model_choice": res.get("explanation", ""),
        "regression_equation": res.get("regression_equation", ""),
    }
    return record


def main():
    """
    Run the CAIS pipeline for every row of the metadata CSV and save the
    collected results as a single JSON file.

    Steps:
        1. Parse the command-line arguments and export the chosen LLM name
           through the ``LLM_MODEL`` environment variable.
        2. Load the metadata CSV (logs an error and returns if it is missing).
        3. For each row, call ``run_cais`` on the referenced dataset and store
           a formatted record keyed by the row index. If a row fails, the
           error is logged and ``{"answer": <error text>}`` is stored instead,
           so one bad row does not stop the batch.
        4. Write all records to ``<output_dir>/<output_name>`` (``.json`` is
           appended when missing).

    Returns:
        None
    """
    args = parse_args()
    metadata_path = args.metadata_path
    data_dir = args.data_dir
    output_dir = args.output_dir
    output_name = args.output_name
    os.environ["LLM_MODEL"] = args.llm_name
    print("[main] Starting batch processing…")
    if not os.path.exists(metadata_path):
        logging.error(f"Meta file not found: {metadata_path}")
        return
    meta_df = pd.read_csv(metadata_path)
    total_rows = len(meta_df)
    print(f"[main] Loaded metadata CSV with {total_rows} rows.")

    # Prepare the output location up front: an unusable output directory
    # should fail before the pipeline calls, not after all rows are processed.
    os.makedirs(output_dir, exist_ok=True)
    output_json = os.path.join(output_dir, output_name)
    if not output_json.endswith(".json"):
        output_json += ".json"

    results: Dict[int, Dict[str, Any]] = {}
    for idx, row in meta_df.iterrows():
        data_path = os.path.join(data_dir, str(row["data_files"]))
        print(f"\n[main] Row {idx+1}/{total_rows} → Dataset: {data_path}")
        try:
            res = run_cais(desc=row["data_description"],
                           question=row["natural_language_query"],
                           df=data_path)
            # Format result according to specified structure
            formatted_result = _format_result(row, res, data_path)
            results[idx] = formatted_result
            print(f"[main] Formatted result for row {idx+1}:", formatted_result)
        except Exception as e:
            # Deliberate best-effort: record the failure and continue.
            logging.error(f"[{idx+1}] Error: {e}")
            results[idx] = {"answer": str(e)}
        # Pause between rows (after successes and failures alike).
        time.sleep(RATE_LIMIT_SECONDS)

    with open(output_json, "w", encoding="utf-8") as f:
        # ``default`` keeps one non-JSON-native value (e.g. a numpy scalar)
        # from raising TypeError here and discarding the whole batch.
        json.dump(results, f, indent=4, default=_json_default)
    print(f"[main] Done. Predictions saved to {output_json}")
# Run the batch pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()