"""
Data Analyzer class for causal inference pipelines.
This module provides the DataAnalyzer class for analyzing datasets
and extracting relevant information for causal inference.
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional
class DataAnalyzer:
    """
    Data analyzer for causal inference datasets.

    Provides methods for analyzing a dataset to extract information
    relevant to causal inference: column categories, temporal/panel
    structure, variable relationships, candidate instrumental variables,
    and potential discontinuities (for regression discontinuity designs).
    """

    def __init__(self, verbose: bool = False):
        """
        Initialize the data analyzer.

        Args:
            verbose: Whether to print progress and error information.
        """
        self.verbose = verbose

    def analyze_dataset(self, dataset_path: str) -> Dict[str, Any]:
        """
        Analyze a CSV dataset and extract relevant information.

        Args:
            dataset_path: Path to the dataset file (read with pd.read_csv).

        Returns:
            Dictionary with dataset analysis results. On failure, the
            returned dictionary carries an "error" key plus empty/default
            values for every other field, so callers always see the same
            schema regardless of success or failure.
        """
        try:
            df = pd.read_csv(dataset_path)

            n_rows, n_cols = df.shape
            columns = list(df.columns)

            # Assemble the analysis from the per-aspect helpers below.
            analysis = {
                "filepath": dataset_path,
                "n_rows": n_rows,
                "n_cols": n_cols,
                "columns": columns,
                "column_types": {col: str(df[col].dtype) for col in columns},
                "column_categories": self._categorize_columns(df),
                "temporal_structure": self._check_temporal_structure(df),
                "variable_relationships": self._identify_relationships(df),
                "potential_instruments": self._identify_potential_instruments(df),
                "discontinuities": self._check_discontinuities(df),
            }

            if self.verbose:
                print(f"Dataset analysis completed: {n_rows} rows, {n_cols} columns")
            return analysis
        except Exception as e:
            # Boundary handler: analysis is best-effort, so rather than
            # propagating (e.g. a missing file or parse error), return a
            # schema-stable fallback that downstream code can consume.
            if self.verbose:
                print(f"Error analyzing dataset: {str(e)}")
            return {
                "error": str(e),
                "filepath": dataset_path,
                "n_rows": 0,
                "n_cols": 0,
                "columns": [],
                "column_types": {},
                "column_categories": {},
                "temporal_structure": {"has_temporal_structure": False},
                "variable_relationships": {"potential_confounders": []},
                "potential_instruments": [],
                "discontinuities": {"has_discontinuities": False},
            }

    def _categorize_columns(self, df: pd.DataFrame) -> Dict[str, str]:
        """
        Categorize columns by data type.

        Categories: 'binary' (bool dtype or <= 2 distinct values),
        'continuous' (numeric, > 2 distinct values), 'categorical'
        (non-numeric, 3-10 distinct values), 'high_cardinality'
        (non-numeric, > 10 distinct values).

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary mapping column names to categories.
        """
        categories: Dict[str, str] = {}
        for col in df.columns:
            # BUG FIX: the original used len(df[col].unique()) for numeric
            # columns, which counts NaN as a distinct value — so a 0/1
            # column with missing entries was miscategorized as
            # 'continuous'. nunique() excludes NaN and matches the
            # counting already used for non-numeric columns.
            n_unique = df[col].nunique()
            if df[col].dtype == 'bool':
                categories[col] = 'binary'
            elif pd.api.types.is_numeric_dtype(df[col]):
                categories[col] = 'binary' if n_unique <= 2 else 'continuous'
            else:
                if n_unique <= 2:
                    categories[col] = 'binary'
                elif n_unique <= 10:
                    categories[col] = 'categorical'
                else:
                    categories[col] = 'high_cardinality'
        return categories

    def _check_temporal_structure(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Check for temporal structure in the dataset.

        Detection is purely name-based: a column whose (lowercased) name
        contains a time-like or id-like keyword is flagged.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with keys "has_temporal_structure",
            "is_panel_data", "time_variables", and "id_variables".
        """
        # Columns whose names suggest a date/time dimension.
        date_cols = [col for col in df.columns if
                     any(keyword in col.lower() for keyword in
                         ['date', 'time', 'year', 'month', 'day', 'period'])]
        # Columns whose names suggest an entity/group identifier
        # (time + id together indicates panel data).
        id_cols = [col for col in df.columns if
                   any(keyword in col.lower() for keyword in
                       ['id', 'group', 'entity', 'unit'])]
        return {
            "has_temporal_structure": len(date_cols) > 0,
            "is_panel_data": len(date_cols) > 0 and len(id_cols) > 0,
            "time_variables": date_cols,
            "id_variables": id_cols,
        }

    def _identify_relationships(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Identify potential variable relationships.

        This is a simplified placeholder; a real implementation would use
        statistical tests or causal discovery algorithms.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with a (currently empty) "potential_confounders" list.
        """
        return {
            "potential_confounders": []
        }

    def _identify_potential_instruments(self, df: pd.DataFrame) -> List[str]:
        """
        Identify potential instrumental variables.

        Simplified, name-based heuristic; a real implementation would use
        statistical tests of the instrument relevance/exclusion conditions.

        Args:
            df: Pandas DataFrame

        Returns:
            List of column names that look like instruments by naming.
        """
        potential_instruments = [col for col in df.columns if
                                 any(keyword in col.lower() for keyword in
                                     ['instrument', 'random', 'assignment', 'iv'])]
        return potential_instruments

    def _check_discontinuities(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Check for potential discontinuities for RDD.

        Simplified placeholder; a real implementation would test candidate
        running variables for jumps at a cutoff.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with "has_discontinuities" (always False here) and
            an empty "potential_running_variables" list.
        """
        return {
            "has_discontinuities": False,
            "potential_running_variables": []
        }