# Spaces: SustainabilityLabIITGN / VayuChat (Running)
# File size: 5,459 Bytes
# Revision: 95b3c75

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import calendar
import numpy as np
# Set professional matplotlib styling with high resolution
#plt.style.use('vayuchat.mplstyle')
# Load the air-quality table and convert its Timestamp column from text to
# datetime in the same expression, so `df` is never visible half-prepared.
df = pd.read_csv("AQ_met_data.csv").assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"])
)
# Supporting tables: per-state statistics and NCAP funding per city.
states_df = pd.read_csv("states_data.csv")
ncap_df = pd.read_csv("ncap_funding_data.csv")
# df is a pandas DataFrame with air quality data from India. The data is daily, covering 2017 to 2024, and has the following columns and data types:
# Unnamed: 0                int64
# Timestamp        datetime64[ns]
# State                    object
# City                     object
# Station                  object
# site_id                  object
# Year                      int64
# PM2.5 (µg/m³)           float64
# PM10 (µg/m³)            float64
# NO (µg/m³)              float64
# NO2 (µg/m³)             float64
# NOx (ppb)               float64
# NH3 (µg/m³)             float64
# SO2 (µg/m³)             float64
# CO (mg/m³)              float64
# Ozone (µg/m³)           float64
# AT (°C)                 float64
# RH (%)                  float64
# WS (m/s)                float64
# WD (deg)                float64
# RF (mm)                 float64
# TOT-RF (mm)             float64
# SR (W/mt2)              float64
# BP (mmHg)               float64
# VWS (m/s)               float64
# dtype: object
# states_df is a pandas DataFrame with the population and area of each Indian state, and whether the state is a union territory.
# state               object
# population           int64
# area (km2)           int64
# isUnionTerritory      bool
# dtype: object
# ncap_df is a pandas DataFrame of funding given to the cities of India from 2019-2022, under The National Clean Air Program (NCAP).
# S. No.                                 int64
# state                                 object
# city                                  object
# Amount released during FY 2019-20    float64
# Amount released during FY 2020-21    float64
# Amount released during FY 2021-22    float64
# Total fund released                  float64
# Utilisation as on June 2022          float64
# dtype: object
# Question: Compare the wind speed and PM2.5 levels during Delhi’s most polluted week (highest PM2.5) in December 2024 with the previous 15 days and the following 15 days on a time series plot.
# Generate code to answer the question and save result in 'answer' variable
# If creating a plot, save it with a unique filename and store the filename in 'answer'
# If returning text/numbers, store the result directly in 'answer'


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import uuid
import numpy as np

# Build `answer`: a dual-axis time series of Delhi's daily PM2.5 and wind speed
# covering the most polluted 7-day window of December 2024 plus up to 15 days
# of context on either side. `answer` is always set: to the saved plot's
# filename on success, or to a short explanatory message otherwise.
if df.empty:
    answer = "No data available"
else:
    try:
        pm_col = 'PM2.5 (µg/m³)'
        ws_col = 'WS (m/s)'

        # Keep every Delhi record, not only December ones: the 15 days of
        # context before the week can reach back into November.
        delhi_rows = df[df['City'].str.contains('Delhi', case=False, na=False)]

        # The table has one row per station per day, so collapse it to a single
        # mean value per calendar day. Without this a "7-row" window would span
        # station rows rather than days. `mean` skips NaNs, so a station that
        # is missing one variable does not discard the other.
        daily = (
            delhi_rows
            .groupby(delhi_rows['Timestamp'].dt.normalize())[[pm_col, ws_col]]
            .mean()
            .sort_index()
        )

        # Candidate weeks are 7-day windows lying entirely inside December
        # 2024, i.e. windows whose last day falls between 7 and 31 December.
        candidate_weeks = pd.Series(dtype=float)
        if not daily.empty:
            # A time-based window keeps each window at exactly seven calendar
            # days even when a date is absent from the index; min_periods=7
            # demands a valid PM2.5 value on every one of those days.
            weekly_pm = daily[pm_col].rolling('7D', min_periods=7).mean()
            candidate_weeks = weekly_pm.loc[
                pd.Timestamp(2024, 12, 7):pd.Timestamp(2024, 12, 31)
            ].dropna()

        if candidate_weeks.empty:
            answer = "Insufficient data"
        else:
            # The rolling mean is labelled by the last day of its window.
            week_end = candidate_weeks.idxmax()
            week_start = week_end - pd.Timedelta(days=6)

            # Context: 15 days before the week starts and 15 days after it
            # ends. Days beyond the end of the dataset are simply absent.
            window_start = week_start - pd.Timedelta(days=15)
            window_end = week_end + pd.Timedelta(days=15)
            window = daily.loc[window_start:window_end].dropna(how='all')

            fig, ax1 = plt.subplots(figsize=(9, 6))
            try:
                ax1.plot(window.index, window[pm_col], color='tab:red',
                         marker='o', markersize=3, label='PM2.5 (µg/m³)')
                ax1.set_ylabel('PM2.5 (µg/m³)', color='tab:red')
                ax1.tick_params(axis='y', labelcolor='tab:red')
                # Label the primary axis: the twin's x-axis is hidden, so an
                # xlabel set after twinx() would never be shown.
                ax1.set_xlabel('Date')
                ax1.set_title('Delhi – PM2.5 and Wind Speed around Most Polluted Week (Dec 2024)')

                # Shade the most polluted week so the before / during / after
                # periods can be compared at a glance.
                ax1.axvspan(
                    week_start, week_end, color='tab:orange', alpha=0.2,
                    label=f'Most polluted week ({week_start:%d %b}–{week_end:%d %b})',
                )

                ax2 = ax1.twinx()
                ax2.plot(window.index, window[ws_col], color='tab:blue',
                         marker='o', markersize=3, label='Wind Speed (m/s)')
                ax2.set_ylabel('Wind Speed (m/s)', color='tab:blue')
                ax2.tick_params(axis='y', labelcolor='tab:blue')

                # One combined legend, drawn on the top-most axes so neither
                # series is painted over it.
                handles_pm, labels_pm = ax1.get_legend_handles_labels()
                handles_ws, labels_ws = ax2.get_legend_handles_labels()
                ax2.legend(handles_pm + handles_ws, labels_pm + labels_ws,
                           loc='upper left')

                fig.autofmt_xdate()
                fig.tight_layout()

                # Unique filename so repeated or concurrent runs never
                # overwrite each other's plots.
                filename = f"plot_{uuid.uuid4().hex}.png"
                fig.savefig(filename, dpi=1200, bbox_inches='tight', facecolor='white')
            finally:
                # Release the figure even if plotting or saving fails.
                plt.close(fig)

            answer = filename
    except Exception:
        # Top-level boundary of the generated script: the caller only reads
        # `answer`, so report a generic failure instead of crashing.
        answer = "Unable to complete analysis with available data"