# 0. PACKAGE INSTALLATION
%pip install --quiet pandas numpy matplotlib seaborn scipy scikit-learn missingno yfinance pandas-datareader requests pgmpy
import warnings
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import median_abs_deviation
import requests
import yfinance as yf
import pandas_datareader.data as web
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import pgmpy

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-paper')

Note: you may need to restart the kernel to use updated packages.

import pandas as pd
import pandas_datareader.data as web
import requests
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

START_DATE = "2015-01-01"
END_DATE = "2023-12-31"

FRED_API_KEY = "6071482a4efea6801348a6fd83196365" 
EIA_API_KEY = "JSHoiaSh9e3PUtozc9UwHoRrSpa9xTniUZWSUPfa"

fred_series = {
    "US_Ind_Production": "INDPRO",              
    "US_Energy_CPI": "CPIENGSL",                
    "Global_Econ_Policy_Uncert": "GEPUCURRENT"  
}

df_macro = pd.DataFrame()
for name, ticker in fred_series.items():
    df_macro[name] = web.DataReader(ticker, "fred", START_DATE, END_DATE)

def fetch_eia_data(series_id, name):
    url = f"https://api.eia.gov/v2/seriesid/{series_id}?api_key={EIA_API_KEY}"
    try:
        response = requests.get(url).json()
        dates = [row['period'] for row in response['response']['data']]
        values = [row['value'] for row in response['response']['data']]
        df = pd.DataFrame({name: values}, index=pd.to_datetime(dates))
        return df.sort_index()
    except Exception as e:
        return pd.DataFrame()

eia_series = {
    "OPEC_Production": "STEO.PAPR_OPEC.M",
    "NonOPEC_Production": "STEO.PAPR_NONOPEC.M",
    "OPEC_Spare_Capacity": "STEO.COPS_OPEC.M",
    "OECD_GDP": "STEO.RGDPQ_OECD.M"
}

df_geo = pd.DataFrame()
for name, series_id in eia_series.items():
    eia_data = fetch_eia_data(series_id, name)
    if not eia_data.empty:
        df_geo = pd.concat([df_geo, eia_data], axis=1)

df_student_A = pd.concat([df_macro, df_geo], axis=1).resample('M').last()
df_student_A = df_student_A.loc[START_DATE:END_DATE]
df_student_A.interpolate(method='linear', inplace=True)

output_filename = "student_A_macro_geo_data.csv"
df_student_A.to_csv(output_filename)

df_student_A.plot(subplots=True, figsize=(12, 10), title="Figure 1: Macro & Geopolitical Variables")
plt.tight_layout()
plt.show()

micro_tickers = {
    "Exxon_Mobil_Corp": "XOM",
    "Oil_Services_ETF": "OIH",
    "Delta_Airlines": "DAL"
}

df_corporate = pd.DataFrame()
for name, ticker in micro_tickers.items():
    df_corporate[name] = yf.download(ticker, start=START_DATE, end=END_DATE)["Close"]

df_corporate = df_corporate.resample('M').last()

fred_micro = {"US_Vehicle_Sales": "TOTALSA"}
df_consumer = pd.DataFrame()
for name, ticker in fred_micro.items():
    df_consumer[name] = web.DataReader(ticker, "fred", START_DATE, END_DATE)
df_consumer = df_consumer.resample('M').last()

df_student_B = pd.concat([df_corporate, df_consumer], axis=1)
df_student_B.interpolate(method='linear', inplace=True)

df_student_B.plot(subplots=True, figsize=(12, 8), title="Figure 2: Microeconomic Variables")
plt.tight_layout()
plt.show()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

financial_tickers = {
    "Crude_Oil_WTI": "CL=F",
    "SP500_Index": "^GSPC",
    "USD_Index": "DX-Y.NYB",
    "Gold": "GC=F",
    "Volatility_Index_VIX": "^VIX",
    "High_Yield_Bonds_HYG": "HYG" 
}

df_student_C = pd.DataFrame()

for name, ticker in financial_tickers.items():
    df_student_C[name] = yf.download(ticker, start=START_DATE, end=END_DATE)["Close"]

df_student_C = df_student_C.resample('M').last()
df_student_C.interpolate(method='linear', inplace=True)

df_student_C.plot(subplots=True, figsize=(12, 12), title="Figure 3: Financial & Securities Variables")
plt.tight_layout()
plt.show()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

def generate_data_dictionary(df, source_mapping):
    """
    Generates a comprehensive metadata table for the given DataFrame.
    Reflects statistical rigor by extracting actual frequency, start/end dates,
    and missingness profile before the cleaning phase.
    """
    # Identify inferred frequency or default to 'M' (Month End) based on prior resampling
    freq = df.index.inferred_freq if df.index.inferred_freq else "M"
    
    summary = pd.DataFrame({
        "Variable": df.columns,
        "Source": [source_mapping.get(col, "Unknown") for col in df.columns],
        "Frequency": freq,
        "Start_Date": df.index.min().strftime('%Y-%m-%d'),
        "End_Date": df.index.max().strftime('%Y-%m-%d'),
        "Observations": df.count().values,
        "Missing_Values": df.isna().sum().values,
        "Mean": df.mean().round(2).values,
        "Std_Dev": df.std().round(2).values
    })
    
    return summary

source_map_A = {
    "US_Ind_Production": "FRED",
    "US_Energy_CPI": "FRED",
    "Global_Econ_Policy_Uncert": "FRED",
    "OPEC_Production": "EIA STEO",
    "NonOPEC_Production": "EIA STEO",
    "OPEC_Spare_Capacity": "EIA STEO",
    "OECD_GDP": "EIA STEO"
}

dict_student_A = generate_data_dictionary(df_student_A, source_map_A)

display(dict_student_A) 

print(f"Total Macro/Geo Dimensions: {df_student_A.shape}")
print(f"Overall Data Completeness: {((df_student_A.notna().sum().sum() / df_student_A.size) * 100):.2f}%\n")

Total Macro/Geo Dimensions: (108, 6)
Overall Data Completeness: 100.00%

source_map_B = {
    "Exxon_Mobil_Corp": "Yahoo Finance",
    "Oil_Services_ETF": "Yahoo Finance",
    "Delta_Airlines": "Yahoo Finance",
    "US_Vehicle_Sales": "FRED"
}

dict_student_B = generate_data_dictionary(df_student_B, source_map_B)

display(dict_student_B)

print(f"Total Micro Dimensions: {df_student_B.shape}")
print(f"Overall Data Completeness: {((df_student_B.notna().sum().sum() / df_student_B.size) * 100):.2f}%\n")

Total Micro Dimensions: (108, 4)
Overall Data Completeness: 100.00%

source_map_C = {
    "Crude_Oil_WTI": "Yahoo Finance",
    "SP500_Index": "Yahoo Finance",
    "USD_Index": "Yahoo Finance",
    "Gold": "Yahoo Finance",
    "Volatility_Index_VIX": "Yahoo Finance",
    "High_Yield_Bonds_HYG": "Yahoo Finance"
}

dict_student_C = generate_data_dictionary(df_student_C, source_map_C)

print("--- STUDENT C: FINANCIAL METADATA ---")
display(dict_student_C)

print(f"Total Financial Dimensions: {df_student_C.shape}")
print(f"Overall Data Completeness: {((df_student_C.notna().sum().sum() / df_student_C.size) * 100):.2f}%\n")

--- STUDENT C: FINANCIAL METADATA ---

Total Financial Dimensions: (108, 6)
Overall Data Completeness: 100.00%

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf # Required for tickers used in Student B/C
from scipy.stats import median_abs_deviation
from sklearn.covariance import EllipticEnvelope

df_master = pd.concat([df_student_A, df_student_B, df_student_C], axis=1).dropna()

def detect_univariate_outliers_mad(series, window=6, threshold=3.0):
    rolling_median = series.rolling(window=window, center=True).median()
    rolling_mad = series.rolling(window=window, center=True).apply(
        lambda x: median_abs_deviation(x[~np.isnan(x)], scale='normal') if len(x[~np.isnan(x)]) > 0 else 0
    )
    robust_z_score = np.abs(series - rolling_median) / (rolling_mad + 1e-8)
    outliers = robust_z_score > threshold
    return outliers, rolling_median

def detect_multivariate_outliers_mcd(df, contamination=0.05):
    df_clean = df.dropna()
    mcd = EllipticEnvelope(contamination=contamination, random_state=42)
    mcd.fit(df_clean)
    preds = mcd.predict(df_clean)
    outlier_dates = df_clean.index[preds == -1]
    return outlier_dates

target_var = 'Crude_Oil_WTI'
multivariate_outliers = detect_multivariate_outliers_mcd(df_master)

if target_var in df_master.columns:
    outlier_mask, trend = detect_univariate_outliers_mad(df_master[target_var])
    
    fig, ax = plt.subplots(figsize=(14, 7))
    
    ax.plot(df_master.index, df_master[target_var], label='WTI Price', color='steelblue', alpha=0.5, lw=2)
    ax.plot(df_master.index, trend, label='Rolling Median (Trend)', color='darkorange', linestyle='--', alpha=0.8)
    
    if outlier_mask.any():
        ax.scatter(df_master.index[outlier_mask], df_master[target_var][outlier_mask], 
                   color='crimson', label='Univariate Outliers (MAD)', zorder=5, marker='x', s=120)
    
    if len(multivariate_outliers) > 0:
        ax.scatter(multivariate_outliers, df_master.loc[multivariate_outliers, target_var], 
                   edgecolor='blue', facecolor='none', s=200, label='Multivariate Outliers (MCD)', 
                   zorder=4, linewidth=2)

    ax.set_title(f"Figure 4: Comparative Outlier Detection (Univariate vs. Multivariate) - {target_var}")
    ax.set_ylabel("Price / Value")
    ax.legend(loc='best', frameon=True)
    plt.grid(True, alpha=0.3)
    plt.show()

print(f"Master DataFrame dimensions: {df_master.shape}")
print(f"Dates identified as Multivariate Outliers:\n{multivariate_outliers.strftime('%Y-%m-%d').tolist()}")

Master DataFrame dimensions: (108, 16)
Dates identified as Multivariate Outliers:
['2020-03-31', '2020-04-30', '2020-05-31', '2020-06-30', '2020-10-31', '2021-02-28']

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

def audit_data_integrity(df):
    report = {}
    df = df.sort_index()
    
    dup_idx = df.index.duplicated().sum()
    report['Duplicated_Timestamps'] = dup_idx
    
    stale_data_mask = (df.diff().abs().rolling(window=2).sum() == 0)
    stale_counts = stale_data_mask.sum()
    report['Stale_Data_Points'] = stale_counts[stale_counts > 0].to_dict()
    
    positive_patterns = ['INDPRO', 'PAPR', 'TOTALSA', 'RGDP', 'CPI']
    neg_violations = {}
    
    for col in df.columns:
        if "Crude_Oil_WTI" in col:
            continue
            
        if any(p in col for p in positive_patterns):
            neg_count = (df[col] < 0).sum()
            if neg_count > 0:
                neg_violations[col] = neg_count
                
    report['Negative_Value_Violations'] = neg_violations
    return report, stale_data_mask

integrity_report, stale_mask = audit_data_integrity(df_master)

df_master = df_master[~df_master.index.duplicated(keep='last')].sort_index()

for col, count in integrity_report['Stale_Data_Points'].items():
    df_master.loc[stale_mask[col], col] = np.nan

for key, val in integrity_report.items():
    print(f"{key}: {val}")

cols_with_stale = [col for col in df_master.columns if stale_mask[col].any()]
if cols_with_stale:
    fig, ax = plt.subplots(figsize=(14, 4))
    sns.heatmap(stale_mask[cols_with_stale].T, cmap='YlOrRd', cbar=False, ax=ax)
    ax.set_title("Figure 5: Heatmap of Stale Data Points (Sinalized for Imputation)")
    plt.show()

Duplicated_Timestamps: 0
Stale_Data_Points: {}
Negative_Value_Violations: {}

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import matplotlib.gridspec as gridspec
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
import warnings

warnings.filterwarnings('ignore')

fig = plt.figure(figsize=(16, 7))
gs = gridspec.GridSpec(1, 2, width_ratios=[1.2, 1])

ax0 = plt.subplot(gs[0])
msno.bar(df_master, ax=ax0, color="slategray", fontsize=10, labels=True)
ax0.set_title("A. Feature Completeness Analysis (%)", loc='left', fontsize=14, fontweight='bold')


ax1 = plt.subplot(gs[1])
msno.dendrogram(df_master, ax=ax1, orientation='right', fontsize=10)
ax1.set_title("B. Nullity Hierarchy (Correlation Clusters)", loc='left', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

def impute_financial_data(df):
    df_imputed = df.copy()
    
    df_imputed = df_imputed.interpolate(method='pchip', limit=2, limit_direction='both')
    
    if df_imputed.isna().sum().sum() > 0:
        imputer = IterativeImputer(
            estimator=BayesianRidge(),
            max_iter=30, 
            random_state=42,
            tol=1e-3
        )
        
        time_idx = np.arange(len(df_imputed)).reshape(-1, 1)
        imputation_matrix = np.hstack([time_idx, df_imputed.values])
        
        imputed_array = imputer.fit_transform(imputation_matrix)
        df_imputed = pd.DataFrame(imputed_array[:, 1:], index=df.index, columns=df.columns)
        
        pos_cols = [c for c in df_imputed.columns if any(sub in c for sub in ['Production', 'Capacity', 'Sales', 'GDP'])]
        df_imputed[pos_cols] = df_imputed[pos_cols].clip(lower=0)
        
    return df_imputed

df_clean_final = impute_financial_data(df_master)

sample_col = 'OPEC_Spare_Capacity'

if sample_col in df_master.columns:
    fig, ax = plt.subplots(figsize=(14, 5))
    
    ax.plot(df_master.index, df_master[sample_col], color='steelblue', 
            linewidth=2.5, label='Original Observed Data', alpha=0.5)
    
    ax.plot(df_clean_final.index, df_clean_final[sample_col], color='crimson', 
            linestyle='--', linewidth=1.5, label='Final Imputed Trajectory (MICE/PCHIP)')
    
    null_mask = df_master[sample_col].isna()
    if null_mask.any():
        ax.scatter(df_clean_final.index[null_mask], df_clean_final.loc[null_mask, sample_col], 
                   color='gold', zorder=5, label='Imputed Points (Validation)', s=40)

    ax.set_title(f"Figure 7: Imputation Integrity Check - {sample_col}", fontsize=14, fontweight='bold')
    ax.set_ylabel("Standardized Value")
    ax.legend(frameon=True, loc='best')
    plt.grid(True, alpha=0.3)
    plt.show()

print(f"Post-Imputation Audit: {df_clean_final.isna().sum().sum()} missing values remaining.")

Post-Imputation Audit: 0 missing values remaining.

def sterilize_dataset(df: pd.DataFrame, target_var: str = 'Crude_Oil_WTI') -> pd.DataFrame:
    """
    Consolidates the data sterilization protocol combining structural integrity checks, 
    hybrid Bayesian imputation, and robust outlier winsorization. 
    Optimized for Markovian and Bayesian Network econometric modeling.
    """
    df_clean = df.copy()
    
    df_clean = df_clean[~df_clean.index.duplicated(keep='last')]
    
    stale_mask = df_clean.rolling(window=3).std() == 0
    df_clean[stale_mask] = np.nan
    
    strictly_positive_vars = ['INDPRO', 'OPEC_Production', 'NonOPEC_Production', 'OECD_GDP', 'US_Vehicle_Sales']
    for col in df_clean.columns:
        if any(var in col for var in strictly_positive_vars):
            df_clean.loc[df_clean[col] < 0, col] = np.nan
            
    predictors = [col for col in df_clean.columns if col != target_var]
    for col in predictors:
        rolling_median = df_clean[col].rolling(window=12, center=True, min_periods=1).median()
        rolling_mad = df_clean[col].rolling(window=12, center=True, min_periods=1).apply(
            lambda x: median_abs_deviation(x[~np.isnan(x)], scale='normal')
        )
        
        epsilon = 1e-8
        upper_bound = rolling_median + (5 * (rolling_mad + epsilon))
        lower_bound = rolling_median - (5 * (rolling_mad + epsilon))
        
        df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
        df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])


    df_clean = df_clean.interpolate(method='pchip', limit=2, limit_direction='forward')
    
    if df_clean.isna().sum().sum() > 0:
        imputer = IterativeImputer(
            estimator=BayesianRidge(), 
            max_iter=20, 
            random_state=42,
            tol=1e-3
        )
        
        time_idx = np.arange(len(df_clean)).reshape(-1, 1)
        imp_matrix = np.hstack([time_idx, df_clean.values])
        
        imputed_vals = imputer.fit_transform(imp_matrix)
        df_clean = pd.DataFrame(imputed_vals[:, 1:], index=df_clean.index, columns=df_clean.columns)

    return df_clean

try:
    df_master = pd.concat([df_student_A, df_student_B, df_student_C], axis=1)
    
    df_sterilized = sterilize_dataset(df_master, target_var='Crude_Oil_WTI')
    
    print("--- STERILIZATION PIPELINE COMPLETED ---")
    print(f"Original Dimensions:   {df_master.shape}")
    print(f"Sterilized Dimensions: {df_sterilized.shape}")
    print(f"Remaining NaNs:        {df_sterilized.isna().sum().sum()}\n")
    
    print("--- STERILIZED DATASET PREVIEW (HEAD & TAIL) ---")
    df_preview = pd.concat([df_sterilized.head(5), df_sterilized.tail(5)])
    
    display(df_preview)
    
except NameError:
    print("Error: df_student_A, df_student_B, or df_student_C not found in the current namespace.")

--- STERILIZATION PIPELINE COMPLETED ---
Original Dimensions:   (108, 16)
Sterilized Dimensions: (108, 16)
Remaining NaNs:        0

--- STERILIZED DATASET PREVIEW (HEAD & TAIL) ---

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler

plt.style.use('seaborn-v0_8-paper')
sns.set_theme(style="ticks", context="paper", font_scale=1.2)
plt.rcParams.update({
    'figure.dpi': 300,         
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'lines.linewidth': 1.5
})

scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_clean_final), 
                         index=df_clean_final.index, 
                         columns=df_clean_final.columns)

def plot_student_A_distributions(df, df_scaled, target_var='Crude_Oil_WTI'):
    fig = plt.figure(figsize=(16, 12))
    
    ax1 = fig.add_subplot(2, 2, 1)
    target_data = df[target_var].dropna()
    sns.histplot(target_data, stat="density", bins=30, color="steelblue", alpha=0.6, ax=ax1)
    sns.kdeplot(target_data, color="darkblue", linewidth=2, label="Empirical KDE", ax=ax1)
    
    mu, std = stats.norm.fit(target_data)
    xmin, xmax = ax1.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    p = stats.norm.pdf(x, mu, std)
    ax1.plot(x, p, 'k--', linewidth=2, label=f'Normal Fit ($\mu$={mu:.2f}, $\sigma$={std:.2f})')
    ax1.set_title(f"Figure 8a: Empirical vs. Normal Distribution - {target_var}")
    ax1.legend()
    
    ax2 = fig.add_subplot(2, 2, 2)
    n = len(target_data)
    sorted_data = np.sort(target_data)
    z_empirical = (sorted_data - mu) / std
    p_vals = (np.arange(1, n + 1) - 0.5) / n
    q_theoretical = stats.norm.ppf(p_vals)
    deviation = z_empirical - q_theoretical
    se = np.sqrt((p_vals * (1 - p_vals)) / (n * (stats.norm.pdf(q_theoretical)**2)))
    ci_upper = 1.96 * se
    ci_lower = -1.96 * se
    
    ax2.scatter(q_theoretical, deviation, color='steelblue', s=15, alpha=0.8, label='Empirical Deviation')
    ax2.plot(q_theoretical, ci_upper, color='crimson', linestyle='--', linewidth=1.5, label='95% CI Band')
    ax2.plot(q_theoretical, ci_lower, color='crimson', linestyle='--', linewidth=1.5)
    ax2.axhline(0, color='black', linewidth=1)
    ax2.set_title(f"Figure 8b: Worm Plot (Detrended Q-Q) - {target_var}")
    ax2.set_xlabel("Theoretical Quantiles")
    ax2.set_ylabel("Deviation")
    ax2.legend(loc='best')
    
    ax3 = fig.add_subplot(2, 1, 2)
    key_vars = ['INDPRO', 'GEPUCURRENT', 'OPEC_Spare_Capacity', 'XOM', 'Volatility_Index_VIX']
    available_vars = [v for v in key_vars if v in df_scaled.columns]
    
    if available_vars:
        df_melted = df_scaled[available_vars].melt(var_name='Variable', value_name='Standardized Value')
        sns.violinplot(x='Standardized Value', y='Variable', data=df_melted, 
                       palette="muted", inner="quartile", ax=ax3, scale="width")
        ax3.set_title("Figure 8c: Cross-Sectional Distribution Density (Standardized)")
        ax3.set_xlabel("Z-Score")
        ax3.set_ylabel("")
        
    sns.despine()
    plt.tight_layout()
    plt.show()

plot_student_A_distributions(df_clean_final, df_scaled)

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def plot_student_B_timeseries(df, df_scaled, target_var='Crude_Oil_WTI'):
    fig = plt.figure(figsize=(16, 14))
    
    ax1 = fig.add_subplot(3, 1, 1)
    key_drivers = [col for col in ['US_Ind_Production', 'OPEC_Production', 'SP500_Index', target_var] if col in df_scaled.columns]
    for col in key_drivers:
        linewidth = 2.5 if col == target_var else 1.2
        alpha = 1.0 if col == target_var else 0.6
        ax1.plot(df_scaled.index, df_scaled[col], label=col, linewidth=linewidth, alpha=alpha)
    
    ax1.axvspan(pd.to_datetime('2020-02-01'), pd.to_datetime('2020-05-01'), color='crimson', alpha=0.15, label='COVID-19 Shock')
    ax1.set_title("Figure 9a: Synchronized Regime Shifts - Standardized Macro-Financial Dynamics")
    ax1.set_ylabel("Z-Score")
    ax1.legend(loc='upper left', ncol=5, frameon=False)
    
    ax2 = fig.add_subplot(3, 1, 2)
    ax2.plot(df.index, df[target_var], color='black', label=f'{target_var} (Spot)', linewidth=1.5)
    ax2.plot(df.index, df[target_var].rolling(6).mean(), color='darkorange', linestyle='--', label='6-Month SMA')
    ax2.plot(df.index, df[target_var].rolling(12).mean(), color='steelblue', linestyle='-.', label='12-Month SMA')
    ax2.set_title("Figure 9b: Long-Term Trend Extraction and Moving Averages")
    ax2.set_ylabel("USD / bbl")
    ax2.legend(loc='upper left')
    
    ax3 = fig.add_subplot(3, 1, 3)
    log_returns = np.log(df[target_var] / df[target_var].shift(1))
    rolling_vol = log_returns.rolling(window=3).std() * np.sqrt(12) 
    
    ax3.plot(rolling_vol.index, rolling_vol, color='firebrick', linewidth=1.5)
    ax3.fill_between(rolling_vol.index, rolling_vol, color='firebrick', alpha=0.2)
    ax3.set_title("Figure 9c: Volatility Clustering (Annualized 3-Month Rolling Std Dev)")
    ax3.set_ylabel("Annualized Volatility")
    ax3.set_xlabel("Date")
    
    sns.despine()
    plt.tight_layout()
    plt.show()

plot_student_B_timeseries(df_clean_final, df_scaled)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import median_abs_deviation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

try:
    df_master = pd.concat([df_student_A, df_student_B, df_student_C], axis=1)
    
    df_sterilized = sterilize_dataset(df_master, target_var='Crude_Oil_WTI')
    
    print("--- STERILIZATION PIPELINE COMPLETED ---")
except NameError as e:
    print(f"Erro de Definição: {e}. Verifique se df_student_A, B e C foram criados.")

def plot_student_C_multivariate(df, target_var='Crude_Oil_WTI'):
    corr_matrix = df.corr(method='spearman')
    
    clustermap = sns.clustermap(corr_matrix, cmap='coolwarm', center=0, 
                                annot=True, fmt=".2f", annot_kws={"size": 7},
                                linewidths=0.5, figsize=(12, 10), 
                                cbar_kws={'label': 'Spearman Correlation'})
    
    plt.setp(clustermap.ax_heatmap.get_xticklabels(), rotation=45, ha='right')
    clustermap.fig.suptitle("Figure 10a: Hierarchical Clustered Correlation Matrix (Spearman Rank)", y=1.02, fontweight='bold')
    plt.show()
    
    usd_col = [col for col in df.columns if 'USD' in col]
    if usd_col:
        usd_col = usd_col[0]
        fig, ax = plt.subplots(figsize=(14, 4))
        
        rolling_rank_target = df[target_var].rolling(12).rank()
        rolling_rank_usd = df[usd_col].rolling(12).rank()
        rolling_corr = rolling_rank_target.rolling(12).corr(rolling_rank_usd)
        
        ax.plot(rolling_corr.index, rolling_corr, color='purple', linewidth=2)
        ax.axhline(0, color='black', linestyle='--', linewidth=1)
        ax.fill_between(rolling_corr.index, 0, rolling_corr, where=(rolling_corr>0), color='green', alpha=0.3)
        ax.fill_between(rolling_corr.index, 0, rolling_corr, where=(rolling_corr<0), color='red', alpha=0.3)
        
        ax.set_title(f"Figure 10b: Dynamic Regime Shift - 12-Month Rolling Spearman Correlation")
        ax.set_ylabel("Spearman $\\rho$")
        sns.despine()
        plt.tight_layout()
        plt.show()

    top_corr_var = corr_matrix[target_var].drop(target_var).abs().idxmax()
    joint = sns.jointplot(x=top_corr_var, y=target_var, data=df, 
                          kind="kde", fill=True, cmap="mako", height=7, space=0)
    joint.plot_marginals(sns.histplot, color="darkblue", bins=25)
    joint.fig.suptitle(f"Figure 10c: Bivariate Joint Probability Density ({target_var} vs {top_corr_var})", y=1.02, fontweight='bold')
    plt.show()

plot_student_C_multivariate(df_sterilized)

--- STERILIZATION PIPELINE COMPLETED ---

from pgmpy.estimators import HillClimbSearch, MaximumLikelihoodEstimator, BIC
from pgmpy.models import DiscreteBayesianNetwork
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
df_discrete = pd.DataFrame(
    discretizer.fit_transform(df_clean_final),
    columns=df_clean_final.columns,
    index=df_clean_final.index
).astype(int)

hc_search = HillClimbSearch(df_discrete)
scoring_method = BIC(df_discrete)
best_model_structure = hc_search.estimate(scoring_method=scoring_method, show_progress=False)

causal_dag = DiscreteBayesianNetwork(best_model_structure.edges())
causal_dag.fit(df_discrete, estimator=MaximumLikelihoodEstimator)

fig, ax = plt.subplots(figsize=(14, 10))
G = nx.DiGraph(causal_dag.edges())
pos = nx.spring_layout(G, k=1.2, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=3000, node_color='#4682B4', alpha=0.8, ax=ax)
nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, arrowsize=20, width=1.5, ax=ax)
nx.draw_networkx_labels(G, pos, font_size=9, font_weight='bold', ax=ax)
ax.set_title("Figure 11: Discovered Causal Structure (DAG) for Global Oil Market", fontsize=15)
plt.axis('off')
plt.show()

from pgmpy.estimators import HillClimbSearch, MaximumLikelihoodEstimator
from pgmpy.models import DiscreteBayesianNetwork
from sklearn.preprocessing import KBinsDiscretizer
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')

df_discrete = pd.DataFrame(
    discretizer.fit_transform(df_clean_final), 
    columns=df_clean_final.columns, 
    index=df_clean_final.index
).astype(int)

hc_search = HillClimbSearch(df_discrete)
best_model_structure = hc_search.estimate(scoring_method='bic-g', show_progress=False)

causal_dag = DiscreteBayesianNetwork(best_model_structure.edges())
causal_dag.fit(df_discrete, estimator=MaximumLikelihoodEstimator)

target_variable = 'Crude_Oil_WTI'

if target_variable in causal_dag.nodes():
    markov_blanket_nodes = causal_dag.get_markov_blanket(target_variable)
    parents = causal_dag.get_parents(target_variable)
    children = causal_dag.get_children(target_variable)
    
    spouses = set()
    for child in children:
        spouses.update(causal_dag.get_parents(child))
    spouses.discard(target_variable)
    spouses = list(spouses)

    plt.style.use('default')
    fig, ax = plt.subplots(figsize=(16, 12), facecolor='white')
    G = nx.DiGraph(causal_dag.edges())
    
    pos = nx.spring_layout(G, k=1.5, seed=42)
    
    node_colors = []
    for node in G.nodes():
        if node == target_variable:
            node_colors.append('#8B0000') 
        elif node in parents:
            node_colors.append('#228B22') 
        elif node in children:
            node_colors.append('#4682B4') 
        elif node in spouses:
            node_colors.append('#DAA520') 
        else:
            node_colors.append('#E0E0E0') 

    nx.draw_networkx_nodes(G, pos, node_size=3500, node_color=node_colors, edgecolors='black', linewidths=1.5, alpha=0.9, ax=ax)
    
    nx.draw_networkx_edges(G, pos, edge_color='lightgray', arrows=True, arrowstyle='-|>', arrowsize=15, width=1.0, alpha=0.5, ax=ax)
    
    mb_family = [target_variable] + parents + children + spouses
    mb_edges = [(u, v) for u, v in G.edges() if u in mb_family and v in mb_family]
    nx.draw_networkx_edges(G, pos, edgelist=mb_edges, edge_color='black', arrows=True, arrowstyle='-|>', arrowsize=25, width=2.5, ax=ax)
    
    nx.draw_networkx_labels(G, pos, font_size=9, font_weight='bold', font_family='sans-serif', ax=ax)
    
    legend_elements = [
        mpatches.Patch(facecolor='#8B0000', edgecolor='black', label='Target Variable (WTI)'),
        mpatches.Patch(facecolor='#228B22', edgecolor='black', label='Parents (Direct Causes)'),
        mpatches.Patch(facecolor='#4682B4', edgecolor='black', label='Children (Direct Effects)'),
        mpatches.Patch(facecolor='#DAA520', edgecolor='black', label='Spouses (Co-parents of Children)'),
        mpatches.Patch(facecolor='#E0E0E0', edgecolor='black', label='Conditionally Independent Nodes')
    ]
    
    ax.legend(handles=legend_elements, loc='upper right', title="Markov Blanket Topology", 
              fontsize=11, title_fontsize=13, frameon=True, borderpad=1.5)
    
    ax.set_title(f"Figure 12: Markov Blanket Isolation for {target_variable}", fontsize=18, fontweight='bold', pad=20)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

from IPython.display import display, Latex

algorithm_latex = r"""
\begin{array}{l}
\hline
\mathbf{Algorithm\ 1:} \text{Stuctural learnning by causal inference} \\
\hline
\mathbf{Input:} \\
\quad \mathcal{D}: \text{Observations Dataset} \\
\quad X = \{X_1, \dots, X_m\}: \text{Set of endogenous and exogenous variables} \\
\quad \alpha: \text{Significance level} \\
\mathbf{Output:} \text{Partially complete Directed Acyclic Graph (CPDAG) } G = \langle V, E \rangle \\
\hline
\mathbf{Phase\ 1:\ Learnning \ Markov\ Blanket} \\
\mathbf{for\ each} \ X_i \in X \ \mathbf{do} \\
\quad \text{Blanket Markov } \mathcal{B}(X_i) \text{ interactively via condictionaly independency tests} \\
\quad \text{(where } \mathcal{B}(X_i) \text{ is the minimal set of parents, child and spouses suchthat } X_i \perp\!\!\!\perp X \setminus (\{X_i\} \cup \mathcal{B}(X_i)) \mid \mathcal{B}(X_i) \text{)} \\
\mathbf{end\ for} \\
\text{Validate neighborhood symmetry: } \mathbf{if} \ X_i \in \mathcal{B}(X_j) \text{ but } X_j \notin \mathcal{B}(X_i), \text{ reconcile} \\
\\
\mathbf{Phase\ 2:\ Neighborhood \ learning} \\
\text{Initialize the undirected arrow graph } X_i - X_j \text{ for all validated pairs in Phase 1} \\
\mathbf{for\ each} \ \text{adjacent pair } (X_i, X_j) \ \mathbf{do} \\
\quad \text{Identify separating edge } S_{X_iX_j} \subseteq \{ \mathcal{B}(X_i) \cup \mathcal{B}(X_j) \} \setminus \{X_i, X_j\} \\
\quad \mathbf{if} \ X_i \perp\!\!\!\perp X_j \mid S_{X_iX_j} \ \text{with p-value } > \alpha \ \mathbf{then} \\
\quad \quad \text{Remove the arrow between } X_i \text{ and } X_j \text{ (we reject the direct causal dependency)} \\
\quad \quad \text{Store } S_{X_iX_j} \text{ (required for orienting colliders in Phase 3)} \\
\quad \mathbf{else} \\
\quad \quad \text{Keep undirected edge } X_i - X_j \\
\quad \mathbf{end\ if} \\
\mathbf{end\ for} \\
\\
\mathbf{Phase\ 3:\ Edge \ orientation\ of\ edges\ (Causal\ attribution)} \\
\text{1. Identify collision structures (v-structures):} \\
\quad \mathbf{for\ each} \ \text{triple } X_i - X_k - X_j \text{ (where } X_i \text{ e } X_j \text{ if are adjacet)} \ \mathbf{do} \\
\quad \quad \mathbf{if} \ X_k \notin S_{X_iX_j} \ \mathbf{then} \text{ orient converging to the collider: } X_i \rightarrow X_k \leftarrow X_j \\
\quad \mathbf{end\ for} \\
\text{2. Propagation of consistency rules (Meek's rules):} \\
\quad \mathbf{repeat\ until} \ \text{structural stability (no new edges can be oriented)} \mathbf{do} \\
\quad \quad \text{R1 (Prevent new colliders): If } X_i \rightarrow X_j - X_k \text{ and } X_i, X_k \text{ are not adjacent } \Rightarrow X_j \rightarrow X_k \\
\quad \quad \text{R2 (Prevent cycles): If } X_i \rightarrow X_k \rightarrow X_j \text{ and } X_i - X_j \Rightarrow X_i \rightarrow X_j \\
\quad \quad \text{R3 (Double V-structure): If } X_i - X_k \rightarrow X_j \text{, } X_i - X_l \rightarrow X_j \text{ and } X_i - X_j \Rightarrow X_i \rightarrow X_j \\
\quad \quad \text{R4 (Propagation): If } X_i - X_k \rightarrow X_l \rightarrow X_j \text{ and } X_i - X_l \text{, } X_i - X_j \Rightarrow X_i \rightarrow X_j \\
\quad \mathbf{end\ repeat} \\
\hline
\end{array}
"""

display(Latex(algorithm_latex))

Category	Variable	Ticker / ID	Source	Frequency	Unity	Transformation
Macro/Geo	Industrial production	INDPRO	FRED	Montly	Índice	% Change
Macro/Geo	CPI Energy	CPIENGSL	FRED	Montly	Índice	% Change
Macro/Geo	Geopolitical uncertainty	GEPUCURRENT	FRED	Montly	Índice	Level
Macro/Geo	OPEC Production	PAPR_OPEC	EIA	Montly	mb/d	Change
Macro/Geo	On-OPEC production	PAPR_NONOPEC	EIA	Montly	mb/d	Change
Macro/Geo	Idle capcity	COPS_OPEC	EIA	Montly	mb/d	Level
Macro/Geo	PIB OCDE	RGDPQ_OECD	EIA	Montly	Billions $	% Change
Micro	Exxon Mobil	XOM	Yahoo Fin.	Montly	USD	Log-Return
Micro	Oil servicess	OIH	Yahoo Fin.	Montly	USD	Log-Return
Micro	Vehicle seles	TOTALSA	FRED	Montly	Millions	% Change
Micro	Delta Airlines	DAL	Yahoo Fin.	Montly	USD	Log-Return
Financeiro	WTI Crude Oil	CL=F	Yahoo Fin.	Montly	USD/bbl	Log-Return
Financeiro	S&P 500	^GSPC	Yahoo Fin.	Montly	Index	Log-Return
Financeiro	Dollar Index	DX-Y.NYB	Yahoo Fin.	Montly	Index	% Change
Financeiro	Gold	GC=F	Yahoo Fin.	Montly	USD/oz	Log-Return
Financeiro	VIX Index	^VIX	Yahoo Fin.	Montly	%	Level
Financeiro	High Yield Bonds	HYG	Yahoo Fin.	Montly	USD	% Change

	US_Ind_Production	US_Energy_CPI	Global_Econ_Policy_Uncert	OPEC_Production	NonOPEC_Production	OPEC_Spare_Capacity	Exxon_Mobil_Corp	Oil_Services_ETF	Delta_Airlines	US_Vehicle_Sales	Crude_Oil_WTI	SP500_Index	USD_Index	Gold	Volatility_Index_VIX	High_Yield_Bonds_HYG
2015-01-31	102.8905	199.926	129.058627	33.053219	62.191976	2.380322	54.180454	547.691101	41.315250	16.910	48.240002	1994.989990	94.800003	1255.190913	20.969999	49.649830
2015-02-28	102.2335	203.021	115.615595	32.978743	62.133020	2.279944	55.287643	575.075623	38.957119	16.891	49.759998	2104.500000	95.250000	1212.599976	13.340000	50.757572
2015-03-31	101.8914	205.757	111.378145	33.770899	62.370155	1.879726	53.077141	556.104431	39.342129	17.896	47.599998	2067.889893	98.360001	1183.099976	15.290000	50.276520
2015-04-30	101.3355	203.577	100.978142	33.961862	62.285572	1.860331	54.557053	644.031799	39.062115	17.693	59.630001	2085.510010	94.599998	1182.400024	14.550000	50.714561
2015-05-31	100.8883	208.973	104.462104	34.198546	62.192864	1.600471	53.642059	604.439758	37.634182	17.929	60.299999	2107.389893	96.910004	1189.400024	13.840000	50.894852
2023-08-31	100.8393	287.310	175.505155	32.015400	69.925825	4.352000	102.095436	323.572021	41.689693	16.071	83.629997	4507.660156	103.620003	1938.199951	13.570000	64.311607
2023-09-30	101.0211	292.623	200.638478	32.695200	70.348026	3.870000	107.962784	327.665009	35.972923	16.209	90.790001	4288.049805	106.169998	1848.099976	17.520000	63.273911
2023-10-31	100.4689	286.765	190.443809	32.664200	70.633616	4.039000	97.192207	308.064331	30.467567	15.738	81.019997	4193.799805	106.660004	1985.199951	18.139999	62.613869
2023-11-30	100.8639	282.146	207.972104	32.768800	71.317001	4.029000	95.199226	296.573669	36.005360	15.782	75.959999	4567.799805	103.500000	2038.099976	12.920000	65.674210
2023-12-31	100.6031	280.763	223.023700	32.758900	71.323795	4.228000	92.641808	297.991425	39.222725	16.321	71.650002	4769.830078	101.330002	2062.399902	12.450000	67.770882

Introduction¶

Step 1)¶

Step 2 a)¶

Step 2 b)¶

Step 2c)¶

Step 3 a)¶

Step 3 b)¶

Step 3 c) ¶

Setp 4¶

Setp 5a)¶

Step 5b)¶

Step 5c)¶

Step 6¶

Step 7 a)¶

Step 7 b)¶

Setp 7 c)¶

Step 8a)¶

Step 8b)¶

Step 8c)¶

Step 8d)¶

Step 9a)¶

Step 9 b)¶

Step 9 c)¶

Conclusions¶

Reference¶

	Variable	Source	Frequency	Start_Date	End_Date	Observations	Mean	Std_Dev
0	Exxon_Mobil_Corp	Yahoo Finance	ME	2015-01-31	2023-12-31	108	59.64	19.15
1	Oil_Services_ETF	Yahoo Finance	ME	2015-01-31	2023-12-31	108	332.68	145.97
2	Delta_Airlines	Yahoo Finance	ME	2015-01-31	2023-12-31	108	41.44	8.00
3	US_Vehicle_Sales	FRED	ME	2015-01-31	2023-12-31	108	16.56	1.77

Introduction¶

Step 1)¶

Step 2 a)¶

Step 2 b)¶

Step 2c)¶

Step 3 a)¶

Step 3 b)¶

**Step 3 c) **¶

Setp 4¶

Setp 5a)¶

Step 5b)¶

Step 5c)¶

Step 6¶

Step 7 a)¶

Step 7 b)¶

Setp 7 c)¶

Step 8a)¶

Step 8b)¶

Step 8c)¶

Step 8d)¶

Step 9a)¶

Step 9 b)¶

Step 9 c)¶

Conclusions¶

Reference¶

Step 3 c) ¶