Introduction¶
The quantitative finance landscape is increasingly defined by high-dimensional datasets, where the number of potential predictive features (such as firm fundamentals, technical indicators, and macroeconomic data) far exceeds the number of observable time periods. Traditional statistical methods, which often rely on assumptions of normality and low dimensionality, struggle in this "large p, small n" paradigm, producing models that are prone to overfitting and poor out-of-sample performance (Fan et al. 291). In this challenging environment, machine learning (ML) offers a robust toolkit for extracting meaningful signals from financial noise, moving beyond simplistic linear assumptions to capture complex, non-linear patterns.
This project, undertaken by Team Alpha, serves as a strategic marketing handbook to demonstrate the value of three cutting-edge ML methodologies for developing robust trading strategies. We have selected one technique from three distinct ML categories to showcase a diverse yet complementary analytical arsenal: Elastic Net regression (Category 1), K-Means clustering (Category 2), and Classification trees (Category 4). Each method addresses a fundamental challenge in quantitative finance.
First, Elastic Net regression combines the variable selection property of LASSO with the regularization stability of Ridge regression. This hybrid approach is particularly effective for factor selection and return prediction in high-dimensional settings where predictors are often highly correlated, a common issue in financial datasets (Zou and Hastie 301). We will illustrate its application in constructing a sparse yet stable predictive model from a large set of asset characteristics.
Second, K-Means clustering provides an unsupervised learning framework for identifying latent structure within markets. By grouping assets into clusters based on their return characteristics or other features, quants can discover distinct market regimes, identify asset universes for pairs trading, or develop diversification strategies that are sensitive to the underlying correlation structure of the market (Kaufman and Rousseeuw 10). This technique moves beyond pre-defined sectors to allow data-driven asset classification.
Finally, Classification trees offer a highly interpretable, non-linear model for categorical outcomes, such as predicting the direction of stock price movements (up or down). By recursively partitioning the feature space based on simple, intuitive rules, classification trees can capture complex interaction effects between predictors without requiring prior specification of the functional form (Breiman et al. 57). This makes them exceptionally useful for creating rule-based trading signals.
The primary objective of this handbook is threefold: (1) To provide a clear, theoretical foundation for each methodology, detailing their advantages, mechanics, and ideal use cases; (2) To offer practical, reproducible computational examples using Jupyter notebooks that demonstrate the implementation and tuning of each model on financial data; and (3) To synthesize these insights into a compelling "Marketing Alpha" section, illustrating how the synergistic application of these techniques can generate a sustainable edge.
Elastic Nets¶
Basics¶
Elastic Net is a regularized linear regression model that combines the penalties of both LASSO (L1 regularization) and Ridge (L2 regularization) regression. It is designed to overcome the limitations of each method when used in isolation (Zou and Hastie 301).
It is a supervised learning algorithm used for regression tasks. It is particularly effective when dealing with datasets where the number of features ($p$) is large compared to the number of observations ($n$), or when features exhibit high multicollinearity (high correlation between independent variables).
In essence, Elastic Net seeks to minimize the following objective function: $$ \hat{\beta} = \arg\min_{\beta} \left( \| \mathbf{y} - \mathbf{X}\beta \|_2^2 + \lambda_2 \| \beta \|_2^2 + \lambda_1 \| \beta \|_1 \right) $$
Where the first term is the standard least squares loss, the second term is the Ridge penalty (squared magnitude of coefficients), and the third term is the LASSO penalty (absolute magnitude of coefficients).
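A minimal sketch of this objective in scikit-learn, which folds the two penalty strengths into a single overall strength (alpha) and a mixing weight (l1_ratio); the toy data here is purely illustrative:

```python
import numpy as np
from sklearn.linear_model import ElasticNet

# Toy data: 50 observations, 4 features, only the first two informative
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 4))
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + rng.normal(scale=0.1, size=50)

# scikit-learn parameterizes the penalties via alpha (overall strength)
# and l1_ratio (the L1/L2 mix), not separate lambda_1 and lambda_2
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)
print(model.coef_)  # coefficients after shrinkage
```

Fitting minimizes exactly the penalized least-squares objective above; the informative features keep large coefficients while the irrelevant ones are shrunk toward zero.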
Advantages¶
The Elastic Net methodology offers several key benefits:
- Handles Multicollinearity: Unlike LASSO, which tends to arbitrarily select one feature from a group of correlated ones and discard the others, the Ridge component of Elastic Net allows correlated features to be grouped together, with their coefficients being shrunk similarly (Zou and Hastie 302).
- Variable Selection and Regularization: It inherits the variable selection property from LASSO, which can drive coefficients of irrelevant features to zero, creating a sparse model. Simultaneously, it inherits the stability and regularization strength of Ridge regression, which prevents overfitting by shrinking the coefficients.
- Superior Performance in "Large p, Small n" Scenarios: In high-dimensional data settings (common in quantitative finance, e.g., predicting returns with hundreds of factors), vanilla linear regression fails. Elastic Net is explicitly designed to perform well in these scenarios, making it a robust tool for financial modeling.
- Mitigates LASSO's Limitations: In cases where the number of predictors ($p$) is greater than the number of observations ($n$), LASSO can select at most $n$ variables before it saturates. It can also behave erratically with highly correlated features. Elastic Net alleviates these issues, providing a more stable and comprehensive solution (Zou and Hastie 301-302).
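The saturation claim above can be checked directly on synthetic data; the dimensions below (15 observations, 60 candidate features) are chosen only to put us in the $p > n$ regime:

```python
import numpy as np
from sklearn.linear_model import Lasso, ElasticNet

# 15 observations, 60 candidate features: the "p > n" regime
rng = np.random.default_rng(1)
n, p = 15, 60
X = rng.normal(size=(n, p))
y = X[:, :30].sum(axis=1) + rng.normal(scale=0.1, size=n)

lasso = Lasso(alpha=0.5, max_iter=50_000).fit(X, y)
enet = ElasticNet(alpha=0.5, l1_ratio=0.5, max_iter=50_000).fit(X, y)

# LASSO can retain at most n non-zero coefficients before it saturates;
# the Ridge component lets Elastic Net spread weight over correlated groups
print("LASSO non-zeros:", (lasso.coef_ != 0).sum())
print("Elastic Net non-zeros:", (enet.coef_ != 0).sum())
```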
Computation¶
This section demonstrates the implementation of Elastic Net regression on a synthetic financial dataset. We will:
- Generate sample data simulating asset returns based on factors.
- Split the data into training and testing sets.
- Train an Elastic Net model.
- Tune the hyperparameters using GridSearchCV.
- Evaluate the model's performance and examine the coefficients.
Disadvantages¶
Despite its strengths, the Elastic Net approach has some known difficulties and issues:
- Computational Cost: The addition of a second hyperparameter (compared to a single one in LASSO or Ridge) increases the computational complexity for model tuning. Finding the optimal combination of the L1 and L2 penalty strengths requires a more exhaustive search (e.g., a two-dimensional grid search), which can be computationally expensive for very large datasets.
- Hyperparameter Sensitivity: The model's performance is highly sensitive to the values of the two tuning parameters, alpha ($\lambda$) and l1_ratio. Suboptimal choices can lead to either under-regularization (model overfits) or over-regularization (model underfits), making the tuning process crucial yet non-trivial.
- Interpretation Compromise: While it performs variable selection like LASSO, the resulting model is not as straightforward to interpret as a pure LASSO model. Because the Ridge penalty shrinks but rarely zeroes out coefficients, the final set of features may include more variables than a LASSO model would, slightly reducing sparsity and interpretability.
- Not a Cure-All: It is still a linear model. It inherits all the limitations of linear regression, such as the inability to capture complex non-linear relationships or interactions between features without manual feature engineering.
Equations¶
The Elastic Net model is formulated as an optimization problem where the goal is to find the coefficients $\beta$ that minimize a penalized residual sum of squares. The objective function can be broken down into three distinct components:
Objective Function:¶
$$ \hat{\beta} = \arg\min_{\beta} \left( \text{Loss Function} + \text{Penalty Term} \right) $$
1. Loss Function (Mean Squared Error):¶
This is the standard ordinary least squares (OLS) loss, which measures how well the model fits the data. $$ \text{MSE} = \frac{1}{2N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2 = \frac{1}{2N} \| \mathbf{y} - \mathbf{X}\beta \|_2^2 $$ where $\hat{y}_i = \beta_0 + \sum_{j=1}^{p} x_{ij}\beta_j$ is the predicted value.
2. Combined Penalty Term (Regularization):¶
This is the key innovation of Elastic Net: a convex combination of the L1 (LASSO) and L2 (Ridge) penalties. $$ \text{Penalty} = \lambda \left( \rho \| \beta \|_1 + \frac{(1 - \rho)}{2} \| \beta \|_2^2 \right), $$ where $\| \beta \|_1 = \sum_{j=1}^{p} |\beta_j|$ is the L1 norm (promotes sparsity); $\| \beta \|_2^2 = \sum_{j=1}^{p} \beta_j^2$ is the squared L2 norm (promotes shrinkage and handles correlation); $\lambda \geq 0$ is the overall regularization strength; and $\rho$, with $0 \leq \rho \leq 1$, is the mixing parameter, often called l1_ratio. Combining these parts gives the full Elastic Net objective function: $$ \hat{\beta} = \arg\min_{\beta} \left\{ \frac{1}{2N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2 + \lambda \left( \rho \sum_{j=1}^{p} |\beta_j| + \frac{(1 - \rho)}{2} \sum_{j=1}^{p} \beta_j^2 \right) \right\} $$
Hyperparameters:
- $\lambda$ (lambda): Controls the overall strength of regularization. As $\lambda$ increases, all coefficients are shrunk more aggressively towards zero.
- $\rho$ (l1_ratio): Controls the blend of penalties.
- $\rho = 1$: Pure LASSO penalty (L1 only).
- $\rho = 0$: Pure Ridge penalty (L2 only).
- $0 < \rho < 1$: Elastic Net mix.
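A quick sanity check of the $\rho = 1$ case, assuming scikit-learn's parameterization: an Elastic Net with l1_ratio=1 should reproduce a pure LASSO fit at the same alpha.

```python
import numpy as np
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=10, n_informative=3,
                       noise=0.5, random_state=0)

# rho = 1 collapses the Elastic Net penalty to the pure LASSO penalty,
# so the two estimators should agree (up to solver tolerance)
enet_as_lasso = ElasticNet(alpha=0.5, l1_ratio=1.0).fit(X, y)
lasso = Lasso(alpha=0.5).fit(X, y)
print(np.allclose(enet_as_lasso.coef_, lasso.coef_, atol=1e-6))
```

(Note that scikit-learn warns against l1_ratio=0 with the ElasticNet class; use Ridge directly for the pure L2 case.)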
Features¶
The Elastic Net regression model possesses several key characteristics that define its behavior and suitability for various problems:
Linear Modeling: It is fundamentally a linear model, assuming a linear relationship between the independent variables (features) and the dependent variable (target).
Hybrid Regularization: Its core feature is the combination of L1 (LASSO) and L2 (Ridge) regularization penalties. This allows it to perform both variable selection and handle multicollinearity simultaneously (Zou and Hastie 301).
Sparsity and Grouping Effect: The L1 penalty promotes sparsity by driving the coefficients of irrelevant features to exactly zero. The L2 penalty provides a "grouping effect," where strongly correlated features tend to have similar coefficient values, rather than one being arbitrarily dropped (Hastie, Tibshirani, and Wainwright 54).
Handling of Multicollinearity: Unlike LASSO, which can behave unpredictably with highly correlated features, Elastic Net is robust to multicollinearity, making it stable for financial datasets where factors are often correlated (e.g., different momentum indicators).
Performance in High-Dimensional Settings: It is particularly well-suited for datasets where the number of features ($p$) is large relative to the number of observations ($n$), a common scenario in quantitative finance ("large p, small n" problem) (Fan, Lv, and Qi 82).
Requires Feature Scaling: The model is not scale-invariant. It is crucial to standardize (zero mean, unit variance) or normalize features before training; otherwise, the regularization penalty will be unfairly applied to features on larger scales.
No Native Handling of Missing Values: Elastic Net models, as implemented in standard libraries like scikit-learn, cannot handle missing values directly. Missing data must be imputed (e.g., using mean, median, or more sophisticated methods) prior to model training (Zou and Hastie 307).
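Because imputation and standardization must happen before fitting, wrapping them in a pipeline keeps the preprocessing consistent between training and prediction. A sketch, with purely hypothetical column names and scales:

```python
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet

# Hypothetical factor matrix with missing entries and mixed scales
rng = np.random.default_rng(7)
X = pd.DataFrame({
    "pe_ratio": rng.normal(15, 5, 100),     # scale ~15
    "momentum": rng.normal(0, 0.05, 100),   # scale ~0.05
    "volatility": rng.normal(0.2, 0.05, 100),
})
X.iloc[::10, 0] = np.nan                    # inject missing values
y = 0.5 * X["momentum"] + rng.normal(0, 0.01, 100)

# Impute, then standardize, then fit, all inside one pipeline so the
# same transformations are reapplied automatically at prediction time
model = make_pipeline(SimpleImputer(strategy="median"),
                      StandardScaler(),
                      ElasticNet(alpha=0.01, l1_ratio=0.5))
model.fit(X, y)
preds = model.predict(X.head())
print(preds)
```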
Guide¶
This section provides a clear list of the required inputs to operate an Elastic Net model and the resulting outputs it generates.
Inputs (What You Need to Provide):¶
1. Training Dataset:
- $X$ (Features): A 2D array (NumPy array or Pandas DataFrame) of shape (n_samples, n_features). This is your matrix of predictor variables (Gu, Kelly, and Xiu 5).
- Example: A dataframe where each row is a stock and each column is a financial factor (e.g., P/E ratio, volatility, momentum).
- $y$ (Target): A 1D array of shape (n_samples,) containing the continuous outcome variable (Giannone, Lenza, and Primiceri 15).
- Example: The subsequent return for each stock in the $X$ matrix.
2. Preprocessed Data: The input data must be preprocessed; the model will not run correctly otherwise.
- No Missing Values: The dataset must have all missing values imputed (filled in) beforehand (Zou and Hastie 307).
- Standardized Features: All features must be standardized (scaled to have a mean of 0 and a standard deviation of 1). This is critical because the regularization penalty is applied equally only if all features are on the same scale (Hastie et al. 9).
3. Hyperparameters: These are the model's configuration settings, which must be set before training and typically require tuning.
- alpha ($\lambda$): The overall regularization strength. alpha=0 equates to ordinary least squares regression.
- l1_ratio $(\rho)$: The mix between L1 and L2 penalty. l1_ratio=1 is pure LASSO, l1_ratio=0 is pure Ridge, and values between 0 and 1 create an Elastic Net.
Outputs (What the Model Produces):¶
Fitted Model: The primary output is a trained ElasticNet object that encapsulates the learned relationship from the data and can be used to make predictions.
Coefficients (model.coef_): A 1D array of length n_features representing the estimated coefficient for each feature. This output is key for interpretation, as it shows the magnitude and direction of each feature's influence on the target after the regularization penalty has been applied. Many coefficients will be shrunk to zero (Zou and Hastie 302).
Intercept (model.intercept_): A scalar value representing the model's intercept term.
Predictions: An array of predicted values for the target variable when the predict() method is called on new input data (X_new).
Performance Metrics: After evaluation, metrics that quantify the model's quality.
- Mean Squared Error (MSE): The average squared difference between predicted and actual values.
- R-squared ($R²$): The proportion of variance in the target variable that is predictable from the features (James et al. 21).
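A compact sketch of retrieving these outputs from a fitted model, using synthetic data:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

X, y = make_regression(n_samples=120, n_features=8, n_informative=3,
                       noise=1.0, random_state=3)
model = ElasticNet(alpha=0.05, l1_ratio=0.5).fit(X, y)

print(model.coef_)        # one coefficient per feature (some shrunk to zero)
print(model.intercept_)   # scalar intercept term
preds = model.predict(X)  # predictions for input data
print(mean_squared_error(y, preds), r2_score(y, preds))
```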
Hyperparameters¶
Elastic Net regression has two key hyperparameters that must be carefully tuned, typically through cross-validation, to achieve optimal model performance. These parameters control the nature and strength of the regularization applied.
- alpha ($\lambda$ - Regularization Strength)
This hyperparameter controls the overall magnitude of the regularization penalty applied to the coefficients. It is the multiplier in front of the penalty term in the loss function.
A higher alpha value increases the amount of shrinkage, forcing coefficients toward zero more aggressively. This reduces model variance but increases bias.
- alpha = 0: No regularization (equivalent to Ordinary Least Squares regression).
- alpha -> $\infty$: All coefficients are shrunk to zero.
- Tuning: A grid search over a logarithmic scale (e.g., [0.001, 0.01, 0.1, 1.0, 10.0]) is standard practice.
- l1_ratio ($\rho$ - Mixing Parameter)
This hyperparameter determines the mix between the L1 (LASSO) and L2 (Ridge) penalties. It controls the type of regularization.
It defines the convex combination of the two penalties.
- l1_ratio = 1: Pure LASSO regression (only L1 penalty). Promotes sparsity.
- l1_ratio = 0: Pure Ridge regression (only L2 penalty). Promotes shrinkage of correlated features.
- 0 < l1_ratio < 1: Elastic Net mix. Balances variable selection and handling of multicollinearity.
- Tuning: A grid search over values between 0 and 1 (e.g., [0, 0.2, 0.5, 0.8, 1]) is used to find the optimal blend.
The optimal combination of alpha and l1_ratio is data-dependent and must be found empirically through techniques like GridSearchCV.
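As an alternative to a full two-dimensional GridSearchCV, scikit-learn's ElasticNetCV tunes both parameters by cross-validating along a regularization path for each candidate l1_ratio; a sketch on synthetic data:

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNetCV

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       noise=0.3, random_state=42)

# ElasticNetCV builds its own alpha grid for each candidate l1_ratio,
# which is usually cheaper than an exhaustive two-dimensional search
cv_model = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9, 1.0],
                        n_alphas=50, cv=5, max_iter=10_000)
cv_model.fit(X, y)
print("Best alpha:", cv_model.alpha_)
print("Best l1_ratio:", cv_model.l1_ratio_)
```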
Illustration¶
The following diagram provides an intuitive visual representation of how Elastic Net combines the properties of both LASSO (L1) and Ridge (L2) regularization:
import requests
from IPython.display import Image, display
file_id = "1dQrikVLe07X0KWtBaAnvhYD_t7DK-hav"
url = f"https://drive.google.com/uc?export=download&id={file_id}"
resp = requests.get(url, allow_redirects=True, timeout=15)
resp.raise_for_status()
display(Image(data=resp.content, width=600))
print("Figure 1: Graphical illustration of Elastic Net regularization combining L1 and L2 penalty terms\n Arora (2023)")
Figure 1: Graphical illustration of Elastic Net regularization combining L1 and L2 penalty terms Arora (2023)
This illustration demonstrates the key conceptual framework of Elastic Net:
L1 Norm (LASSO): Promotes sparsity by driving coefficients to exactly zero, effectively performing variable selection
L2 Norm (Ridge): Encourages weight sharing among correlated variables, shrinking coefficients proportionally
L1 + L2 Norm (Elastic Net): Creates a compromise that balances both sparsity and grouping effects, requiring the tuning of two parameters ($\lambda$ and $\rho$) to optimize this balance
The visual emphasizes that Elastic Net occupies a middle ground between the extreme sparsity of LASSO and the pure shrinkage of Ridge regression, making it particularly suitable for financial datasets where both variable selection and handling of multicollinearity are important considerations. (Arora, 2023)
Journal¶
Elastic Net regression has been successfully applied in finance to address high-dimensional prediction problems, particularly in empirical asset pricing. The following journal article provides a prominent example of its use:
Reference:¶
Gu, Shihao, Bryan Kelly, and Dacheng Xiu. "Empirical Asset Pricing via Machine Learning." The Review of Financial Studies, vol. 33, no. 5, May 2020, pp. 2223–2273. DOI.org (Crossref), https://doi.org/10.1093/rfs/hhaa009.
Application¶
In this foundational paper, Gu, Kelly, and Xiu conduct a comprehensive analysis of machine learning methods for predicting stock returns. They utilize a vast set of firm characteristics, a classic "large p, small n" problem where the number of potential predictors (features) is large relative to the number of observations (stocks/time periods).
The authors employ Elastic Net as one of their key modeling techniques. Its primary role is to perform robust variable selection and regularization from the high-dimensional set of candidate factors. The Elastic Net model helps to identify a sparse subset of characteristics that provide persistent predictive power for returns, while its inherent regularization mitigates overfitting. The study demonstrates that Elastic Net, along with other Machine Learning methods, significantly outperforms traditional linear models in this context, highlighting its practical value for building sophisticated trading strategies and enhancing our understanding of return predictors.
This application directly aligns with the scenario presented in this project, where a "team of alpha quants" must sift through numerous factors to find a viable trading strategy.
Keywords:¶
Regularized Regression, L1-and-L2-Penalty, Variable-Selection, High-Dimensional-Data, Multicollinearity, Linear-Model, Supervised-Learning, Feature-Shrinkage.
References¶
Fan, Jianqing, Jinchi Lv, and Lei Qi. "Sparse High-Dimensional Models in Economics." Annual Review of Economics, vol. 3, 2011, pp. 291-317. DOI.org (Crossref), https://doi.org/10.1146/annurev-economics-061109-080451.
Giannone, Domenico, Michele Lenza, and Giorgio E. Primiceri. "Economic Predictions with Big Data: The Illusion of Sparsity." Econometrica, vol. 89, no. 5, Sept. 2021, pp. 2409-2437. DOI.org (Crossref), https://doi.org/10.3982/ECTA16935.
Gu, Shihao, Bryan Kelly, and Dacheng Xiu. "Empirical Asset Pricing via Machine Learning." The Review of Financial Studies, vol. 33, no. 5, May 2020, pp. 2223-2273. DOI.org (Crossref), https://doi.org/10.1093/rfs/hhaa009.
Hastie, Trevor, Robert Tibshirani, and Martin Wainwright. Statistical Learning with Sparsity: The Lasso and Generalizations. Chapman and Hall/CRC, 2015.
Hastie, Trevor, et al. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed., Springer, 2009.
James, Gareth, et al. An Introduction to Statistical Learning: with Applications in R. Springer, 2013.
Zou, Hui, and Trevor Hastie. "Regularization and Variable Selection via the Elastic Net." Journal of the Royal Statistical Society: Series B (Statistical Methodology), vol. 67, no. 2, Apr. 2005, pp. 301-20. DOI.org (Crossref), https://doi.org/10.1111/j.1467-9868.2005.00503.x.
Arora, Puneet. "The Most Important Things You Need to Know About Elastic Net." Analytics Arora, 2023, analyticsarora.com/the-most-important-things-you-need-to-know-about-elastic-net/.
# ---------
# Packages
# ---------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
# Set a random seed for reproducibility
np.random.seed(632)
# --------------------------------------------------------------------------------------------------------------------------------
# 1. Generate a Synthetic Financial Dataset
# We generate a dataset with 200 observations and 20 potential factors (features), of which only 5 are truly informative.
# This simulates a common scenario in finance where only a subset of many factors actually predicts returns.
# We also add multicollinearity between some features.
# --------------------------------------------------------------------------------------------------------------------------------
# Generate synthetic data with correlated and noisy features
X, y, true_coefs = make_regression(n_samples=200, n_features=20, n_informative=5,
noise=0.3, coef=True, random_state=42)
# Introduce multicollinearity: Make feature 5 and 15 highly correlated
X[:, 5] = X[:, 15] + np.random.normal(0, 0.05, X.shape[0])
# Create a DataFrame for better visualization
feature_names = [f'Factor_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)
df['Target_Return'] = y
# Display the correlation matrix of the first 8 factors to show multicollinearity
plt.figure(figsize=(10, 8))
sns.heatmap(df.iloc[:, :8].corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of First 8 Factors (Note Correlation between F5 & F15)')
save_path = 'corr_matrix.png'
# plt.savefig(save_path)
plt.show()
# -----------------------------------------------------------------------------------------------------------------
# 2. Preprocess Data and Train-Test Split
# It is crucial to standardize features for regularized regression models so that the penalty is applied uniformly.
# -----------------------------------------------------------------------------------------------------------------
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the features (vital for regularization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ---------------------------------------------------------------------------
# 3. Train and Evaluate a Baseline Elastic Net Model
# We first fit a model with arbitrary hyperparameters to establish a baseline.
# ---------------------------------------------------------------------------
# Initialize the Elastic Net model with some parameters
base_elastic = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
# Fit the model on the training data
base_elastic.fit(X_train_scaled, y_train)
# Make predictions
y_pred_train = base_elastic.predict(X_train_scaled)
y_pred_test = base_elastic.predict(X_test_scaled)
# Calculate performance metrics
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
print("Baseline Elastic Net Performance:")
print(f"Training MSE: {train_mse:.4f}, Training R²: {train_r2:.4f}")
print(f"Testing MSE: {test_mse:.4f}, Testing R²: {test_r2:.4f}")
# --------------------------------------------------------------------------------------------
# 4. Hyperparameter Tuning with GridSearch
# We perform a grid search to find the optimal combination of alpha (lambda) and l1_ratio (rho).
# --------------------------------------------------------------------------------------------
# Define the parameter grid to search
param_grid = {
'alpha': [0.001, 0.01, 0.1, 1.0, 10.0], # Overall regularization strength
'l1_ratio': [0, 0.1, 0.2, 0.5, 0.8, 0.9, 1] # 0=Ridge, 1=Lasso
}
# Initialize the GridSearchCV object
grid_search = GridSearchCV(ElasticNet(random_state=42, max_iter=10000),
param_grid,
cv=5, # 5-fold cross-validation
scoring='neg_mean_squared_error',
n_jobs=-1) # Use all available processors
# Perform the grid search on the training data
grid_search.fit(X_train_scaled, y_train)
# Print the best parameters found
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best Cross-Validation Score (Negative MSE): {grid_search.best_score_:.4f}")
# Get the best model
best_elastic = grid_search.best_estimator_
# -----------------------------------------------------------------------------------
# 5. Evaluate the Tuned Model
# We now evaluate the model with the optimal hyperparameters on the held-out test set.
# -----------------------------------------------------------------------------------
# Use the best model to make predictions
y_pred_test_tuned = best_elastic.predict(X_test_scaled)
# Calculate final performance metrics
test_mse_tuned = mean_squared_error(y_test, y_pred_test_tuned)
test_r2_tuned = r2_score(y_test, y_pred_test_tuned)
print("\nTuned Elastic Net Performance on Test Set:")
print(f"Test MSE: {test_mse_tuned:.4f}")
print(f"Test R²: {test_r2_tuned:.4f}")
# Compare coefficients: True vs. Estimated
results_df = pd.DataFrame({
'True_Coefficients': true_coefs,
'Estimated_Coefficients': best_elastic.coef_
}, index=feature_names)
# Plotting the coefficients for visualization
plt.figure(figsize=(12, 6))
x_index = np.arange(len(true_coefs))
width = 0.35
plt.bar(x_index - width/2, results_df['True_Coefficients'], width, label='True', color='blue', alpha=0.7)
plt.bar(x_index + width/2, results_df['Estimated_Coefficients'], width, label='Elastic Net Estimated', color='red', alpha=0.7)
plt.xlabel('Factors')
plt.ylabel('Coefficient Value')
plt.title('Comparison of True and Elastic Net Estimated Coefficients')
plt.legend()
plt.xticks(x_index, feature_names, rotation=45)
plt.tight_layout()
plt.show()
# Display the coefficient table for inspection
print("\nCoefficient Comparison Table:")
print(results_df.round(4))
Baseline Elastic Net Performance:
Training MSE: 45.7326, Training R²: 0.9978
Testing MSE: 54.6688, Testing R²: 0.9970
Best parameters found: {'alpha': 0.01, 'l1_ratio': 1}
Best Cross-Validation Score (Negative MSE): -0.1111
Tuned Elastic Net Performance on Test Set:
Test MSE: 0.0495
Test R²: 1.0000
Coefficient Comparison Table:
True_Coefficients Estimated_Coefficients
Factor_0 0.0000 0.0135
Factor_1 0.0000 0.0190
Factor_2 0.0000 -0.0000
Factor_3 0.0000 0.0000
Factor_4 48.5018 45.8948
Factor_5 0.0000 -0.0001
Factor_6 0.0000 0.0040
Factor_7 51.8010 50.0381
Factor_8 61.4186 61.7572
Factor_9 0.0000 -0.0000
Factor_10 0.0000 0.0323
Factor_11 0.0000 0.0112
Factor_12 0.0000 0.0000
Factor_13 97.2461 104.0006
Factor_14 0.0000 0.0122
Factor_15 0.0000 -0.0026
Factor_16 0.0000 -0.0060
Factor_17 0.0000 0.0000
Factor_18 8.5403 8.8602
Factor_19 0.0000 -0.0095
# =============================================================================
# FINANCIAL APPLICATION DEMONSTRATION
# =============================================================================
print("\n" + "="*70)
print("FINANCIAL APPLICATION: ELASTIC NET FOR FACTOR INVESTING")
print("="*70)
# -----------------------------------------------------------------------------
# Financial Context and Business Interpretation
# -----------------------------------------------------------------------------
financial_mapping = {
'Factor_4': 'Momentum_1M',
'Factor_7': 'Value_PE_Ratio',
'Factor_8': 'Quality_ROE',
'Factor_13': 'Size_MarketCap',
'Factor_18': 'Volatility_20D',
'Target_Return': 'Next_Month_Excess_Return'
}
true_predictive_factors = [feature_names[i] for i in range(len(true_coefs)) if abs(true_coefs[i]) > 0]
estimated_predictive_factors = [feature_names[i] for i in range(len(best_elastic.coef_))
if abs(best_elastic.coef_[i]) > 0.01]
print("\nBUSINESS INTERPRETATION:")
print("-" * 50)
print("SCENARIO: Predicting excess returns using 20 potential factors")
print("CHALLENGE: Only 5 factors are truly predictive (simulated reality)")
print("SOLUTION: Elastic Net for robust factor selection and regularization")
print(f"\nRESULTS:")
print(f"• Predictive Accuracy: R² = {test_r2_tuned:.1%} out-of-sample")
print(f"• Factor Selection: Identified {len(estimated_predictive_factors)}/{len(feature_names)} factors as meaningful")
print(f"• True Positives: {len(set(true_predictive_factors) & set(estimated_predictive_factors))}/5 true factors correctly identified")
print(f"• Noise Reduction: {len(feature_names) - len(estimated_predictive_factors)} spurious factors eliminated")
# -----------------------------------------------------------------------------
# Financial Performance Metrics
# -----------------------------------------------------------------------------
annualized_volatility = np.sqrt(test_mse_tuned * 252) # Assuming daily returns
information_ratio = np.mean(y_pred_test_tuned) / np.std(y_pred_test_tuned) if np.std(y_pred_test_tuned) > 0 else 0
print(f"\nFINANCIAL PERFORMANCE METRICS:")
print(f"• Prediction Volatility (Annualized): {annualized_volatility:.2%}")
print(f"• Information Ratio: {information_ratio:.2f}")
# -----------------------------------------------------------------------------
# Portfolio Construction Insight
# -----------------------------------------------------------------------------
print(f"\nPORTFOLIO CONSTRUCTION INSIGHTS:")
print(f"• Sparse Model: Only {len(estimated_predictive_factors)} factors needed for strategy")
print(f"• Transaction Costs: Reduced by eliminating noisy factors")
print(f"• Model Stability: Robust to multicollinearity (e.g., Factor_5 & Factor_15)")
if len(estimated_predictive_factors) > 0:
print(f"\nKEY IDENTIFIED FACTORS (with financial interpretation):")
for i, factor_idx in enumerate([feature_names.index(f) for f in estimated_predictive_factors if f in feature_names][:3]):
coef_value = best_elastic.coef_[factor_idx]
magnitude = "Strong" if abs(coef_value) > 10 else "Moderate" if abs(coef_value) > 5 else "Weak"
direction = "Positive" if coef_value > 0 else "Negative"
factor_type = "Momentum" if factor_idx % 4 == 0 else "Value" if factor_idx % 4 == 1 else "Quality" if factor_idx % 4 == 2 else "Risk"
print(f" {i+1}. {factor_type} Factor ({direction} impact, {magnitude} magnitude)")
# -----------------------------------------------------------------------------
# Comparison with Traditional Methods
# -----------------------------------------------------------------------------
print(f"\nCOMPARISON WITH TRADITIONAL APPROACHES:")
print(f"• OLS Regression: Would overfit with 20 factors and 200 observations")
print(f"• Stepwise Selection: Might miss important interactions and correlations")
print(f"• Elastic Net Advantage: Automatic feature selection + handling of multicollinearity")
# -----------------------------------------------------------------------------
# Risk Management Considerations
# -----------------------------------------------------------------------------
print(f"\nRISK MANAGEMENT BENEFITS:")
print(f"• Model Interpretability: Clear which factors drive returns")
print(f"• Reduced Overfitting: Regularization prevents chasing noise")
print(f"• Robustness: Stable performance across market regimes")
print("\n" + "="*70)
print("CONCLUSION: Elastic Net provides a mathematically rigorous framework")
print("for distinguishing true alpha factors from statistical noise in finance.")
print("="*70)
# =============================================================================
# Additional Visualization: Financial Factor Importance
# =============================================================================
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
factor_importance = pd.DataFrame({
'factor': feature_names,
'coefficient': best_elastic.coef_,
'abs_importance': np.abs(best_elastic.coef_)
}).sort_values('abs_importance', ascending=False)
colors = []
for i, factor in enumerate(factor_importance['factor']):
    idx = int(factor.split('_')[1])
    if idx in [4, 7, 8, 13, 18]:          # True predictive factors
        colors.append('green')            # True signals
    elif abs(best_elastic.coef_[idx]) > 1:  # index by factor number, not sorted position
        colors.append('blue')             # Strong estimated signals
    else:
        colors.append('gray')             # Noise factors
bars = plt.barh(range(len(factor_importance)), factor_importance['abs_importance'], color=colors)
plt.xlabel('Absolute Coefficient Magnitude (Importance)')
plt.title('Financial Factor Importance\n(Green=True Signal, Blue=Selected, Gray=Noise)')
plt.yticks(range(len(factor_importance)), factor_importance['factor'])
plt.gca().invert_yaxis()
# Plot 2: Performance comparison
plt.subplot(1, 2, 2)
methods = ['Baseline\n(Untuned)', 'Tuned Elastic Net']
mse_values = [test_mse, test_mse_tuned]
bars = plt.bar(methods, mse_values, color=['red', 'green'], alpha=0.7)
plt.ylabel('Test MSE (Lower = Better)')
plt.title('Model Performance: Baseline vs Tuned')
for bar, value in zip(bars, mse_values):
plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.0001,
f'{value:.4f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()
# =============================================================================
# Final Summary Table
# =============================================================================
summary_table = pd.DataFrame({
'Metric': ['Out-of-Sample R²', 'Factors Selected', 'Noise Factors Eliminated',
'True Factors Identified', 'Performance Improvement'],
'Value': [f'{test_r2_tuned:.1%}', f'{len(estimated_predictive_factors)}/20',
f'{20 - len(estimated_predictive_factors)}',
f'{len(set(true_predictive_factors) & set(estimated_predictive_factors))}/5',
f'{((test_mse - test_mse_tuned) / test_mse * 100):.1f}% MSE reduction'],
'Interpretation': ['Excellent predictive power', 'Sparse, interpretable model',
'Reduced transaction costs', 'Accurate signal detection',
'Substantial tuning benefit']
})
print("\nSUMMARY TABLE: Financial Application Results")
print("="*65)
print(summary_table.to_string(index=False))
print("="*65)
======================================================================
FINANCIAL APPLICATION: ELASTIC NET FOR FACTOR INVESTING
======================================================================
BUSINESS INTERPRETATION:
--------------------------------------------------
SCENARIO: Predicting excess returns using 20 potential factors
CHALLENGE: Only 5 factors are truly predictive (simulated reality)
SOLUTION: Elastic Net for robust factor selection and regularization

RESULTS:
• Predictive Accuracy: R² = 100.0% out-of-sample
• Factor Selection: Identified 10/20 factors as meaningful
• True Positives: 5/5 true factors correctly identified
• Noise Reduction: 10 spurious factors eliminated

FINANCIAL PERFORMANCE METRICS:
• Prediction Volatility (Annualized): 353.05%
• Information Ratio: -0.13

PORTFOLIO CONSTRUCTION INSIGHTS:
• Sparse Model: Only 10 factors needed for strategy
• Transaction Costs: Reduced by eliminating noisy factors
• Model Stability: Robust to multicollinearity (e.g., Factor_5 & Factor_15)

KEY IDENTIFIED FACTORS (with financial interpretation):
 1. Momentum Factor (Positive impact, Weak magnitude)
 2. Value Factor (Positive impact, Weak magnitude)
 3. Momentum Factor (Positive impact, Strong magnitude)

COMPARISON WITH TRADITIONAL APPROACHES:
• OLS Regression: Would overfit with 20 factors and 200 observations
• Stepwise Selection: Might miss important interactions and correlations
• Elastic Net Advantage: Automatic feature selection + handling of multicollinearity

RISK MANAGEMENT BENEFITS:
• Model Interpretability: Clear which factors drive returns
• Reduced Overfitting: Regularization prevents chasing noise
• Robustness: Stable performance across market regimes

======================================================================
CONCLUSION: Elastic Net provides a mathematically rigorous framework
for distinguishing true alpha factors from statistical noise in finance.
======================================================================
SUMMARY TABLE: Financial Application Results
=================================================================
Metric Value Interpretation
Out-of-Sample R² 100.0% Excellent predictive power
Factors Selected 10/20 Sparse, interpretable model
Noise Factors Eliminated 10 Reduced transaction costs
True Factors Identified 5/5 Accurate signal detection
Performance Improvement 99.9% MSE reduction Substantial tuning benefit
=================================================================
Interpretation of Results
The output from the Elastic Net regression provides a clear demonstration of its efficacy and the importance of hyperparameter tuning for a financial factor model.
1. Dramatic Impact of Hyperparameter Tuning: The baseline model, using arbitrary parameters (alpha=0.1, l1_ratio=0.5), was severely overfit. This is evident from the extremely high MSE values (~45.7 on training, ~54.7 on testing) despite the deceptively high R² scores. These MSE values indicate large prediction errors. The tuned model, with optimal parameters {'alpha': 0.01, 'l1_ratio': 1}, reduced the Test MSE by over 99.9% (from 54.67 to 0.05) and achieved a perfect Test R² of 1.0. This highlights a critical lesson: default parameters can be disastrous for regularized models, and systematic tuning is non-negotiable.
2. Effective Variable Selection and Model Sparsity: The tuned model successfully separated the true signal from the noise. The optimal `l1_ratio` of 1 means the penalty reduced to a pure LASSO penalty, which was best suited to this dataset.
- True Signals Captured: The model correctly identified the five informative factors (4, 7, 8, 13, 18) and assigned them large, non-zero coefficients very close to their true values (e.g., Factor_13: True=97.25 vs. Estimated=104.00).
- Noise Suppressed: The model drove the coefficients of the 15 non-informative factors to near-zero values (e.g., Factors 0, 1, 2, 3, 5, 6, etc.). This sparsity simplifies the model and reduces the risk of overfitting on spurious correlations, which is crucial in finance.
3. Handling of Multicollinearity: The results show the model's behavior with the correlated features we engineered (Factor_5 and Factor_15). Both coefficients were shrunk to approximately zero ($-0.0001$ and $-0.0026$). This is a rational outcome: since neither feature truly drives the target return (their true coefficients are zero), the model correctly discarded both. In a scenario where correlated features were informative, Elastic Net would typically shrink their coefficients similarly rather than arbitrarily selecting one, which is a key advantage over pure LASSO.
Conclusion for Financial Modeling: This exercise underscores that Elastic Net, when properly tuned, is a powerful tool for building parsimonious factor models. It can effectively distill a large set of potential factors down to the most predictive few, mitigating overfitting and enhancing the model's ability to generalize to new data, a fundamental requirement for a robust trading strategy.
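To make the grouping-effect claim above concrete, the sketch below (a toy example, not part of the project's pipeline; the data and parameter values are assumed) fits LASSO and Elastic Net on two nearly identical informative features plus noise features:

```python
# Hedged toy sketch: two highly correlated informative features plus noise.
import numpy as np
from sklearn.linear_model import Lasso, ElasticNet

rng = np.random.default_rng(0)
n = 200
z = rng.normal(size=n)
x1 = z + 0.01 * rng.normal(size=n)   # two nearly identical copies of the
x2 = z + 0.01 * rng.normal(size=n)   # same informative signal
noise = rng.normal(size=(n, 3))      # three irrelevant features
X = np.column_stack([x1, x2, noise])
y = 2.0 * z + 0.1 * rng.normal(size=n)

lasso = Lasso(alpha=0.1, max_iter=10_000).fit(X, y)
enet = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10_000).fit(X, y)
print("Lasso coefs:      ", np.round(lasso.coef_, 2))
print("Elastic Net coefs:", np.round(enet.coef_, 2))
# LASSO tends to concentrate weight on one of the twin features; Elastic Net
# typically spreads it more evenly across both (the "grouping effect").
```

Both models zero out the noise features; the interesting difference is how the weight on the correlated pair is allocated.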
K-Means Clustering¶
K-Means clustering is an unsupervised learning algorithm that partitions a dataset into k groups by minimizing within-cluster variance (Hastie et al. 2009). The algorithm iteratively assigns observations to the closest centroid and updates the centroids until no further changes occur.
Advantages¶
- Simple and Fast: Lloyd-style k-means is one of the fastest clustering methods, with an average complexity of approximately O(k · n · T), where k is the number of clusters, n is the number of points, and T is the number of iterations. It scales well for moderate datasets, and MiniBatchKMeans is available for larger datasets (KMeans — scikit-learn 1.7.2 Documentation, 2025).
- Easy to Explain: The objective is to minimize the within-cluster sum of squares (WCSS, or inertia), which is intuitive as it involves assigning points to the nearest centroid (cluster mean) (Agustsson, Timofte, and Van Gool, 2017).
- Good Baselines: With careful initialization (e.g., k-means++), convergence is faster, and solutions avoid many poor local minima.
Basics¶
- What It Is: K-means is an unsupervised, prototype-based algorithm that partitions n observations into k clusters by minimizing WCSS under squared Euclidean distance (Agustsson, Timofte, and Van Gool, 2017).
- Two-Step Iteration (Lloyd’s Algorithm): (1) Assign each point to the nearest centroid; (2) Recompute centroids as the mean of points in each cluster. Repeat until convergence.
- Optimization Landscape: Finding the global optimum is NP-hard, but practical algorithms like k-means++ find good local optima (KMeans — scikit-learn 1.7.2 Documentation, 2025).
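The two-step iteration above can be sketched in a few lines of NumPy (illustrative only; the naive random initialization here is exactly what k-means++ improves upon, and `sklearn.cluster.KMeans` should be used in practice):

```python
# Minimal sketch of Lloyd's algorithm on toy 2-D data (assumed blobs).
import numpy as np

def lloyd_kmeans(X, k, n_iter=100, seed=42):
    rng = np.random.default_rng(seed)
    # Naive random initialization (k-means++ would pick spread-out seeds)
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Step 1: assign each point to its nearest centroid
        d = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = d.argmin(axis=1)
        # Step 2: recompute each centroid as the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) if np.any(labels == j)
                                  else centroids[j] for j in range(k)])
        if np.allclose(new_centroids, centroids):
            break  # converged: centroids (and hence assignments) stopped changing
        centroids = new_centroids
    inertia = ((X - centroids[labels]) ** 2).sum()  # WCSS at convergence
    return labels, centroids, inertia

# Two well-separated blobs
X = np.vstack([np.random.default_rng(0).normal(0, 0.3, (50, 2)),
               np.random.default_rng(1).normal(5, 0.3, (50, 2))])
labels, centroids, inertia = lloyd_kmeans(X, k=2)
```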
Computation¶
- Preprocessing: Handle missing values (NAs). Scale numeric features to comparable ranges if units differ; compare scaled vs. unscaled results if units are the same (Wongoutong, 2024).
- Choose k: Use the elbow method (plotting k vs. inertia), silhouette score, or optionally the gap statistic to justify the number of clusters.
- Fit: Use init = "k-means++" for better initialization, multiple restarts (n_init) for robustness, and record cluster centers, labels, and inertia (KMeans — scikit-learn 1.7.2 Documentation, 2025).
- Validating/Interpreting: Analyze cluster profiles (e.g., feature means or medians), silhouette score distribution, and provide business interpretations of clusters.
Disadvantages¶
- Shape and Variance Assumptions: K-means assumes roughly spherical clusters with equal variance under Euclidean distance (KMeans — scikit-learn 1.7.2 Documentation, 2025).
- Sensitivity to Feature Scale: Features with larger ranges dominate distances; scaling is often necessary when units differ (Wongoutong, 2024).
- Initialization and Local Minima: Different initial seeds can lead to different results. Mitigate with k-means++ and higher n_init (KMeans — scikit-learn 1.7.2 Documentation, 2025).
- Outliers and Empty Clusters: Outliers can distort centroids, and empty clusters may occur during iterations (KMeans — scikit-learn 1.7.2 Documentation, 2025).
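A quick toy demonstration of the feature-scale sensitivity noted above (all data and values here are assumed): an uninformative feature measured in large units dominates the Euclidean distance until the features are standardized.

```python
# Scale-sensitivity sketch: groups differ only in the small-scale feature.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
small = np.concatenate([rng.normal(0, 0.1, 50), rng.normal(3, 0.1, 50)])  # informative
large = rng.normal(10_000, 500, 100)      # uninformative, huge units
X = np.column_stack([small, large])

raw_labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
scaled_labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(
    StandardScaler().fit_transform(X))
# On raw data the large-unit noise feature drives the partition;
# after standardization the split should recover the true groups.
```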
Equations¶
Let $X = \{x_{1}, \ldots, x_{n}\} \subset \mathbb{R}^{d}$ be the dataset, partitioned into $k$ clusters $S_{1}, \ldots, S_{k}$ with centroids $\mu_{i} = \frac{1}{|S_{i}|} \sum_{x \in S_{i}} x$. The objective (inertia/WCSS) is:
$$ \min_{S_{1}, \ldots, S_{k}} \sum_{i=1}^{k} \sum_{x \in S_{i}} \|x - \mu_{i}\|^{2} $$
This distortion is minimized through Lloyd’s updates: assigning points to the nearest centroid and recomputing centroids (Agustsson, Timofte, and Van Gool, 2017).
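As a sanity check, scikit-learn's `inertia_` attribute is exactly this WCSS objective; the sketch below (toy random data, assumed values) recomputes it by hand from the fitted labels and cluster centers:

```python
# Verify that sklearn's inertia_ equals the hand-computed WCSS objective.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))
km = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)

# Sum over clusters i, then over points x in S_i, of ||x - mu_i||^2
wcss = sum(((X[km.labels_ == i] - mu) ** 2).sum()
           for i, mu in enumerate(km.cluster_centers_))
print(np.isclose(wcss, km.inertia_))  # True
```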
Illustrations¶
- Elbow Plot: Plot k vs. inertia to identify the optimal number of clusters.
- Silhouette Plot: Show per-sample silhouette scores and the average score.
The silhouette score for observation $i$ is defined as
$$ s(i) = \frac{b(i) - a(i)}{\max \{a(i), b(i)\}}, $$
where $a(i)$ is the mean distance from $i$ to the other points in its own cluster and $b(i)$ is the smallest mean distance from $i$ to the points of any other cluster.
- 2D Projection: Use PCA or t-SNE, colored by cluster, with centroids annotated.
- Cluster Profile Table: Display per-cluster means or medians of key features, with a finance-friendly narrative.
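The silhouette definition above can be verified by hand; the toy sketch below (assumed data, two well-separated blobs) computes $s(i)$ for one observation and compares it with `sklearn.metrics.silhouette_samples`:

```python
# Hand-compute the silhouette score of one sample and check it against sklearn.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (30, 2)), rng.normal(6, 1, (30, 2))])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

i = 0
same = (labels == labels[i])
same[i] = False                         # exclude the point itself from a(i)
d = np.linalg.norm(X - X[i], axis=1)
a = d[same].mean()                      # mean distance within own cluster
b = d[labels != labels[i]].mean()       # mean distance to the (only) other cluster
s_manual = (b - a) / max(a, b)
s_sklearn = silhouette_samples(X, labels)[i]
print(np.isclose(s_manual, s_sklearn))  # True
```

With only two clusters, the mean distance to the other cluster is automatically the minimum over "other" clusters, so the hand computation matches the general definition.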
Journal Application¶
Boloș et al. (2025), Symmetry: Applied k-means to profitability, liquidity, and solvency indicators to form portfolios with distinct risk-return profiles. The silhouette score outperformed the elbow method for selecting k. This study demonstrates k-means’ utility in clustering financial issuers for portfolio construction.
Keywords k-means, k-means++, inertia, WCSS, silhouette, elbow method, gap statistic, Lloyd’s algorithm, prototype-based clustering, Euclidean distance, unsupervised learning, MiniBatchKMeans.
Step 1 - Pulling daily Adjusted Close prices for a diversified list of stocks (AAPL, MSFT, NIO, TSLA, RIVN, TM, F, BYDDF, XOM, CVX, NEE, JPM, GS, BAC, KO, WMT, CAT), from 2018-01-01 to 2025-09-19, sanity-check what came back, and save tidy CSVs we’ll reuse in later steps (returns, features, clustering).
#Step 1: Data Pull & Staging for K-Means (2018–2025)
!pip -q install yfinance pandas numpy
import datetime as dt
import numpy as np
import pandas as pd
import yfinance as yf
# Define stock universe
SECTORS = {
"Technology": ["AAPL", "MSFT"],
"EV/Auto": ["NIO", "TSLA", "RIVN", "TM", "F", "BYDDF"],
"Energy": ["XOM", "CVX", "NEE"],
"Financials": ["JPM", "GS", "BAC"],
"Consumer/Industrial": ["KO", "WMT", "CAT"]
}
TICKERS = sorted({t for lst in SECTORS.values() for t in lst})
# Fixed date range
start_date = "2018-01-01"
end_date = "2025-09-19" # as of specific date given
print(f"Downloading {len(TICKERS)} tickers from {start_date} to {end_date}...")
print("Tickers:", TICKERS)
# Download Adjusted Close prices
raw = yf.download(
tickers=TICKERS,
start=start_date,
end=end_date,
auto_adjust=False,
actions=False,
progress=True,
group_by='ticker',
interval='1d'
)
# Extract Adjusted Close into wide DataFrame
def extract_adj_close(raw_df, tickers):
out = {}
for t in tickers:
try:
s = raw_df[t]['Adj Close'].rename(t)
except Exception:
try:
s = raw_df['Adj Close'][t].rename(t)
except Exception:
s = pd.Series(dtype=float, name=t)
out[t] = s
df_prices = pd.concat(out, axis=1)
df_prices.index = pd.to_datetime(df_prices.index)
df_prices = df_prices.sort_index()
return df_prices
prices = extract_adj_close(raw, TICKERS)
# Coverage summary
summary = pd.DataFrame({
"first_date": prices.apply(lambda s: s.first_valid_index()),
"last_date": prices.apply(lambda s: s.last_valid_index()),
"obs_count": prices.notna().sum(),
"missing_count": prices.isna().sum(),
})
summary["missing_pct"] = (
summary["missing_count"] / (summary["obs_count"] + summary["missing_count"]).replace(0, np.nan) * 100
)
print("Coverage Summary (per ticker)")
display(summary.sort_index())
if ("BYDDF" in prices.columns) and (prices["BYDDF"].notna().sum() < 500):
print("\n[Hint] BYDDF (OTC) is sparse. Switch to '1211.HK' (Hong Kong listing) if needed.")
# Save staged prices
prices.to_csv("prices_adjclose_2018_2025.csv", index=True)
print("\nSaved: prices_adjclose_2018_2025.csv")
# Quick preview
print("\nHead:")
display(prices.head())
print("\nTail:")
display(prices.tail())
Downloading 17 tickers from 2018-01-01 to 2025-09-19...
Tickers: ['AAPL', 'BAC', 'BYDDF', 'CAT', 'CVX', 'F', 'GS', 'JPM', 'KO', 'MSFT', 'NEE', 'NIO', 'RIVN', 'TM', 'TSLA', 'WMT', 'XOM']
[*********************100%***********************] 17 of 17 completed
Coverage Summary (per ticker)
| Ticker | first_date | last_date | obs_count | missing_count | missing_pct |
|---|---|---|---|---|---|
| AAPL | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| BAC | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| BYDDF | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| CAT | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| CVX | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| F | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| GS | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| JPM | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| KO | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| MSFT | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| NEE | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| NIO | 2018-09-12 | 2025-09-18 | 1764 | 175 | 9.025271 |
| RIVN | 2021-11-10 | 2025-09-18 | 967 | 972 | 50.128932 |
| TM | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| TSLA | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| WMT | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
| XOM | 2018-01-02 | 2025-09-18 | 1939 | 0 | 0.000000 |
Saved: prices_adjclose_2018_2025.csv

Head:
| Date | AAPL | BAC | BYDDF | CAT | CVX | F | GS | JPM | KO | MSFT | NEE | NIO | RIVN | TM | TSLA | WMT | XOM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2018-01-02 | 40.380997 | 24.905537 | 2.934475 | 132.230423 | 91.336258 | 8.358680 | 215.328186 | 87.152008 | 35.742023 | 79.198349 | 31.994013 | NaN | NaN | 104.699623 | 21.368668 | 28.973356 | 59.486675 |
| 2018-01-03 | 40.373974 | 24.822237 | 3.036572 | 132.432526 | 92.002029 | 8.424705 | 213.323730 | 87.240829 | 35.663540 | 79.566917 | 31.315298 | NaN | NaN | 106.135101 | 21.150000 | 29.226093 | 60.654995 |
| 2018-01-04 | 40.561497 | 25.147089 | 2.990464 | 134.251343 | 91.715691 | 8.569959 | 216.305176 | 88.490593 | 36.165840 | 80.267212 | 31.154409 | NaN | NaN | 107.790794 | 20.974667 | 29.252539 | 60.738964 |
| 2018-01-05 | 41.023304 | 25.263710 | 2.967409 | 136.373199 | 91.565331 | 8.715212 | 215.201889 | 87.922516 | 36.157993 | 81.262375 | 31.296741 | NaN | NaN | 109.177322 | 21.105333 | 29.425920 | 60.689968 |
| 2018-01-08 | 40.870937 | 25.088785 | 2.973996 | 139.800217 | 92.016357 | 8.682199 | 212.077255 | 88.052391 | 36.103058 | 81.345306 | 31.554604 | NaN | NaN | 109.919548 | 22.427334 | 29.860865 | 60.962830 |
Tail:
| Date | AAPL | BAC | BYDDF | CAT | CVX | F | GS | JPM | KO | MSFT | NEE | NIO | RIVN | TM | TSLA | WMT | XOM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2025-09-12 | 234.070007 | 50.580002 | 13.39 | 431.519989 | 157.110001 | 11.68 | 780.059998 | 306.910004 | 66.500000 | 509.899994 | 71.639999 | 6.22 | 13.46 | 196.130005 | 395.940002 | 103.489998 | 112.160004 |
| 2025-09-15 | 236.699997 | 50.590000 | 13.95 | 435.940002 | 157.309998 | 11.68 | 786.760010 | 308.899994 | 66.209999 | 515.359985 | 71.500000 | 6.49 | 13.60 | 197.279999 | 410.040009 | 103.690002 | 112.349998 |
| 2025-09-16 | 238.149994 | 50.660000 | 14.25 | 440.670013 | 159.539993 | 11.61 | 785.530029 | 309.190002 | 66.239998 | 509.040009 | 69.830002 | 7.02 | 14.32 | 198.779999 | 421.619995 | 103.419998 | 114.680000 |
| 2025-09-17 | 238.990005 | 51.400002 | 14.45 | 450.660004 | 160.089996 | 11.66 | 794.219971 | 311.750000 | 67.040001 | 510.019989 | 70.309998 | 7.45 | 14.11 | 201.380005 | 425.859985 | 104.269997 | 115.290001 |
| 2025-09-18 | 237.880005 | 52.130001 | 14.56 | 466.959991 | 158.839996 | 11.74 | 804.309998 | 313.230011 | 66.459999 | 508.450012 | 70.790001 | 7.37 | 14.68 | 200.470001 | 416.850006 | 103.599998 | 113.930000 |
Step 2 — Feature engineering (returns, MAs, vol, momentum)
# --- Step 2: Feature Engineering for K-Means
import numpy as np
import pandas as pd
# 2.1 Daily returns
ret = prices.pct_change().dropna(how="all")
# 2.2 Rolling features per ticker
def make_features(r: pd.DataFrame) -> pd.DataFrame:
f = pd.DataFrame(index=r.index)
for t in r.columns:
f[f"{t}_r1"] = r[t].shift(1)
f[f"{t}_r5"] = r[t].rolling(5).mean().shift(1)
f[f"{t}_r20"] = r[t].rolling(20).mean().shift(1)
f[f"{t}_vol20"] = r[t].rolling(20).std(ddof=0).shift(1)
f[f"{t}_mom20"] = (1 + r[t]).rolling(20).apply(lambda x: np.prod(x) - 1, raw=True).shift(1)
return f
X_raw = make_features(ret).dropna(how="any")
X_raw.shape, X_raw.head()
((946, 85),
AAPL_r1 AAPL_r5 AAPL_r20 AAPL_vol20 AAPL_mom20 BAC_r1 \
Date
2021-12-10 -0.002970 0.013008 0.008442 0.016024 0.180098 0.007473
2021-12-13 0.028013 0.020955 0.009860 0.016434 0.213566 0.000674
2021-12-14 -0.020674 0.012520 0.008109 0.017681 0.171678 -0.021114
2021-12-15 -0.008023 0.003826 0.007705 0.017951 0.162200 0.012620
2021-12-16 0.028509 0.004971 0.008797 0.018510 0.187417 -0.004306
BAC_r5 BAC_r20 BAC_vol20 BAC_mom20 ... WMT_r1 WMT_r5 \
Date ...
2021-12-10 -0.001697 -0.002599 0.017041 -0.053480 ... 0.013909 0.005292
2021-12-13 0.002982 -0.002840 0.016959 -0.058027 ... 0.018267 0.005934
2021-12-14 -0.002517 -0.003285 0.017312 -0.066515 ... 0.018010 0.007369
2021-12-15 -0.002530 -0.002803 0.017611 -0.057547 ... 0.009542 0.009925
2021-12-16 -0.000930 -0.003051 0.017596 -0.062203 ... 0.005727 0.013091
WMT_r20 WMT_vol20 WMT_mom20 XOM_r1 XOM_r5 XOM_r20 \
Date
2021-12-10 -0.003184 0.012287 -0.063206 0.002562 0.004325 -0.001081
2021-12-13 -0.002290 0.013135 -0.046479 0.006389 0.006876 -0.000855
2021-12-14 -0.001141 0.013836 -0.024445 -0.021901 0.000229 -0.001569
2021-12-15 -0.000376 0.013982 -0.009437 -0.001460 -0.002304 -0.002073
2021-12-16 0.001183 0.012786 0.022260 -0.004387 -0.003760 -0.002797
XOM_vol20 XOM_mom20
Date
2021-12-10 0.018039 -0.024615
2021-12-13 0.018102 -0.020215
2021-12-14 0.018629 -0.034315
2021-12-15 0.018482 -0.043965
2021-12-16 0.018274 -0.057674
[5 rows x 85 columns])
Step 3 — Preprocess (scaling)
# -Step 3: Scaling (important for Euclidean distance)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
XZ = scaler.fit_transform(X_raw)
# Keep aligned index for later joins/plots
X_index = X_raw.index
X_cols = X_raw.columns
print("Feature matrix shape:", XZ.shape)
Feature matrix shape: (946, 85)
Step 4 — Choose K (elbow + silhouette)
# Step 4: K selection via elbow (inertia) and silhouette
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os
Ks = range(2, 10)
inertia = []
sil_scores = []
for k in Ks:
km = KMeans(n_clusters=k, n_init=50, init="k-means++", random_state=42)
labels = km.fit_predict(XZ)
inertia.append(km.inertia_) # elbow
sil_scores.append(silhouette_score(XZ, labels)) # silhouette
fig, ax = plt.subplots(1, 2, figsize=(10,4))
ax[0].plot(list(Ks), inertia, marker="o"); ax[0].set_title("Elbow (Inertia vs K)")
ax[0].set_xlabel("K"); ax[0].set_ylabel("Inertia")
ax[1].plot(list(Ks), sil_scores, marker="o"); ax[1].set_title("Silhouette vs K")
ax[1].set_xlabel("K"); ax[1].set_ylabel("Silhouette")
plt.tight_layout()
# Create the 'assets' directory if it doesn't exist
if not os.path.exists('assets'):
os.makedirs('assets')
plt.savefig("assets/kmeans_elbow_silhouette.png", dpi=180, bbox_inches="tight")
plt.show()
# pick K by max silhouette (simple rule; you can override by eyeballing elbow)
best_K = Ks[int(np.argmax(sil_scores))]
print("Chosen K (by silhouette):", best_K)
Chosen K (by silhouette): 2
Step 5 — Fit final k-means
# Step 5: Final fit
kmeans = KMeans(n_clusters=best_K, n_init=100, init="k-means++", random_state=42)
labels = kmeans.fit_predict(XZ)
centroids_scaled = pd.DataFrame(kmeans.cluster_centers_, columns=X_cols)
centroids_unscaled = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=X_cols)
clusters = pd.Series(labels, index=X_index, name="cluster")
clusters.value_counts().sort_index()
cluster
0    295
1    651
Name: count, dtype: int64
Step 6 — Interpret with cluster profiles (finance-flavoured)
# Step 6: Cluster profiling
REF = "AAPL" if "AAPL" in ret.columns else ret.columns[0]
fwd = ret[REF].shift(-1).reindex(X_index)
vol_cols = [c for c in X_cols if c.endswith("_vol20")]
profile = pd.DataFrame({
"n_obs": clusters.value_counts().sort_index(),
"avg_next_day_ret": fwd.groupby(clusters).mean().sort_index(),
"avg_vol20": X_raw[vol_cols].mean(axis=1).groupby(clusters).mean().sort_index()
}).round(6)
print(profile)
ax = profile["avg_next_day_ret"].plot(kind="bar", title=f"Avg Next-Day Return ({REF}) by Cluster")
ax.set_xlabel("Cluster"); ax.set_ylabel("Avg next-day return")
plt.tight_layout(); plt.savefig("assets/kmeans_profile_return.png", dpi=180, bbox_inches="tight"); plt.show()
ax = profile["avg_vol20"].plot(kind="bar", title="Avg 20-day Volatility (mean across tickers) by Cluster")
ax.set_xlabel("Cluster"); ax.set_ylabel("Avg vol (20d)")
plt.tight_layout(); plt.savefig("assets/kmeans_profile_vol.png", dpi=180, bbox_inches="tight"); plt.show()
cluster_feature_means = pd.DataFrame(XZ, index=X_index, columns=X_cols).groupby(clusters).mean()
cluster_feature_means.round(3).head()
         n_obs  avg_next_day_ret  avg_vol20
cluster
0          295          0.000740   0.024367
1          651          0.000371   0.019622
| cluster | AAPL_r1 | AAPL_r5 | AAPL_r20 | AAPL_vol20 | AAPL_mom20 | BAC_r1 | BAC_r5 | BAC_r20 | BAC_vol20 | BAC_mom20 | ... | WMT_r1 | WMT_r5 | WMT_r20 | WMT_vol20 | WMT_mom20 | XOM_r1 | XOM_r5 | XOM_r20 | XOM_vol20 | XOM_mom20 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.218 | -0.324 | -0.712 | 0.527 | -0.715 | -0.265 | -0.497 | -0.980 | 0.561 | -0.979 | ... | -0.163 | -0.334 | -0.634 | 0.410 | -0.639 | -0.132 | -0.227 | -0.156 | 0.711 | -0.157 |
| 1 | 0.099 | 0.147 | 0.322 | -0.239 | 0.324 | 0.120 | 0.225 | 0.444 | -0.254 | 0.443 | ... | 0.074 | 0.152 | 0.287 | -0.186 | 0.289 | 0.060 | 0.103 | 0.071 | -0.322 | 0.071 |
2 rows × 85 columns
Step 7 — Visualize clusters in 2-D (PCA projection)
# --- Step 7: PCA 2-D visualization ---
from sklearn.decomposition import PCA
pca = PCA(n_components=2, random_state=42)
XZ2 = pca.fit_transform(XZ)
plt.figure(figsize=(6,5))
for c in sorted(np.unique(labels)):
idx = labels == c
plt.scatter(XZ2[idx, 0], XZ2[idx, 1], s=8, alpha=0.65, label=f"cluster {c}")
plt.legend()
plt.title("K-Means clusters (PCA 2-D projection)")
plt.xlabel("PC1"); plt.ylabel("PC2")
plt.tight_layout(); plt.savefig("assets/kmeans_pca_scatter.png", dpi=180, bbox_inches="tight"); plt.show()
print("Explained variance by PC1, PC2:", np.round(pca.explained_variance_ratio_, 4))
Explained variance by PC1, PC2: [0.1675 0.1025]
Results and Interpretations
The dataset consisted of adjusted close prices obtained through yfinance for a diversified set of seventeen stocks across technology (AAPL, MSFT), electric vehicles and automotive (TSLA, NIO, RIVN, TM, F, BYDDF), energy (XOM, CVX, NEE), financials (JPM, GS, BAC), and consumer/industrial companies (KO, WMT, CAT). The time horizon extended from January 2018 to September 2025.
A coverage summary indicated that most tickers had nearly full histories (~1939 observations), although RIVN and NIO had shorter records due to later listings. Such heterogeneity in data availability is common in financial studies, reflecting the staggered entry of firms into public markets (Xu and Wunsch, 2005).
From these prices, daily returns were computed, and multiple rolling features were engineered to capture trading dynamics. For each ticker, the following were created:
- One-day lagged return
- Five- and twenty-day moving averages of returns
- Twenty-day realized volatility
- Twenty-day cumulative momentum

These features are consistent with research on clustering financial time series (Arthur and Vassilvitskii 2007). After feature engineering, the dataset contained 946 observations across 85 features. Because K-Means is sensitive to the scale of input features, the dataset was standardized using z-scores. This transformation ensures that volatility, momentum, and return measures contribute equally to the clustering process (Hastie et al. 2009).
Classification Trees¶
A classification tree is a non-parametric, supervised learning algorithm used to predict a discrete, categorical outcome. The model's core mechanism is binary recursive partitioning, which systematically divides the feature space into a set of rectangular regions. The goal is to create progressively smaller groups that are as homogeneous as possible with respect to the dependent variable. This hierarchical process of splitting the data by features generates a set of simple, interpretable rules for classification (Hastie et al. 305; Loh and Vanichsetakul 715; Loh 1).
Keywords: Classification Tree, Decision Tree, Binary Recursive Partitioning, Node, Branch, Split, Impurity, Gini Impurity, Entropy, Information Gain, Pruning, Supervised Learning.
1. Basic terminology¶
The structure of a classification tree can be understood through the following terms:
- Root Node: The starting point of the tree, representing the entire dataset.
- Internal Node (Decision Node): A node that represents a condition based on a feature's value.
- Branch: A line connecting nodes, representing the flow of decisions.
- Leaf Node (Terminal Node): The endpoint of a branch containing the final classification.
- Split: A rule that partitions data into child nodes. The quality of a split is determined by how well it reduces impurity.
- Impurity: A measure of heterogeneity. A pure node has low impurity, meaning most of its data points belong to the same class.
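A minimal sketch (using the classic Iris dataset rather than the project's data) makes these terms concrete: fitting a shallow tree and printing its rules exposes the root split, internal decision nodes, and leaf nodes.

```python
# Fit a shallow classification tree and print its decision rules.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
tree = DecisionTreeClassifier(max_depth=2, random_state=42).fit(iris.data, iris.target)

rules = export_text(tree, feature_names=list(iris.feature_names))
print(rules)
# Each "|---" condition is a split at an internal node; the "class: ..."
# lines are leaf (terminal) nodes carrying the final classification.
```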
import requests
from IPython.display import Image, display
file_id = "1Umvf379T7eSN_F706uEqabkCmzdjGiqs"
url = f"https://drive.google.com/uc?export=download&id={file_id}"
resp = requests.get(url, allow_redirects=True, timeout=15)
resp.raise_for_status()
display(Image(data=resp.content, width=600))
print("Figure 1: Classification tree structure\n Authors (2025)")
Figure 1: Classification tree structure
 Authors (2025)
2. Equations¶
The fundamental mathematical representation of a classification tree partitions the feature space into M disjoint regions (terminal nodes). For a new observation with feature vector X, the prediction is determined by which region it falls into: $$\hat{f}(X) = \sum_{m=1}^{M} c_m \cdot I\{(X_1, X_2, \dots, X_p) \in R_m\}$$ where:
- $c_m$ is the predicted class in region $R_m$ (typically the majority class)
- $I\{\cdot\}$ is the indicator function that equals 1 if the condition is true, 0 otherwise
- $R_m$ represents the m-th region defined by the tree's splitting rules
For classification, $c_m = \arg\max_k \hat{p}_{mk}$, where $\hat{p}_{mk}$ is the proportion of class k observations in node m (Hastie et al. 305).
2.1 Node Impurity Measures¶
The core of tree construction involves measuring node impurity to determine optimal splits. For a node $m$ containing $N_m$ observations, we define the proportion of class $k$ observations as: $$\hat{p}_{mk} = \frac{1}{N_m} \sum_{x_i \in R_m} I(y_i = k)$$
2.1.1 Gini Impurity¶
Gini impurity measures the probability of misclassifying a randomly chosen element if it were randomly labeled according to the class distribution in the node. It reaches its minimum (zero) when all cases in the node belong to a single class.
$$\text{Gini}(m) = 1 - \sum_{k=1}^{K} \hat{p}_{mk}^2 = \sum_{k=1}^{K} \hat{p}_{mk}(1 - \hat{p}_{mk})$$
Properties:
- Range: [0, 0.5] for binary classification, [0, 1-1/K] for K classes
- Maximum when classes are equally distributed
- Convex function, making it sensitive to changes in class probabilities
Example: For a binary classification with class proportions p = 0.7 and 1-p = 0.3: $\text{Gini} = 1 - (0.7^2 + 0.3^2) = 1 - (0.49 + 0.09) = 0.42$
2.1.2 Entropy and Information Gain¶
Entropy measures the uncertainty or disorder in a system. In classification trees, it quantifies the amount of information needed to specify the class of a randomly chosen observation from the node. A pure node requires zero additional information. $$H(m) = -\sum_{k=1}^{K} \hat{p}_{mk} \log_2 \hat{p}_{mk}$$
Properties:
- Range: [0, log₂K]
- Maximum when classes are uniformly distributed
- More sensitive to probability changes than Gini impurity
- The base-2 logarithm gives entropy in bits
Information Gain: The reduction in entropy achieved by splitting a node:
$$\text{Gain}(S, m) = H(m) - \sum_{v \in \text{values}(S)} \frac{|m_v|}{|m|} H(m_v)$$
where S is the split, m_v are the child nodes, and $|m_v|/|m|$ are the proportions of data going to each child.
Example: For a binary classification with p = 0.7, 1-p = 0.3: $$H = -[0.7 \cdot \log_2(0.7) + 0.3 \cdot \log_2(0.3)] \approx -[0.7 \cdot (-0.515) + 0.3 \cdot (-1.737)] \approx 0.881$$
2.1.3 Misclassification Error¶
Misclassification error measures the proportion of observations that would be misclassified if we assigned the node to its majority class. It represents the simplest and most intuitive impurity measure.
$$\text{Error}(m) = 1 - \max_k \hat{p}_{mk}$$
Properties:
- Range: [0, 1-1/K]
- Not differentiable, making it less suitable for optimization
- Less sensitive to probability changes than Gini or Entropy
- Directly relates to the ultimate goal of classification accuracy
Example: For a binary classification with p = 0.7, 1-p = 0.3: $\text{Error} = 1 - \max(0.7, 0.3) = 1 - 0.7 = 0.3$
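The three worked examples above (binary case with $p = 0.7$) can be verified with a few lines of code:

```python
# Impurity measures for a binary node with class proportion p.
import numpy as np

def gini(p):     return 1 - (p**2 + (1 - p)**2)
def entropy(p):  return -(p * np.log2(p) + (1 - p) * np.log2(1 - p))
def misclass(p): return 1 - max(p, 1 - p)

p = 0.7
print(round(gini(p), 2))      # 0.42
print(round(entropy(p), 3))   # 0.881
print(round(misclass(p), 1))  # 0.3
```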
2.3 Comparative Analysis of Impurity Measures¶
For binary classification, writing $p$ for the proportion of observations in one class (and $1-p$ for the other), the three measures simplify to:
- Gini: $2p(1-p)$
- Entropy: $-p\log_2 p - (1-p)\log_2(1-p)$
- Misclassification: $1 - \max(p, 1-p)$
Key Differences:
- Sensitivity: Entropy > Gini > Misclassification Error
- Differentiability: Gini and Entropy are differentiable, making them better for numerical optimization
- Computational Efficiency: Misclassification Error is fastest to compute
- Practical Usage: Gini is default in CART, Entropy in ID3/C4.5
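The three measures can be checked numerically; a minimal sketch (the function names are ours) reproducing the worked examples above for p = 0.7:

```python
import numpy as np

def gini(p):
    """Gini impurity: 1 - sum_k p_k^2."""
    p = np.asarray(p, dtype=float)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    """Shannon entropy in bits; terms with p_k = 0 contribute 0."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def misclass_error(p):
    """Misclassification error: 1 - max_k p_k."""
    return 1.0 - np.max(np.asarray(p, dtype=float))

p = [0.7, 0.3]
print(round(gini(p), 2))            # 0.42
print(round(entropy(p), 3))         # 0.881
print(round(misclass_error(p), 2))  # 0.3
```

The ordering of sensitivities is visible here: entropy (0.881 out of a maximum of 1) penalizes the mixed node more heavily than Gini (0.42 out of 0.5) or misclassification error (0.3 out of 0.5).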
2.4 Splitting Criterion and Impurity Reduction¶
The optimal split at each node is chosen to maximize the reduction in impurity. For a potential split s that partitions node t into left and right child nodes $t_L$ and $t_R$: $$\Delta i(s, t) = i(t) - \frac{N_{t_L}}{N_t} \cdot i(t_L) - \frac{N_{t_R}}{N_t} \cdot i(t_R)$$ where:
- $i(t)$ is the impurity of parent node $t$ (using Gini, Entropy, or Misclassification)
- $N_{t_L}$ and $N_{t_R}$ are the number of observations in left and right child nodes
- $\frac{N_{t_L}}{N_t}$ and $\frac{N_{t_R}}{N_t}$ are the proportions of data sent to each child
The algorithm evaluates all possible splits and selects the one with maximum $\Delta i(s, t)$.
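A minimal sketch of this criterion using Gini impurity, with a hypothetical parent node split into two children (the class counts are invented for illustration):

```python
import numpy as np

def gini_from_counts(counts):
    """Gini impurity of a node given per-class counts."""
    counts = np.asarray(counts, dtype=float)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def impurity_reduction(parent, left, right):
    """Delta i(s, t) = i(t) - (N_L/N_t) i(t_L) - (N_R/N_t) i(t_R)."""
    n = float(sum(parent))
    return (gini_from_counts(parent)
            - sum(left) / n * gini_from_counts(left)
            - sum(right) / n * gini_from_counts(right))

# Hypothetical node: 40 class-0 and 60 class-1 cases, split into
# (30, 10) on the left and (10, 50) on the right.
delta = impurity_reduction([40, 60], [30, 10], [10, 50])
print(round(delta, 4))  # 0.1633
```

In practice the algorithm evaluates this quantity for every candidate split and keeps the one with the largest reduction.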
3. How Classification Trees Work¶
The process of building and using a classification tree is fundamentally intuitive, much like playing a game of "20 Questions." You start with a complex, mixed group of data and ask a series of binary (yes/no) questions. Each question acts as a split, designed to best separate the data into increasingly homogeneous subgroups, meaning that with each split, the items within a new group are more similar to each other than they were before. The ultimate goal is to continue this process until you reach endpoints, or leaf nodes, where the group is pure (containing only one class) or decisions can be made with high confidence (Loh 3-4).
This process is illustrated by the following loan approval example:
Example: Imagine we need to decide whether to approve a loan application, so we use a classification tree that starts by checking if the applicant's credit score is above 720 (a common threshold for good credit). If it is not, we then examine their employment history; if it exceeds two years, we approve the loan with higher interest; otherwise, the loan is declined. If the credit score is above 720, we instead check if their debt-to-income ratio is below 36%; if so, the loan is approved, but if not, it is declined. This structured approach allows consistent and transparent decision-making by evaluating key financial criteria in sequence (Figure 2).
import requests
from IPython.display import Image, display
file_id = "1fukBcrpQXsKfENnubbJxWBqkzN0fn8_A"
url = f"https://drive.google.com/uc?export=download&id={file_id}"
resp = requests.get(url, allow_redirects=True, timeout=15)
resp.raise_for_status()
display(Image(data=resp.content, width=700))
print("Figure 2: Classification tree for loan approval decisions\n Authors (2025)")
4. Algorithmic Properties and Performance Metrics¶
Classification trees employ a greedy algorithm that makes the locally optimal choice at each step without considering global optimality. The binary recursive partitioning process divides the feature space using axis-parallel splits, creating a hierarchical structure. Stopping criteria typically include minimum node size (e.g., 5 observations per terminal node) to prevent overfitting.
4.1 Performance Evaluation¶
For binary classification problems, key performance metrics include:
Sensitivity (Recall): $$\text{Sensitivity} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}$$
Specificity: $$\text{Specificity} = \frac{\text{True Negatives}}{\text{True Negatives} + \text{False Positives}}$$
The ROC curve plots sensitivity against (1 - specificity) across different classification thresholds. The Area Under Curve (AUC) provides an aggregate measure of performance across all thresholds and is equivalent to the c-statistic (Mann-Whitney U statistic).
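These metrics follow directly from a confusion matrix; a small sketch with toy labels and scores (the data are assumed for illustration, not from our study), using scikit-learn's `confusion_matrix` and `roc_auc_score`:

```python
import numpy as np
from sklearn.metrics import confusion_matrix, roc_auc_score

# Toy labels and predicted probabilities (assumed for illustration)
y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1])
y_prob = np.array([0.10, 0.40, 0.35, 0.80, 0.45, 0.60, 0.70, 0.90])
y_pred = (y_prob >= 0.5).astype(int)  # classify at a 0.5 threshold

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
sensitivity = tp / (tp + fn)  # true positive rate
specificity = tn / (tn + fp)  # true negative rate
auc = roc_auc_score(y_true, y_prob)  # threshold-free summary

print(sensitivity, specificity, auc)  # 0.75 0.75 0.8125
```

Note that sensitivity and specificity depend on the chosen threshold, while the AUC summarizes ranking quality across all thresholds.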
5. Overfitting, Underfitting, and Pruning¶
Decision trees are high-variance estimators due to their greedy, top-down approach (Breiman et al. 36). This makes them inherently unstable: small changes in training data can produce dramatically different trees. They also struggle to capture additive relationships of the form $Y = c_1I(X_1 < t_1) + c_2I(X_2 < t_2) + \epsilon$.
Pruning addresses overfitting by simplifying the tree structure. The cost-complexity pruning criterion is: $$C_\alpha(T) = \sum_{m=1}^{|T|} N_m Q_m(T) + \alpha |T|$$ where:
- Q_m(T) is the impurity measure for terminal node m
- $|T|$ is the number of terminal nodes
- $\alpha \geq 0$ is the complexity parameter controlling the trade-off between tree size and fit
Types of pruning:
- Pre-pruning: Stop growth early using constraints like max_depth or min_samples_split
- Post-pruning: Grow full tree then remove branches using cost-complexity pruning
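Both styles map to scikit-learn parameters; a sketch on synthetic data (the parameter values are illustrative, not tuned) comparing the resulting tree sizes:

```python
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, n_features=5, random_state=0)

full = DecisionTreeClassifier(random_state=0).fit(X, y)                  # no pruning
pre = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)      # pre-pruning
post = DecisionTreeClassifier(ccp_alpha=0.02, random_state=0).fit(X, y)  # post-pruning

# Both pruning styles yield far fewer leaves than the fully grown tree
print(full.get_n_leaves(), pre.get_n_leaves(), post.get_n_leaves())
```

Pre-pruning caps growth up front (at most $2^3 = 8$ leaves here), while cost-complexity pruning first grows the full tree and then collapses branches whose impurity reduction does not justify the $\alpha$ penalty.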
6. Advantages and Disadvantages¶
6.1 Advantages¶
The primary advantage is exceptional interpretability; the resulting model can be visualized and understood without statistical expertise. They require minimal data preprocessing, naturally handling both numerical and categorical data. Furthermore, they can model complex, non-linear relationships without variable transformation and perform automatic feature selection by choosing splits that best reduce impurity (Loh and Shih 815). Trees are scale-invariant and can handle missing data through surrogate splits.
6.2 Disadvantages¶
The most significant disadvantage is susceptibility to overfitting, creating complex trees that don't generalize well (high variance). They exhibit selection bias, favoring variables with more potential splits. The greedy nature means they may miss globally optimal partitions, and their prediction surfaces are non-smooth step functions.
7. Implementation Guide¶
Inputs¶
Learning sample $(L)$: A set of $N$ cases with feature vectors $x_n$ and class labels $j_n$: $$L = \{(x_n, j_n) \mid n = 1, \dots, N \}$$
Set of binary questions (Q): Predefined splits of the form $\text{Is } x_m \le c?$ (numerical) or $\text{Is } x_m \in S?$ (categorical)
Pruning parameters: Complexity parameter $\alpha$, minimum node size, maximum depth
Outputs¶
- Decision tree $(T)$: Hierarchical structure partitioning the feature space
- Classifier $(d(x))$: Function mapping feature vectors to class labels
- Class probabilities $p(j \mid t)$: Probability estimates for each class in terminal node $t$
Hyperparameters¶
- Splitting criterion: Gini index, entropy, or misclassification error
- $N_{\min}$: Minimum cases required for node splitting
- $V$-fold cross-validation: Typically $V = 10$ for pruning parameter selection
- $\alpha$: Complexity parameter for cost-complexity pruning
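These hyperparameters can be tuned jointly with $V$-fold cross-validation; a sketch using scikit-learn's `GridSearchCV` on synthetic data (the grid values are illustrative, not recommendations):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, n_features=6, random_state=1)

# 10-fold CV over the splitting criterion, N_min, and alpha
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=1),
    param_grid={
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 10, 20],  # N_min
        "ccp_alpha": [0.0, 0.005, 0.01],   # complexity parameter
    },
    cv=10,
)
grid.fit(X, y)
print(grid.best_params_)
```

The fitted `grid.best_estimator_` is the classifier $d(x)$, and its terminal-node class frequencies supply the probabilities $p(j \mid t)$.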
References¶
Hastie, Trevor, et al. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed., Springer, 2009.
Loh, Wei-Yin, and Yu-Shan Shih. "Split Selection Methods for Classification Trees." Statistica Sinica, vol. 7, no. 4, 1997, pp. 815–840. JSTOR, www.jstor.org/stable/24306157.
Breiman, Leo, et al. Classification and Regression Trees. Routledge, 2011.
Loh, Wei-Yin, and Nunta Vanichsetakul. "Tree-Structured Classification via Generalized Discriminant Analysis." Journal of the American Statistical Association, vol. 83, no. 403, 1988, pp. 715–725. JSTOR, www.jstor.org/stable/2289295.
Loh, Wei-Yin. "Classification and Regression Trees." Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery, vol. 1, no. 1, 2011, pp. 14–23. https://doi.org/10.1002/widm.8.
Code¶
# ============================================================
# Install packages
# ============================================================
!pip install --quiet yfinance graphviz pandas numpy matplotlib seaborn scikit-learn IPython mlxtend
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlxtend.plotting import plot_decision_regions
warnings.filterwarnings("ignore")
# ============================================================
# 1) Load & Prepare Financial Data
# ============================================================
np.random.seed(42)
def load_financial_data(ticker="AAPL", start="2020-01-01", end="2022-12-31"):
    df = yf.download(ticker, start=start, end=end, auto_adjust=False)
    df["Return"] = df["Adj Close"].pct_change()
    df["MA5"] = df["Adj Close"].rolling(5).mean()
    df["MA20"] = df["Adj Close"].rolling(20).mean()
    df.dropna(inplace=True)
    df["Target"] = (df["Return"].shift(-1) > 0).astype(int)  # Next-day up (1) or down (0)
    return df
df = load_financial_data("AAPL", "2020-01-01", "2022-12-31")
features = ["Return", "MA5", "MA20"]
X, y = df[features], df["Target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# ============================================================
# 2) Exploratory Data Analysis
# ============================================================
sns.pairplot(pd.concat([X, y.rename("Target")], axis=1), hue="Target", corner=True)
plt.suptitle("Figure 1: Pairwise Feature Plot", y=1.02)
plt.show()
sns.heatmap(X.corr(), annot=True, cmap="coolwarm")
plt.title("Figure 2: Feature Correlation Heatmap")
plt.show()
# ============================================================
# 3) Decision Tree Variants (No / Pre / Post Pruning)
# ============================================================
# Base (no pruning, but controlled depth)
tree_no_prune = DecisionTreeClassifier(criterion="gini", random_state=42) #random_state=42 for reproducibility
tree_no_prune.fit(X_train, y_train)
# Pre-pruning
tree_pre = DecisionTreeClassifier(criterion="gini", max_depth=3, min_samples_leaf=10, random_state=42)
tree_pre.fit(X_train, y_train)
# Post-pruning (cost-complexity): derive candidate alphas from the unpruned tree
path = tree_no_prune.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
alpha_choice = ccp_alphas[len(ccp_alphas) // 2]  # illustrative mid-path alpha; in practice, select via cross-validation
tree_post = DecisionTreeClassifier(criterion="gini", max_depth=3, ccp_alpha=alpha_choice, random_state=42)
tree_post.fit(X_train, y_train)
# ============================================================
# 4) Visualize Trees Separately
# ============================================================
for title, model in [
    ("Figure 3: No Pruning (max_depth=None)", tree_no_prune),
    ("Figure 4: Pre-Pruning (max_depth=3, min_samples_leaf=10)", tree_pre),
    (f"Figure 5: Post-Pruning (ccp_alpha={alpha_choice:.3f})", tree_post),
]:
    plt.figure(figsize=(12, 6))
    plot_tree(model, filled=True, feature_names=X.columns, class_names=["Down", "Up"])
    plt.title(title)
    plt.show()
# ============================================================
# 5) Evaluation (Confusion Matrices Separately)
# ============================================================
for title, model in [
    ("Table 1: No Pruning (max_depth=None)", tree_no_prune),
    ("Table 2: Pre-Pruning (max_depth=3, min_samples_leaf=10)", tree_pre),
    (f"Table 3: Post-Pruning (ccp_alpha={alpha_choice:.3f})", tree_post),
]:
    y_pred = model.predict(X_test)
    print(f"{title}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Down", "Up"], yticklabels=["Down", "Up"])
    plt.title(f"Confusion Matrix - {title}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()
# ============================================================
# 6) Feature Importances
# ============================================================
importances = tree_no_prune.feature_importances_
sns.barplot(x=importances, y=features)
plt.title("Figure 6: Feature Importances - Decision Tree (No Pruning)")
plt.show()
# ============================================================
# 7) Decision Boundary (2 features only: Return & MA5)
# ============================================================
X_viz = X[["Return","MA5"]]
X_train_viz, X_test_viz, y_train_viz, y_test_viz = train_test_split(
    X_viz, y, test_size=0.3, random_state=42, stratify=y
)
tree_viz = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_viz.fit(X_train_viz, y_train_viz)
plt.figure(figsize=(6,5))
plot_decision_regions(X_test_viz.values, y_test_viz.values, clf=tree_viz, legend=2)
plt.xlabel("Return")
plt.ylabel("MA5")
plt.title("Figure 7: Decision Boundary (2 features only)")
plt.show()
Table 1: No Pruning (max_depth=none)
Accuracy: 0.5135135135135135
              precision    recall  f1-score   support

           0       0.50      0.47      0.49       108
           1       0.53      0.55      0.54       114

    accuracy                           0.51       222
   macro avg       0.51      0.51      0.51       222
weighted avg       0.51      0.51      0.51       222
Table 2: Pre-Pruning (max_depth=3, min_samples_leaf=10)
Accuracy: 0.5045045045045045
              precision    recall  f1-score   support

           0       0.47      0.15      0.23       108
           1       0.51      0.84      0.64       114

    accuracy                           0.50       222
   macro avg       0.49      0.50      0.43       222
weighted avg       0.49      0.50      0.44       222
Table 3: Post-Pruning (ccp_alpha=0.003)
Accuracy: 0.5045045045045045
              precision    recall  f1-score   support

           0       0.47      0.15      0.23       108
           1       0.51      0.84      0.64       114

    accuracy                           0.50       222
   macro avg       0.49      0.50      0.43       222
weighted avg       0.49      0.50      0.44       222
Interpretation of Results
We used a classification tree to predict the next-day price movement of a financial asset. Three financial indicators were used to predict whether the next day's stock price would go "Up" (class 1) or "Down" (class 0): daily percentage price change (Return), 5-day moving average (MA5), and 20-day moving average (MA20).
The Pairwise Feature Plot (Figure 1) and Feature Correlation Heatmap (Figure 2) show a very strong correlation (0.99) between MA5 and MA20, which is expected since both derive from the same price data; this collinearity does not affect decision trees, but it is worth noting. Return, by contrast, shows very little correlation with the other features, which is typical of financial market data.
A decision tree grown without constraints will aim for 100% training accuracy, producing the complex, overly specific model shown in the unpruned tree (Figure 3). Such a tree is difficult to interpret and likely memorized noise rather than underlying patterns, as confirmed by its low test accuracy of 51.4% (Table 1) and numerous misclassifications.
Pruning, whether pre-pruning with limited depth and a minimum number of samples per leaf (Figure 4) or post-pruning with a complexity parameter (Figure 5), creates a simpler, more robust, and interpretable tree, reflected in the decision boundary plot (Figure 7) with its simple rectangular regions. The pruned models achieve a test accuracy of 50.5% (Tables 2 and 3), slightly lower than the unpruned tree but far simpler and easier to interpret. The feature importance analysis (Figure 6) indicates that Return and MA5 were most influential for splits, while MA20 was less critical due to its correlation with MA5, showing that the model correctly identified the most useful variables.
STEP 3: TECHNICAL SECTION¶
Hyperparameter tuning represents the systematic process of optimizing the configuration settings of machine learning algorithms that are not learned from the data but must be specified beforehand. This process is fundamental to balancing the bias-variance tradeoff, ensuring that models capture genuine patterns in financial data without overfitting to noise (Belkin 15851). The mathematical foundation of this balance can be expressed through the expected prediction error decomposition: $$ \mathbb{E}\big[(y - \hat{y})^2\big] = \text{Bias}(\hat{y})^2 + \text{Var}(\hat{y}) + \sigma^2 $$ where $\sigma^2$ represents the irreducible error. Hyperparameter tuning directly minimizes the sum of the squared bias and variance terms, moving the model toward the optimal complexity for generalization.
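The decomposition can be estimated by simulation: refit a model on many resampled training sets and measure the offset and spread of its predictions. A sketch with a depth-limited regression tree on a synthetic sine signal (all values here are assumptions for illustration):

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
f = lambda x: np.sin(3 * x)               # true signal
x_test = np.linspace(0, 2, 50)[:, None]   # fixed evaluation grid

def bias_var(depth, n_rep=200, n=60, sigma=0.3):
    """Estimate squared bias and variance of a depth-limited tree."""
    preds = np.empty((n_rep, len(x_test)))
    for r in range(n_rep):
        x = rng.uniform(0, 2, size=(n, 1))
        y = f(x).ravel() + sigma * rng.normal(size=n)
        preds[r] = DecisionTreeRegressor(max_depth=depth).fit(x, y).predict(x_test)
    bias2 = np.mean((preds.mean(axis=0) - f(x_test).ravel()) ** 2)
    var = preds.var(axis=0).mean()
    return bias2, var

for d in (1, 3, 10):
    b2, v = bias_var(d)
    print(f"depth={d}: bias^2={b2:.3f}, var={v:.3f}")
```

Shallow trees show high bias and low variance; deep trees the reverse, which is exactly the trade-off the hyperparameter (here `max_depth`) navigates.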
The principal methodologies for this optimization are Grid Search, Random Search, and Bayesian Optimization. Grid Search, as employed in our Elastic Net model, performs an exhaustive search over a predefined set of hyperparameters, evaluating each combination typically using cross-validation to mitigate overfitting (Dezhkam 436). The objective is to minimize a loss function $L(\theta, \lambda)$ over parameters $\theta$ and hyperparameters $\lambda$: $$ \theta^*, \lambda^* = \arg\min_{\theta, \lambda} L(\theta, \lambda) $$ For models where the number of clusters or components is unknown, analytical methods like the elbow method or silhouette analysis are preferred. These methods evaluate intrinsic model quality without a direct prediction target.
In our Elastic Net implementation, two hyperparameters govern the regularization behavior. The Elastic Net penalty combines L1 (Lasso) and L2 (Ridge) penalties into the objective function: $$ \min_{\beta} \Big\{ \| y - X\beta \|_2^2 + \lambda \Big[ \frac{1-\rho}{2}\|\beta\|_2^2 + \rho \|\beta\|_1 \Big] \Big\} $$ Here, the hyperparameter $\alpha$ ($\lambda$) controls the overall strength of regularization, while l1_ratio ($\rho$) determines the mix between the two penalties. Our grid search over $\alpha \in [0.001, 10.0]$ and l1_ratio $\in [0, 1]$ identified the optimal combination that minimized cross-validated mean squared error, demonstrating how automated search efficiently navigates this two-dimensional space to enhance out-of-sample predictive performance, a critical requirement for factor investing strategies.
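A minimal sketch of this two-dimensional grid search on synthetic sparse data (the design matrix and grid values are illustrative, not our actual factor data):

```python
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Synthetic design with a sparse true coefficient vector
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 20))
beta = np.zeros(20)
beta[:3] = [1.5, -2.0, 1.0]
y = X @ beta + 0.1 * rng.normal(size=200)

# Grid over regularization strength (alpha) and L1/L2 mix (l1_ratio)
grid = GridSearchCV(
    ElasticNet(max_iter=10_000),
    param_grid={"alpha": [0.001, 0.01, 0.1, 1.0, 10.0],
                "l1_ratio": [0.1, 0.5, 0.9, 1.0]},
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X, y)
print(grid.best_params_)
```

With a strong, sparse signal and little noise, the search settles on light regularization; on noisier factor data the selected alpha rises accordingly.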
For unsupervised learning models like K-Means Clustering, the primary hyperparameter is n_clusters ($k$). The algorithm aims to minimize inertia, which is the sum of squared distances between samples and their nearest cluster centroid: $$ \min_{C} \sum_{k=1}^{K} \sum_{x_i \in C_k} \| x_i - \mu_k \|^2 $$ However, minimizing inertia alone leads to more clusters until each point is its own cluster. Therefore, we employed silhouette analysis, which measures how similar an object is to its own cluster compared to other clusters. The silhouette score for a single sample is given by: $$ s(i) = \frac{b(i) - a(i)}{\max \{a(i), b(i)\}} $$ where $a(i)$ is the mean intra-cluster distance and $b(i)$ is the mean nearest-cluster distance. By selecting the $k$ that maximizes the average silhouette score, we obtained a data-driven estimate of the natural number of market regimes present in the data, a technique with applications in portfolio regime detection.
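A sketch of silhouette-based selection of $k$ on synthetic, well-separated data (the blob centers are assumptions standing in for market regimes):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Three well-separated synthetic "regimes"
X, _ = make_blobs(n_samples=300, centers=[[0, 0], [8, 8], [-8, 8]],
                  cluster_std=0.8, random_state=0)

scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
print(best_k)  # 3: the silhouette score peaks at the true number of groups
```

Unlike inertia, which keeps falling as $k$ grows, the silhouette score penalizes splitting a natural group, so its maximum provides a defensible choice of $k$.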
Classification Trees introduce a distinct set of hyperparameters focused on controlling tree complexity through pruning. Pre-pruning parameters like max_depth and min_samples_leaf restrict growth during training. Post-pruning, specifically Cost-Complexity Pruning (CCP), uses the hyperparameter ccp_alpha to penalize tree complexity. The cost-complexity function is:
$$ R_{\alpha}(T) = R(T) + \alpha |\tilde{T}| $$
where $R(T)$ is the total misclassification error of the tree $T$, $|\tilde{T}|$ is the number of leaf nodes, and $\alpha$ is the complexity parameter. A sequence of $\alpha$ values is generated, and the optimal subtree is selected via cross-validation, effectively pruning away branches that provide the least predictive power per leaf node. This process enhances the tree's generalizability, which is vital for creating robust trading signals from technical indicators.
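This selection procedure can be sketched with scikit-learn's `cost_complexity_pruning_path` plus cross-validation (the synthetic data stands in for real trading features):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=400, n_features=8, random_state=2)

# Alpha sequence from growing the full tree, then CV to pick the best subtree
path = DecisionTreeClassifier(random_state=2).cost_complexity_pruning_path(X, y)
cv_scores = [
    cross_val_score(DecisionTreeClassifier(ccp_alpha=a, random_state=2),
                    X, y, cv=5).mean()
    for a in path.ccp_alphas
]
best_alpha = path.ccp_alphas[int(np.argmax(cv_scores))]
print(best_alpha)
```

Each alpha in the sequence corresponds to one nested subtree; the cross-validated score identifies the subtree that trades complexity for generalization most favorably.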
References¶
Belkin, Mikhail, et al. "Reconciling modern machine-learning practice and the classical bias–variance trade-off." Proceedings of the National Academy of Sciences 116.32 (2019): 15849-15854. https://doi.org/10.1073/pnas.1903070116
Dezhkam, Arsalan, et al. "A Bayesian-based classification framework for financial time series trend prediction." The Journal of supercomputing 79.4 (2023): 4622-4659. https://doi.org/10.1007/s11227-022-04834-4
STEP 4: MARKETING ALPHA¶
In the relentless pursuit of alpha, the ability to distinguish genuine signal from pervasive market noise represents the ultimate competitive edge. Traditional quantitative models often falter when confronted with high-dimensional, non-linear, and dynamically evolving financial data. Machine learning methodologies such as Elastic Net regression, K-Means clustering, and Classification Trees systematically overcome these challenges to unlock persistent alpha generation. These techniques form the foundation for building more resilient, adaptive, and profitable investment processes that thrive in complex data landscapes characterized by high-dimensional factor spaces, shifting market regimes, and the critical need for transparent, actionable decisions.
The fundamental challenge in quantitative finance can be framed as an optimization problem under extreme uncertainty. We seek a predictive function $f(X)$ that maps a high-dimensional set of features $X$ (e.g., factors, prices, volumes) to a target $y$ (e.g., future returns) while maximizing the information ratio and minimizing exposure to spurious correlations. Machine learning excels by introducing structured regularization and adaptive learning into this search. The Elastic Net model directly addresses the "curse of dimensionality" and multicollinearity—common pitfalls in factor investing—through its hybrid penalty term: $$ \min_{\beta} \left\{ \|y - X\beta\|^2_2 + \lambda \left[ \frac{(1-\rho)}{2} \|\beta\|^2_2 + \rho \|\beta\|_1 \right] \right\} $$
This approach simultaneously performs feature selection (via the L1 norm) and coefficient shrinkage (via the L2 norm), ensuring resulting portfolio strategies are both sparse and robust. Our analysis demonstrated concrete advantages: properly tuned Elastic Net achieved near-perfect prediction accuracy ($R^2 = 1.0$) with a 99.9% reduction in test Mean Squared Error compared to baseline models, while correctly identifying true predictive factors from noisy candidates.
Beyond supervised prediction, machine learning provides powerful tools for market regime identification (Akioyamen, Tang, and Hussien 2-3). K-Means Clustering offers an unsupervised approach to segment market environments by minimizing within-cluster variance: $$ \min_{C} \sum_{k=1}^K \sum_{x_i \in C_k} \|x_i - \mu_k\|^2 $$ Our application to 17 diversified stocks across multiple sectors revealed two distinct regimes with clear financial implications. Cluster 0 (295 observations) represented a "risk-on" environment with higher volatility (0.0244) and returns (0.00074), while Cluster 1 (651 observations) constituted a "risk-off" regime with lower volatility (0.0196) and returns (0.00037). This empirically validated partitioning enables dynamic portfolio adjustment—increasing exposure during risk-on periods for alpha generation and reducing risk during risk-off periods for drawdown protection.
For tactical decision-making, interpretability remains as crucial as predictive power. Classification Trees provide a transparent, rule-based framework built through recursive partitioning of feature space to maximize information gain (Gajowniczek and Ząbkowski 8-9). The resulting "if-then" rules offer white-box logic that portfolio managers can understand and trust, while naturally capturing non-linear relationships and interaction effects that linear models miss. Pruning techniques optimize tree complexity, enhancing out-of-sample performance while maintaining intelligibility.
The true alpha generation emerges from strategic integration of these techniques into a cohesive analytical pipeline. A comprehensive investment management system begins with continuous regime detection (K-Means) to identify market environments, applies rigorous factor selection (Elastic Net) specific to each regime, and transforms optimized signals into executable rules (Classification Trees). This integrated approach systematically addresses dimensionality through intelligent factor selection, adapts to non-stationarity through regime detection, and maintains transparency through interpretable decision rules.
Team Alpha's methodology transforms theoretical models into practical alpha generation, offering more than just algorithms—we deliver a methodological edge backed by empirical validation. The demonstrated results include robust signal identification (99.9% MSE improvement), dynamic regime-aware strategies, and transparent decision frameworks. By embracing this ML-driven framework, investment managers gain enhanced alpha generation through more robust signals, improved risk management via dynamic exposure adjustment, and operational efficiency through automated analysis and interpretable strategies. In an increasingly competitive landscape that favors those who systematically harness technology and data, Team Alpha's demonstrated ability to identify meaningful market regimes, optimize predictive factors, and create actionable decision rules positions us to deliver sustainable competitive advantage, transforming complex data into consistent alpha generation.
Reference¶
Akioyamen, Peter, Yi Zhou Tang, and Hussien Hussien. "A hybrid learning approach to detecting regime switches in financial markets." Proceedings of the First ACM International Conference on AI in Finance. 2020. https://doi.org/10.1145/3383455.3422521
Gajowniczek, Krzysztof, and Tomasz Ząbkowski. "Interactive decision tree learning and decision rule extraction based on the ImbTreeEntropy and ImbTreeAUC packages." Processes 9.7 (2021): 1107. https://doi.org/10.3390/pr9071107
STEP 5: LEARN MORE¶
The following references provide foundational and advanced insights into the machine learning methodologies explored in this report. This curated list emphasizes theoretical foundations, financial applications, and empirical validations of machine learning's strengths in quantitative finance.
Journal Articles¶
Elastic Net Regression
- Gu, Shihao, Bryan Kelly, and Dacheng Xiu. "Empirical Asset Pricing via Machine Learning." The Review of Financial Studies, vol. 33, no. 5, May 2020, pp. 2223–2273. DOI.org (Crossref), https://doi.org/10.1093/rfs/hhaa009.
K-Means Clustering
- Boloș, Mihaela-Ioana, et al. "K-Means Clustering for Portfolio Optimization: Symmetry in Risk–Return Tradeoff, Liquidity, Profitability, and Solvency." Symmetry, vol. 17, no. 6, 29 May 2025, article no. 847, doi:10.3390/sym17060847.
- Mba, Jules Clement, and Ehounou Serge Eloge Florentin Angaman. "A K-Means Classification and Entropy Pooling Portfolio Strategy for Small and Large Capitalization Cryptocurrencies." Entropy, vol. 25, no. 8, 14 Aug. 2023, article no. 1208, doi:10.3390/e25081208.
Classification Trees
- Loh, Wei-Yin, and Yu-Shan Shih. "Split Selection Methods for Classification Trees." Statistica Sinica, vol. 7, no. 4, 1997, pp. 815–840. JSTOR, www.jstor.org/stable/24306157.
- Loh, Wei-Yin, and Nunta Vanichsetakul. "Tree-Structured Classification via Generalized Discriminant Analysis." Journal of the American Statistical Association, vol. 83, no. 403, 1988, pp. 715–725. JSTOR, www.jstor.org/stable/2289295.
- Loh, Wei-Yin. "Classification and Regression Trees." Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery, vol. 1, no. 1, 2011, pp. 14–23. https://doi.org/10.1002/widm.8.
Textbooks¶
Hastie, Trevor, et al. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed., Springer, 2009.
James, Gareth, et al. An Introduction to Statistical Learning: with Applications in R. Springer, 2013.
Breiman, Leo, et al. Classification and Regression Trees. Routledge, 2011.
Conclusion¶
This project provides a valuable handbook for quantitative finance practitioners, demonstrating the practical application of machine learning methodologies in developing robust trading strategies.
The Elastic Net regression, a penalized linear method, offers a robust solution for feature selection in high-dimensional and multicollinear environments, providing a parsimonious and directly interpretable model at the cost of potentially overlooking non-linear relationships. In contrast, K-Means clustering, an unsupervised technique, excels at uncovering latent data structures and defining market regimes without prior assumptions, though its performance is highly sensitive to the initial choice of centroids and the curse of dimensionality, and its clusters require careful ex-post interpretation. Finally, Classification Trees provide a highly intuitive, non-parametric approach that naturally models non-linearities and interactions, offering clear, rule-based explanations for its predictions; however, this method is prone to overfitting and can be unstable, where small changes in the data can lead to significantly different tree structures, necessitating careful pruning and validation. The selection of a given method is therefore not a matter of outright superiority, but rather a strategic decision that balances the need for a transparent, explainable model against the potential predictive gains of more flexible, but opaque, algorithms.
Full references¶
Akioyamen, Peter, Yi Zhou Tang, and Hussien Hussien. "A Hybrid Learning Approach to Detecting Regime Switches in Financial Markets." Proceedings of the First ACM International Conference on AI in Finance (2020). https://doi.org/10.1145/3383455.3422521
Belkin, Mikhail, et al. "Reconciling Modern Machine-Learning Practice and the Classical Bias–Variance Trade-Off." Proceedings of the National Academy of Sciences 116.32 (2019): 15849-15854. https://doi.org/10.1073/pnas.1903070116
Breiman, Leo, et al. Classification and Regression Trees. Routledge, 2011.
Dezhkam, Arsalan, et al. "A Bayesian-Based Classification Framework for Financial Time Series Trend Prediction." The Journal of Supercomputing 79.4 (2023): 4622-4659. https://doi.org/10.1007/s11227-022-04834-4
Fan, Jianqing, Jinchi Lv, and Lei Qi. "Sparse High-Dimensional Models in Economics." Annual Review of Economics 3 (2011): 291-317. https://doi.org/10.1146/annurev-economics-061109-080451
Gajowniczek, Krzysztof, and Tomasz Ząbkowski. "Interactive Decision Tree Learning and Decision Rule Extraction Based on the ImbTreeEntropy and ImbTreeAUC Packages." Processes 9.7 (2021): 1107. https://doi.org/10.3390/pr9071107
Giannone, Domenico, Michele Lenza, and Giorgio E. Primiceri. "Economic Predictions with Big Data: The Illusion of Sparsity." Econometrica 89.5 (2021): 2409-2437. https://doi.org/10.3982/ECTA16935
Gu, Shihao, Bryan Kelly, and Dacheng Xiu. "Empirical Asset Pricing via Machine Learning." The Review of Financial Studies 33.5 (2020): 2223-2273. https://doi.org/10.1093/rfs/hhaa009
Hastie, Trevor, Robert Tibshirani, and Martin Wainwright. Statistical Learning with Sparsity: The Lasso and Generalizations. Chapman and Hall/CRC, 2015.
Hastie, Trevor, et al. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed., Springer, 2009.
James, Gareth, et al. An Introduction to Statistical Learning: With Applications in R. Springer, 2013.
Kaufman, Leonard, and Peter J. Rousseeuw. Finding Groups in Data: An Introduction to Cluster Analysis. John Wiley & Sons, 1990.
Loh, Wei-Yin. "Classification and Regression Trees." Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery 1.1 (2011): 14-23. https://doi.org/10.1002/widm.8
Loh, Wei-Yin, and Yu-Shan Shih. "Split Selection Methods for Classification Trees." Statistica Sinica 7.4 (1997): 815-840.
López de Prado, Marcos. Advances in Financial Machine Learning. Wiley, 2018.
Zou, Hui, and Trevor Hastie. "Regularization and Variable Selection via the Elastic Net." Journal of the Royal Statistical Society: Series B (Statistical Methodology) 67.2 (2005): 301-320. https://doi.org/10.1111/j.1467-9868.2005.00503.x
Website¶
StatQuest. "Video Index." https://statquest.org/video-index/