Introduction¶
Portfolio construction is a central task in investment management, requiring a careful balance between expected return and risk while accounting for real-world investment constraints. Modern portfolio theory provides a quantitative framework for identifying optimal asset allocations by exploiting diversification benefits and the statistical properties of asset returns. Nevertheless, the portfolios implied by theory often diverge from those implemented in practice because of regulatory, institutional, and client-specific restrictions. This study applies mean–variance portfolio optimization to a set of ten large-capitalization U.S. equities drawn from multiple sectors of the economy. Using historical return data, optimal portfolio weights are obtained under realistic constraints, including long-only positions and limits on individual asset weights. The study further examines how additional constraints affect portfolio efficiency, the shape of the efficient frontier, and realized out-of-sample performance. Beyond portfolio construction, the study assesses the risk characteristics of the chosen portfolio through its exposure to systematic risk factors using the Fama–French Five-Factor Model. Finally, the analysis explores whether simulation-based approaches can approximate optimization-based solutions under several constraint settings. Together, the results offer insight into the trade-offs between theoretical optimality and practical portfolio implementation in real-world investment settings.
Part 1¶
To solve Step 1 we selected 10 assets at weekly frequency, dividing the sample into a training set (in-sample) and a test set (out-of-sample), and then computed the annualized mean returns ($\mu$) and covariance matrix ($\Sigma$). In Steps 1 and 2 we used Sequential Least Squares Programming (SLSQP) to find the weights ($w$) that maximize the Sharpe ratio
$$\text{Max } S_p = \frac{w^T \mu - R_f}{\sqrt{w^T \Sigma w}}, \quad \text{subject to } \sum w_i = 1, \quad 0 \le w_i \le 1 \text{ in Step 1}, \quad \text{and } w_i \le 0.15 \text{ in Step 2}$$
In Step 3 we constructed the efficient frontier by minimizing the portfolio volatility $$\sigma_p = \sqrt{w^T \Sigma w}.$$
In Step 4 we computed the cumulative returns and volatility by applying the optimal weights found during the optimization process.
%pip install yfinance pandas numpy matplotlib seaborn scipy statsmodels --quiet
import yfinance as yf
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
sns.set(style="whitegrid")
np.random.seed(42)
# DATA COLLECTION (WEEKLY)
assets = ["TSLA", "WMT", "BAC", "GS", "LLY", "MRK", "GOOG", "META", "AAPL", "XOM"]
data = yf.download(assets, start="2023-07-01", end="2025-09-30", interval="1wk", progress=False)
prices = data['Close']
train_data = prices.loc['2023-07-01':'2025-06-30']
test_data = prices.loc['2025-07-01':'2025-09-30']
returns_train = train_data.pct_change().dropna()
mu = returns_train.mean() * 52
cov = returns_train.cov() * 52
risk_free_rate = 0.04
def portfolio_performance(weights, mu, cov):
p_ret = np.dot(weights, mu)
p_vol = np.sqrt(np.dot(weights.T, np.dot(cov, weights)))
return p_ret, p_vol
def negative_sharpe(weights, mu, cov, rf):
p_ret, p_vol = portfolio_performance(weights, mu, cov)
return -(p_ret - rf) / p_vol
cons_sum = {'type': 'eq', 'fun': lambda x: np.sum(x) - 1}
n = len(assets)
w0 = np.ones(n) / n
bounds_step1 = tuple((0, 1) for _ in range(n))
opt_step1 = minimize(negative_sharpe, w0, args=(mu, cov, risk_free_rate),
method='SLSQP', bounds=bounds_step1, constraints=cons_sum)
weights_1 = pd.Series(opt_step1.x, index=assets)
# STEP 2: Capped at 15%
bounds_step2 = tuple((0, 0.15) for _ in range(n))
opt_step2 = minimize(negative_sharpe, w0, args=(mu, cov, risk_free_rate),
method='SLSQP', bounds=bounds_step2, constraints=cons_sum)
weights_2 = pd.Series(opt_step2.x, index=assets)
print(" Table 1: Weights (Max Sharpe)")
print(weights_1.round(4))
Table 1: Weights (Max Sharpe)

| Asset | Weight |
|---|---|
| TSLA | 0.0000 |
| WMT | 0.0000 |
| BAC | 0.0000 |
| GS | 0.3076 |
| LLY | 0.1067 |
| MRK | 0.1485 |
| GOOG | 0.0000 |
| META | 0.0000 |
| AAPL | 0.4373 |
| XOM | 0.0000 |
Under the constraints that portfolio weights sum to one and that no short selling is permitted, the optimal mean–variance portfolio was obtained using constrained quadratic optimization. The resulting allocation assigns strictly non-negative weights to all assets, confirming that the long-only constraint is satisfied. The optimizer assigns higher weights to assets that contribute more efficiently to risk reduction through lower volatility and favorable covariance relationships, rather than on the basis of expected returns alone (Uykun). Assets with higher volatility or stronger positive correlation with the rest of the portfolio receive relatively smaller allocations. This portfolio represents the maximum-Sharpe allocation attainable under realistic long-only investment constraints.
print("\n Table 2: Weights (Max 15%)")
print(weights_2.round(4))
Table 2: Weights (Max 15%)

| Asset | Weight |
|---|---|
| TSLA | 0.00 |
| WMT | 0.15 |
| BAC | 0.15 |
| GS | 0.15 |
| LLY | 0.15 |
| MRK | 0.15 |
| GOOG | 0.00 |
| META | 0.00 |
| AAPL | 0.15 |
| XOM | 0.10 |
Once an upper bound of 15% is imposed on each asset, the optimal portfolio allocation changes materially. Assets that previously attracted larger allocations become constrained at the maximum allowable weight, and the excess allocation is redistributed among the remaining assets. This additional constraint reduces portfolio concentration risk and leads to a more evenly distributed allocation across securities. However, the restriction also limits the optimizer's ability to fully exploit the assets most effective at variance reduction, implying a potential increase in overall portfolio risk relative to the unconstrained long-only case (Al Janabi, 257-303).
# STEP 3: EFFICIENT FRONTIER
def get_efficient_frontier(mu, cov, bounds, num_points=100):
target_rets = np.linspace(mu.min(), mu.max(), num_points)
vols = []
rets_clean = []
for r in target_rets:
cons = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1},
{'type': 'eq', 'fun': lambda x: np.dot(x, mu) - r})
res = minimize(lambda w: np.sqrt(np.dot(w.T, np.dot(cov, w))), w0,
bounds=bounds, constraints=cons, method='SLSQP')
if res.success:
vols.append(res.fun)
rets_clean.append(r)
return rets_clean, vols
ef1_ret, ef1_vol = get_efficient_frontier(mu, cov, bounds_step1)
ef2_ret, ef2_vol = get_efficient_frontier(mu, cov, bounds_step2)
plt.figure(figsize=(10, 6))
plt.plot(ef1_vol, ef1_ret, 'b-', label='Unconstrained Frontier')
plt.plot(ef2_vol, ef2_ret, 'r--', label='15% Cap Frontier')
plt.title("Figure 1: Efficient Frontiers Comparison (Weekly Data)")
plt.xlabel("Annualized Volatility")
plt.ylabel("Expected Return")
plt.legend()
plt.show()
The efficient frontier obtained under the 15% maximum-weight constraint (Figure 1) lies strictly inside the unconstrained long-only efficient frontier. For any given level of portfolio risk, the unconstrained frontier offers a higher expected return, and for any given expected return, it attains a lower level of risk. This shift arises because the additional upper-bound constraint shrinks the feasible investment set. By limiting portfolio concentration, the constraint prevents the optimizer from reaching the most efficient risk–return combinations available under fewer restrictions (Lehalle et al., 443-463). The comparison illustrates the classic trade-off between diversification rules and mean–variance efficiency.
# STEP 4: OUT-OF-SAMPLE
returns_test = test_data.pct_change().dropna()
portfolio_ret_1 = returns_test.dot(weights_1)
portfolio_ret_2 = returns_test.dot(weights_2)
performance = pd.DataFrame({
"Step 1 (Unconstrained)": portfolio_ret_1,
"Step 2 (Max 15%)": portfolio_ret_2
})
wealth_1 = (1 + performance["Step 1 (Unconstrained)"]).cumprod() * 100
wealth_2 = (1 + performance["Step 2 (Max 15%)"]).cumprod() * 100
plt.figure(figsize=(10, 5))
plt.plot(wealth_1, label='Step 1 Portfolio')
plt.plot(wealth_2, label='Step 2 Portfolio (15% Cap)', linestyle='--')
plt.title("Figure 2: Out-of-Sample Performance (Jul-Sep 2025)")
plt.ylabel("Cumulative Wealth")
plt.legend()
plt.show()
summary_stats = pd.DataFrame({
"Total Return (3 Months)": (1 + performance).prod() - 1,
"Weekly Volatility": performance.std(),
"Annualized Volatility": performance.std() * np.sqrt(52)
})
print("\n Table 3: Out-of-Sample Performance Summary")
display(summary_stats.style.format("{:.2%}"))
Table 3: Out-of-Sample Performance Summary
| | Total Return (3 Months) | Weekly Volatility | Annualized Volatility |
|---|---|---|---|
| Step 1 (Unconstrained) | 13.64% | 2.35% | 16.92% |
| Step 2 (Max 15%) | 8.42% | 1.87% | 13.49% |
Over the out-of-sample period from July 1 to September 30, 2025, the unconstrained long-only portfolio displays higher variability in realized returns, reflecting its greater concentration in certain assets. The portfolio subject to the 15% maximum-weight constraint shows lower realized volatility and steadier performance, though its cumulative return is somewhat lower. These findings highlight the practical implications of portfolio constraints: while the unconstrained portfolio is theoretically more efficient in-sample, the constrained portfolio offers better diversification and robustness to asset-specific shocks (Etienne). This suggests that additional constraints, although costly in terms of mean–variance efficiency, may improve real-world performance stability.
Part 2¶
To solve this step we continue using the data from Part 1, so make sure the code from Part 1 has been run.
Step 4: Model and factors¶
To explain the portfolio's excess returns ($R_p - R_f$), we used the five-factor model proposed by Fama and French (12), equation 6, which is an extension of the three-factor model proposed by Fama and French (27) in 1993 (equation 4 in Fama and French (12)).
$$E[R_p] - R_f = \alpha + \beta_{Mkt}(R_m - R_f) + \beta_{SMB}SMB + \beta_{HML}HML + \beta_{RMW}RMW + \beta_{CMA}CMA + \epsilon,$$
where:
$E[R_p] - R_f$ is the portfolio's excess return over the risk-free rate.
$\alpha$ is the intercept, i.e., the abnormal return not explained by the factors; if it is positive and statistically significant, it suggests risk-adjusted outperformance.
$R_m - R_f$ is the market return in excess of the risk-free rate.
SMB (Small Minus Big) refers to the historical returns of small companies relative to big companies.
HML (High Minus Low) measures the return difference between high and low book-to-market stocks.
RMW (Robust Minus Weak) measures the return difference between highly profitable companies and companies with weak profitability.
CMA (Conservative Minus Aggressive) measures the return difference between companies that invest conservatively and those that do not (Fama and French 2-3).
import pandas_datareader.data as web
import statsmodels.api as sm
from pandas_datareader._utils import RemoteDataError
import warnings
warnings.filterwarnings("ignore")
portfolio_returns_train = returns_train.dot(weights_2)
portfolio_returns_train.name = "Portfolio_Return"
try:
ff_data_daily = web.DataReader('F-F_Research_Data_5_Factors_2x3_daily',
'famafrench',
start='2023-07-01',
end='2025-06-30')[0]
except RemoteDataError:
    print("Check your internet or try again later.")
    raise  # without the factor data the rest of this cell cannot run
ff_data_daily = ff_data_daily / 100
ff_data_weekly = ff_data_daily.resample('W').apply(lambda x: (1 + x).prod() - 1)
ff_data_weekly = ff_data_weekly.reindex(portfolio_returns_train.index, method='nearest')
regression_data = pd.concat([portfolio_returns_train, ff_data_weekly], axis=1).dropna()
if len(regression_data) == 0:
print("Data intersection is still empty. Check date ranges.")
else:
print(f"Data successfully aligned! Rows: {len(regression_data)}")
Y = regression_data['Portfolio_Return'] - regression_data['RF']
X = regression_data[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']]
X = sm.add_constant(X)
# STEP 4
print("\n Table 4: Factor Correlation Matrix")
corr_matrix = X.drop('const', axis=1).corr()
display(corr_matrix.round(4))
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Figure 3: Fama-French 5-Factor Correlation Matrix")
plt.show()
Data successfully aligned! Rows: 104

Table 4: Factor Correlation Matrix
| | Mkt-RF | SMB | HML | RMW | CMA |
|---|---|---|---|---|---|
| Mkt-RF | 1.0000 | 0.1601 | -0.2485 | -0.4911 | 0.1029 |
| SMB | 0.1601 | 1.0000 | 0.4162 | -0.5048 | 0.2327 |
| HML | -0.2485 | 0.4162 | 1.0000 | 0.0207 | 0.2150 |
| RMW | -0.4911 | -0.5048 | 0.0207 | 1.0000 | -0.2780 |
| CMA | 0.1029 | 0.2327 | 0.2150 | -0.2780 | 1.0000 |
Table 4 and Figure 3 show the correlation matrix of the five factors. The only moderate correlation is negative (-0.49), between the market factor (Mkt-RF) and profitability (RMW). The remaining correlations are low, indicating no serious multicollinearity among the variables used in the Fama–French regression. This implies that all five factors can be used simultaneously, since each provides distinct information about risk.
Step 5¶
To solve Step 5 we used the weights ($w$) calculated in Part 1 to produce a time series of weekly portfolio returns. The Fama–French factors (daily frequency) were converted to weekly frequency and aligned to the portfolio dates using nearest-neighbor matching for consistency between the time series.
Ordinary least squares¶
As required by the project, we fitted an ordinary least squares (OLS) model, which consists of finding the $\beta$ coefficients that minimize the sum of squared residuals:
$$\min_{\beta} \sum_{t=1}^{N} \epsilon_t^2 = \min_{\beta} \sum_{t=1}^{N} \left( Y_t - (\alpha + \beta_1 X_{1,t} + \dots + \beta_5 X_{5,t}) \right)^2, $$
where $Y_t$ is the portfolio's excess return and $X_{i,t}$ are the five risk factors described above.
Because this objective squares the errors, outliers carry disproportionately large weight in the cost function being minimized. Such disproportionately large errors can distort the $\beta$ coefficients when the residual distribution is not Gaussian (Huber 155).
Robust estimation¶
To reduce OLS's sensitivity to extreme market events, we fitted a Robust Linear Model (RLM) based on Huber. Huber's estimator minimizes a loss function $\rho$ that is less sensitive to outliers (Huber 162):
$$\min_{\beta} \sum_{t=1}^{N} \rho \left( \frac{\epsilon_t}{\hat{\sigma}} \right)$$
We used Huber's loss function, defined as:
$$\rho(z) = \begin{cases} \frac{1}{2}z^2 & \text{if } |z| \leq k \\ k|z| - \frac{1}{2}k^2 & \text{if } |z| > k \end{cases},$$
where $k$ is the cutoff. When the error is small the loss coincides with OLS; when the error is an outlier (above $k$) the penalty grows only linearly, reducing its impact on the $\beta$ estimates (Huber 177).
%pip install statsmodels --quiet
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess
# OLS Regression
model_ols = sm.OLS(Y, X).fit()
# Robust Regression (Huber-T)
model_robust = sm.RLM(Y, X, M=sm.robust.norms.HuberT()).fit()
print(" Table 5: OLS regression results")
print(model_ols.summary())
print("\n" + "="*80 + "\n")
print(" Table 6: Robust regression results")
print(model_robust.summary())
print(" Table 7: Comparative differences analysis between OLS vs Robust")
diff_df = pd.DataFrame({
'OLS Coef': model_ols.params,
'Robust Coef': model_robust.params,
'Diff (Abs)': np.abs(model_ols.params - model_robust.params),
'Diff (%)': np.abs((model_ols.params - model_robust.params) / model_robust.params) * 100
})
display(diff_df.round(4))
def create_diagnostic_plots(model, ax_row, model_name):
"""
Q-Q Plot, Worm Plot and Scale-Location.
"""
if isinstance(model.model, sm.RLM):
resid = model.resid
scale = model.scale
if scale < 1e-10: scale = 1.0
std_resid = resid / scale
else:
influence = model.get_influence()
std_resid = influence.resid_studentized_internal
fitted = model.fittedvalues
n = len(std_resid)
sm.qqplot(std_resid, line='45', fit=True, ax=ax_row[0], alpha=0.6)
ax_row[0].set_title(f"{model_name}: Normal Q-Q Plot")
ax_row[0].set_ylabel("Standardized Residuals")
ax_row[0].grid(True, alpha=0.3)
(osm, osr) = stats.probplot(std_resid, dist="norm", fit=False)
worm_y = osr - osm
p_values = stats.norm.cdf(osm)
pdf_values = stats.norm.pdf(osm)
se = (1 / pdf_values) * np.sqrt(p_values * (1 - p_values) / n)
upper_bound = 1.96 * se
lower_bound = -1.96 * se
ax_row[1].scatter(osm, worm_y, alpha=0.6, edgecolors='b', label='Residuals')
ax_row[1].plot(osm, upper_bound, 'k--', alpha=0.7, linewidth=1, label='95% CI')
ax_row[1].plot(osm, lower_bound, 'k--', alpha=0.7, linewidth=1)
ax_row[1].axhline(0, color='red', linestyle='-', linewidth=1)
ax_row[1].fill_between(osm, lower_bound, upper_bound, color='gray', alpha=0.1)
try:
smooth = lowess(worm_y, osm, frac=0.4)
ax_row[1].plot(smooth[:, 0], smooth[:, 1], color='orange', linewidth=2, label='Trend')
except Exception:
pass
ax_row[1].set_title(f"{model_name}: Worm Plot (Detrended)")
ax_row[1].set_xlabel("Theoretical Quantiles")
ax_row[1].set_ylabel("Deviation")
ax_row[1].legend(loc='lower right', fontsize='small')
ax_row[1].grid(True, alpha=0.3)
sqrt_abs_resid = np.sqrt(np.abs(std_resid))
ax_row[2].scatter(fitted, sqrt_abs_resid, alpha=0.6)
try:
smooth_sl = lowess(sqrt_abs_resid, fitted, frac=0.4)
ax_row[2].plot(smooth_sl[:, 0], smooth_sl[:, 1], color='red')
except Exception:
pass
ax_row[2].set_title(f"{model_name}: Scale-Location")
ax_row[2].set_xlabel("Fitted Values")
ax_row[2].set_ylabel(r"$\sqrt{|Std\ Residuals|}$")
ax_row[2].grid(True, alpha=0.3)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
create_diagnostic_plots(model_ols, axes[0], "OLS")
create_diagnostic_plots(model_robust, axes[1], "Robust (Huber-T)")
fig.suptitle("Figure 4: Residuals diagnostics", fontsize=16, y=1.02)
plt.tight_layout()
plt.show()
Table 5: OLS regression results
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.087
Model: OLS Adj. R-squared: 0.040
Method: Least Squares F-statistic: 1.859
Date: Thu, 12 Feb 2026 Prob (F-statistic): 0.109
Time: 14:33:46 Log-Likelihood: 250.56
No. Observations: 104 AIC: -489.1
Df Residuals: 98 BIC: -473.2
Df Model: 5
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0040 0.002 1.756 0.082 -0.001 0.009
Mkt-RF -0.1743 0.117 -1.486 0.140 -0.407 0.058
SMB -0.1916 0.178 -1.074 0.286 -0.546 0.163
HML -0.1964 0.166 -1.183 0.240 -0.526 0.133
RMW -0.6373 0.263 -2.419 0.017 -1.160 -0.115
CMA 0.0784 0.252 0.311 0.756 -0.421 0.578
==============================================================================
Omnibus: 28.591 Durbin-Watson: 2.040
Prob(Omnibus): 0.000 Jarque-Bera (JB): 80.827
Skew: -0.926 Prob(JB): 2.81e-18
Kurtosis: 6.902 Cond. No. 138.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
================================================================================
Table 6: Robust regression results
Robust linear Model Regression Results
==============================================================================
Dep. Variable: y No. Observations: 104
Model: RLM Df Residuals: 98
Method: IRLS Df Model: 5
Norm: HuberT
Scale Est.: mad
Cov Type: H1
Date: Thu, 12 Feb 2026
Time: 14:33:46
No. Iterations: 30
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 0.0047 0.002 2.309 0.021 0.001 0.009
Mkt-RF -0.1389 0.104 -1.334 0.182 -0.343 0.065
SMB -0.2212 0.158 -1.396 0.163 -0.532 0.089
HML -0.1260 0.147 -0.854 0.393 -0.415 0.163
RMW -0.4935 0.234 -2.110 0.035 -0.952 -0.035
CMA 0.0791 0.224 0.354 0.724 -0.359 0.517
==============================================================================
If the model instance has been used for another fit with different fit parameters, then the fit options might not be the correct ones anymore .
Table 7: Comparative differences analysis between OLS vs Robust
| | OLS Coef | Robust Coef | Diff (Abs) | Diff (%) |
|---|---|---|---|---|
| const | 0.0040 | 0.0047 | 0.0007 | 14.3665 |
| Mkt-RF | -0.1743 | -0.1389 | 0.0354 | 25.4951 |
| SMB | -0.1916 | -0.2212 | 0.0296 | 13.3954 |
| HML | -0.1964 | -0.1260 | 0.0705 | 55.9430 |
| RMW | -0.6373 | -0.4935 | 0.1438 | 29.1282 |
| CMA | 0.0784 | 0.0791 | 0.0007 | 0.8812 |
Step 5¶
The analysis begins by checking for multicollinearity using the correlation matrix and heat map (Figure 3). The factors are broadly independent, except for a moderate negative correlation (-0.49) between market risk (Mkt-RF) and the profitability factor (RMW), which does not invalidate the multivariate model.
Examining the OLS estimates (Table 5), we find problems with residual normality, confirmed by the Jarque–Bera test. To avoid reporting biased estimates, we therefore rely on the robust regression estimates (Table 6). In Figure 4, the Q-Q plot and the worm plot show a heavy lower tail (some points fall outside the confidence bands in the worm plot), indicating excess kurtosis. Although the robust model does not eliminate these outliers, it down-weights their influence as described above.
Step 6¶
From the robust model estimates (Table 6) we can see that:
The intercept $(\alpha = 0.0047)$ is significant at the 5% level (p-value = 0.021), meaning that over the analyzed period the portfolio produced a weekly excess return of approximately 0.47% not explained by the risk factors, attributable to specific asset selection or the strategy employed.
The profitability factor $(\beta_{RMW} = -0.4935)$ is also significant at the 5% level (p-value = 0.035). The negative sign suggests the portfolio behaves opposite to the profitability dimension defined by the Fama–French model.
Small Minus Big ($SMB = -0.2212$) and the market excess return ($Mkt\text{-}RF = -0.1389$) are not significant at the 5% level (p-values of 0.16 and 0.18, respectively), meaning that over the weekly horizon considered the portfolio's returns are not driven by company size, and its risk comes more from company-specific exposures than from the broad stock market.
High Minus Low ($HML$) and Conservative Minus Aggressive ($CMA$) are likewise not significant at the 5% level (p-values of 0.39 and 0.72, respectively), meaning that value style (high vs. low book-to-market) and investment behavior (aggressive vs. conservative) do not explain the portfolio's movements.
Part 3¶
In this step we continue using weekly data (see Parts 1 and 2) for 5 assets (AAPL, GOOG, META, XOM, LLY) from July 2023 to June 2025. Using these data, we calculated the annualized mean returns ($\mu$) and covariance matrix ($\Sigma$). To maximize the Sharpe ratio (i.e., minimize the negative Sharpe ratio), we used the Sequential Least Squares Programming (SLSQP) algorithm $$-\frac{w^T \mu}{\sqrt{w^T \Sigma w}}, \quad \text{subject to } \sum w_i = 1, \quad w_i \ge 0.$$
We also used Monte Carlo simulation, drawing samples in batches to avoid running out of RAM.
The efficiency (performance) of the simulation was measured using the Euclidean distance $$\|w_{sim} - w_{opt}\|$$
between the simulated and optimal weights for different sample sizes ($N$), which allowed us to track the convergence of the stochastic method.
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
sns.set(style="whitegrid")
np.random.seed(42)
# DATA
assets = ["AAPL", "GOOG", "META", "XOM", "LLY"]
start_date = "2023-07-01"
end_date = "2025-06-30"
print(f"Downloading data for: {assets}")
data = yf.download(assets, start=start_date, end=end_date, interval="1wk", progress=False)['Close']
if isinstance(data.columns, pd.MultiIndex):
data.columns = data.columns.get_level_values(0)
returns = data.pct_change().dropna()
mu = returns.mean().values * 52
cov = returns.cov().values * 52
n_assets = len(assets)
w0 = np.ones(n_assets) / n_assets
def neg_sharpe(weights, mu, cov):
ret = np.dot(weights, mu)
vol = np.sqrt(np.dot(weights.T, np.dot(cov, weights)))
return -(ret / vol)
# Simulation
sim_sizes = [1000, 5000, 10000, 50000, 100000]
def run_simulation_batches(n_total, mu, cov, max_cap=None, batch_size=10000):
"""
Runs simulation in small batches to prevent RAM crashes.
"""
best_sharpe = -np.inf
best_weights = None
sims_done = 0
while sims_done < n_total:
current_batch = min(batch_size, n_total - sims_done)
raw_size = current_batch
if max_cap:
raw_size = current_batch * 10
w_batch = np.random.random((raw_size, n_assets))
w_batch = w_batch / w_batch.sum(axis=1)[:, None]
if max_cap:
mask = np.all(w_batch <= max_cap, axis=1)
w_batch = w_batch[mask]
if len(w_batch) == 0:
continue
p_ret = np.dot(w_batch, mu)
p_vol = np.sqrt(np.einsum('ij,jk,ik->i', w_batch, cov, w_batch))
sharpe = p_ret / p_vol
idx = np.argmax(sharpe)
current_best_sharpe = sharpe[idx]
if current_best_sharpe > best_sharpe:
best_sharpe = current_best_sharpe
best_weights = w_batch[idx]
sims_done += len(w_batch) if max_cap else current_batch
return best_weights
# STEP 7
cons = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
bounds_7 = tuple((0, 1) for _ in range(n_assets))
opt_7 = minimize(neg_sharpe, w0, args=(mu, cov), method='SLSQP', bounds=bounds_7, constraints=cons)
weights_opt_7 = opt_7.x
errors_7 = []
for n in sim_sizes:
w_sim = run_simulation_batches(n, mu, cov, max_cap=None)
error = np.linalg.norm(w_sim - weights_opt_7)
errors_7.append(error)
print(f" Sims: {n} | Error: {error:.4f}")
plt.figure(figsize=(8, 4))
plt.plot(sim_sizes, errors_7, marker='o')
plt.xscale('log')
plt.title("Figure 5: Simulation error vs. number of simulations (5 assets, long-only, weights sum to 1)")
plt.xlabel("Simulations")
plt.ylabel("Error")
plt.show()
Downloading data for: ['AAPL', 'GOOG', 'META', 'XOM', 'LLY']
Sims: 1000 | Error: 0.1258
Sims: 5000 | Error: 0.1105
Sims: 10000 | Error: 0.1261
Sims: 50000 | Error: 0.0697
Sims: 100000 | Error: 0.0663
Figure 5 shows that the simulations converge toward the optimal solution as the number of simulations increases, but the process is not monotonic because of randomness: roughly 50,000 to 100,000 simulations are required to achieve a low error.
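A rough heuristic (our addition, not a result from the cited sources) explains why so many draws are needed. With $d$ assets, the feasible long-only weights form a $(d-1)$-dimensional simplex, and the probability that a single random draw lands within distance $\varepsilon$ of the optimum scales like the volume of a small ball on that simplex:

$$\mathbb{P}\big(\|w - w_{opt}\| \le \varepsilon\big) \propto \varepsilon^{\,d-1} \quad \Longrightarrow \quad \mathbb{E}\Big[\min_{1 \le j \le N} \|w_j - w_{opt}\|\Big] = O\big(N^{-1/(d-1)}\big).$$

For $d = 5$ this gives an $N^{-1/4}$ rate: halving the error requires roughly $2^4 = 16$ times as many simulations. The scaling assumes the optimum lies in the interior of the simplex; near a boundary (as here, where AAPL receives zero weight) the effective rate can differ, which adds to the noisiness seen in Figure 5.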
Step 8¶
# STEP 8
# Optimization
bounds_8 = tuple((0, 0.30) for _ in range(n_assets))
opt_8 = minimize(neg_sharpe, w0, args=(mu, cov), method='SLSQP', bounds=bounds_8, constraints=cons)
weights_opt_8 = opt_8.x
# Simulation Loop
errors_8 = []
for n in sim_sizes:
w_sim = run_simulation_batches(n, mu, cov, max_cap=0.30)
if w_sim is None:
errors_8.append(np.nan)
continue
error = np.linalg.norm(w_sim - weights_opt_8)
errors_8.append(error)
print(f" Sims: {n} | Error: {error:.4f}")
plt.figure(figsize=(8, 4))
plt.plot(sim_sizes, errors_8, marker='s', color='red')
plt.xscale('log')
plt.title("Figure 6: Simulation error vs. number of simulations (30% cap per asset)")
plt.xlabel("Simulations")
plt.ylabel("Error")
plt.show()
Sims: 1000 | Error: 0.0511
Sims: 5000 | Error: 0.0325
Sims: 10000 | Error: 0.0433
Sims: 50000 | Error: 0.0143
Sims: 100000 | Error: 0.0326
Figure 6 shows some instability caused by the restriction added in Step 8. As in Step 7, convergence is non-monotonic, and here it appears more erratic: the error at 100,000 simulations is higher than at 50,000. The absolute errors, however, are lower than in Step 7, since the 30% cap shrinks the feasible region the random sampler must cover.
Step 7 and 8 results¶
print("\nTable 5: Optimal Weights Comparison")
df_comp = pd.DataFrame({
'Asset': assets,
'Step 7 (Opt)': weights_opt_7,
'Step 8 (Opt)': weights_opt_8
})
print(df_comp.round(4))
Table 5: Optimal Weights Comparison

| Asset | Step 7 (Opt) | Step 8 (Opt) |
|---|---|---|
| AAPL | 0.0000 | 0.000 |
| GOOG | 0.1321 | 0.224 |
| META | 0.2101 | 0.231 |
| XOM | 0.4838 | 0.300 |
| LLY | 0.1740 | 0.245 |
We can see in Table 5 that for Step 7 the optimizer strongly prefers XOM, followed by META and LLY, with almost no allocation to AAPL.
When we approximated this allocation by simulation, the weights got closer to the optimum as the number of simulations increased, but not in a perfectly smooth way (Figure 5).
This fluctuation happens because simulation is random. More simulations increase the chance of finding a portfolio near the true optimum, but there is no guarantee that every increase in sample size will produce a better result.
Overall, the takeaway is:
Simulation can approximate the optimal weights reasonably well.
Around 50,000–100,000 simulations seems sufficient to get fairly close to the optimizer.
Optimization is still much more precise and efficient, since it directly finds the best solution instead of searching randomly.
For Step 8, compared to Step 7 (Table 5), the new constraint clearly changed the structure of the portfolio. In Step 7, XOM held nearly 48%, but it is now capped at 30%, so the excess weight is redistributed across GOOG, META, and LLY, making the allocation more balanced.
Even with 5,000 simulations the result was already fairly close (distance ≈ 0.033), and the best run, at 50,000 simulations, reached a distance of ≈ 0.014 (Figure 6).
Increasing the number of simulations to 10,000, 50,000, and 100,000 kept the solutions in the same general neighborhood, though again not strictly improving each time.
The key point is that, despite the extra constraint, simulation still found portfolios quite close to the optimized solution; the distances remained small overall, generally between about 0.01 and 0.05.
From both steps, a few clear insights emerge:
Optimization consistently gives a precise solution quickly.
It directly computes the best weights that satisfy all constraints.
Simulation can get close, but it relies on luck and volume. You need tens of thousands of simulations to reliably approximate the optimal portfolio.
More simulations help, but not in a perfectly smooth way. Because the method is random, sometimes 50k simulations may look better than 100k.
Adding constraints changes the shape of the portfolio.
In Step 8, the 30% cap forced diversification, especially reducing the dominance of XOM.
Rough practical takeaway:
~50k simulations: reasonably close to optimal
~100k+: very good approximation in most cases
Optimization remains the more efficient and stable approach
So, simulations can be used as an alternative to optimization, but they require a large number of trials to get close to the same result, especially when the portfolio space becomes more restricted.
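One sampling detail worth noting (a refinement we did not use above, sketched here under that assumption): normalizing independent uniform draws, as in `run_simulation_batches`, does not sample uniformly over the weight simplex — it concentrates mass near equal weights, making concentrated portfolios such as the Step 7 optimum (XOM near 48%) rare. Drawing from a symmetric Dirichlet(1, …, 1) distribution is uniform over the simplex:

```python
# Compare simplex coverage: Dirichlet(1,...,1) vs. normalized uniforms.
# The Dirichlet sampler reaches concentrated corners of the simplex far
# more often, which matters when the optimum holds a large single weight.
import numpy as np

rng = np.random.default_rng(42)
n_assets, n_sims = 5, 100_000

w_dir = rng.dirichlet(np.ones(n_assets), size=n_sims)
w_norm = rng.random((n_sims, n_assets))
w_norm /= w_norm.sum(axis=1, keepdims=True)

# Fraction of draws where some single asset exceeds 45% of the portfolio
frac_dir = (w_dir.max(axis=1) > 0.45).mean()
frac_norm = (w_norm.max(axis=1) > 0.45).mean()
print(f"Dirichlet:          {frac_dir:.4f}")
print(f"Normalized uniform: {frac_norm:.4f}")
```

Either sampler satisfies the sum-to-one, long-only constraints; the choice mainly affects how quickly concentrated allocations are explored.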
Conclusions¶
This analysis demonstrates that portfolio optimization results are highly sensitive to the imposed constraints. Long-only constraints already reduce the feasible investment set, and additional upper-bound constraints further limit efficiency while improving diversification. The comparison of efficient frontiers and out-of-sample performance underscores the need to balance theoretical optimality with practical investment considerations and client-specific requirements.
The application of the Fama–French five-factor model showed that robust regression is preferable to OLS given the non-normality of financial returns, isolating the profitability factor and the alpha as the significant drivers.
In Part 3, although the Monte Carlo simulations converged to the optimal solution, between 50,000 and 100,000 iterations were required, which is computationally expensive and stochastically unstable under rigid restrictions.
References¶
Fama, Eugene F., and Kenneth R. French. "A five-factor asset pricing model." Journal of financial economics 116.1 (2015): 1-22.
Fama, Eugene F., and Kenneth R. French. "Common risk factors in the returns on stocks and bonds." Journal of financial economics 33.1 (1993): 3-56.
Huber, Peter J. Robust Statistics. John Wiley & Sons, 1981.
Al Janabi, Mazin AM. "Insights into Liquidity Dynamics: Optimizing Asset Allocation and Portfolio Risk Management with Machine Learning Algorithms." Liquidity Dynamics and Risk Modeling: Navigating Trading and Investment Portfolios Frontiers with Machine Learning Algorithms. Cham: Springer Nature Switzerland, 2024. 257-303.
Etienne, Alban, et al. "Revisiting the Structure of Trend Premia: When Diversification Hides Redundancy." arXiv preprint arXiv:2510.23150 (2025).
Lehalle, Charles-Albert, and Guillaume Simon. "Portfolio selection with active strategies: how long only constraints shape convictions." Journal of Asset Management 22.6 (2021): 443-463.
Uykun, Firdevs Nur. Machine Learning Applications in Portfolio Optimization. MS thesis. Middle East Technical University (Turkey), 2024.