Question 5.3
Pick one of the funds.
Download its data.
Pick 1 of the easier metrics (e.g. Correlation, or Dispersion ratio are easier than LASSO).
In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
ewz = yf.download("EWZ", start="2020-01-01", end="2023-10-27")
def calculate_rsi(data, window):
delta = data.diff()
up = delta.clip(lower=0)
down = -1*delta.clip(upper=0)
avg_up = up.rolling(window=window).mean()
avg_down = down.rolling(window=window).mean()
rsi = 100 - (100 / (1 + avg_up / avg_down))
return rsi
ewz['SMA_5'] = ewz['Close'].rolling(window=5).mean()
ewz['SMA_20'] = ewz['Close'].rolling(window=20).mean()
ewz['RSI_14'] = calculate_rsi(ewz['Close'], 14)
ewz['Volatility'] = ewz['Close'].rolling(window=10).std()
ewz.dropna(inplace=True)
ewz['Target'] = (ewz['Close'].shift(-1) > ewz['Close']).astype(int)
ewz.dropna(inplace=True)
X = ewz.drop('Target', axis=1)
y = ewz['Target']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
correlation = ewz.corr()
correlation.head()
[*********************100%***********************] 1 of 1 completed
Out[1]:
| Price | Adj Close | Close | High | Low | Open | Volume | SMA_5 | SMA_20 | RSI_14 | Volatility | Target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ticker | EWZ | EWZ | EWZ | EWZ | EWZ | EWZ | ||||||
| Price | Ticker | |||||||||||
| Adj Close | EWZ | 1.000000 | 0.764202 | 0.747732 | 0.770245 | 0.752870 | -0.132706 | 0.739613 | 0.623218 | 0.224921 | -0.277076 | -0.071891 |
| Close | EWZ | 0.764202 | 1.000000 | 0.996332 | 0.996773 | 0.992819 | -0.005895 | 0.980084 | 0.879476 | 0.207611 | -0.144382 | -0.066260 |
| High | EWZ | 0.747732 | 0.996332 | 1.000000 | 0.995304 | 0.997008 | 0.029978 | 0.985462 | 0.893800 | 0.184472 | -0.116321 | -0.057554 |
| Low | EWZ | 0.770245 | 0.996773 | 0.995304 | 1.000000 | 0.995921 | -0.033448 | 0.979841 | 0.872775 | 0.207374 | -0.171283 | -0.059249 |
| Open | EWZ | 0.752870 | 0.992819 | 0.997008 | 0.995921 | 1.000000 | 0.004716 | 0.984368 | 0.887459 | 0.182375 | -0.138835 | -0.056075 |
Implement the cross-fold validation or k-cross-fold validation.
In [2]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
# Now your cross-validation code
X_train_val, X_test, y_train_val, y_test = train_test_split(
X_scaled, y, test_size=0.2, random_state=42
)
k = 10 # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)
best_accuracy = 0
best_model = None
fold_data = []
for fold, (train_index, val_index) in enumerate(kf.split(X_train_val)):
print(f"Training fold {fold + 1}/{k}")
X_train, X_val = X_train_val[train_index], X_train_val[val_index]
y_train, y_val = y_train_val[train_index], y_train_val[val_index]
mlp = MLPClassifier(
hidden_layer_sizes=(100,),
activation='relu',
solver='adam',
alpha=0.0001,
batch_size=32,
learning_rate_init=0.001,
max_iter=500,
random_state=42
)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Fold {fold + 1} Validation Accuracy: {accuracy:.4f}")
fold_data.append({
'Fold': fold + 1,
'Accuracy': accuracy,
'Model': mlp
})
if accuracy > best_accuracy:
best_accuracy = accuracy
best_model = mlp
print(f"\nBest Model Validation Accuracy: {best_accuracy:.4f}")
Training fold 1/10 Fold 1 Validation Accuracy: 0.5395 Training fold 2/10 Fold 2 Validation Accuracy: 0.4868 Training fold 3/10 Fold 3 Validation Accuracy: 0.3553 Training fold 4/10 Fold 4 Validation Accuracy: 0.4211 Training fold 5/10 Fold 5 Validation Accuracy: 0.5733 Training fold 6/10 Fold 6 Validation Accuracy: 0.4000 Training fold 7/10 Fold 7 Validation Accuracy: 0.4933 Training fold 8/10 Fold 8 Validation Accuracy: 0.4933 Training fold 9/10 Fold 9 Validation Accuracy: 0.5867 Training fold 10/10 Fold 10 Validation Accuracy: 0.5200 Best Model Validation Accuracy: 0.5867
In [3]:
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy}")
Test Accuracy: 0.4708994708994709
Reproduce the table
In [4]:
results_table = pd.DataFrame(fold_data)
print("\nCross-Validation Results:")
print(results_table)
Cross-Validation Results: Fold Accuracy Model 0 1 0.539474 MLPClassifier(batch_size=32, max_iter=500, ran... 1 2 0.486842 MLPClassifier(batch_size=32, max_iter=500, ran... 2 3 0.355263 MLPClassifier(batch_size=32, max_iter=500, ran... 3 4 0.421053 MLPClassifier(batch_size=32, max_iter=500, ran... 4 5 0.573333 MLPClassifier(batch_size=32, max_iter=500, ran... 5 6 0.400000 MLPClassifier(batch_size=32, max_iter=500, ran... 6 7 0.493333 MLPClassifier(batch_size=32, max_iter=500, ran... 7 8 0.493333 MLPClassifier(batch_size=32, max_iter=500, ran... 8 9 0.586667 MLPClassifier(batch_size=32, max_iter=500, ran... 9 10 0.520000 MLPClassifier(batch_size=32, max_iter=500, ran...
Reproduce the graphs.
In [5]:
plt.figure(figsize=(8, 6))
plt.plot(results_table['Fold'], results_table['Accuracy'], marker='o', linestyle='-')
plt.title('Validation Accuracy Across Folds (MLP)')
plt.xlabel('Fold')
plt.ylabel('Validation Accuracy')
plt.xticks(results_table['Fold'])
plt.grid(True)
plt.show()
In [6]:
plt.figure(figsize=(8,6))
sns.boxplot(y=results_table['Accuracy'])
plt.title('Distribution of Validation Accuracy Across Folds (MLP)')
plt.ylabel('Validation Accuracy')
plt.show()
Part 2
In [7]:
! pip install pytrends
# Code to import and structure data
import pandas as pd
import numpy as np
from pytrends.request import TrendReq
import datetime as dt
def initialize_pytrends():
"""Initialize PyTrends with default parameters"""
return TrendReq(hl='en-US', tz=360)
def fetch_search_data(keywords, timeframe='today 5-y'):
"""
Fetch search volume data for given keywords
Parameters:
keywords (list): List of search terms
timeframe (str): Time period for data collection
Returns:
DataFrame: Search volume data
"""
pytrends = initialize_pytrends()
try:
# Build payload
pytrends.build_payload(keywords,
cat=0,
timeframe=timeframe,
geo='',
gprop='')
# Get interest over time
df = pytrends.interest_over_time()
# Basic cleaning
df = df.drop(columns=['isPartial'])
df.index = pd.to_datetime(df.index)
return df
except Exception as e:
print(f"Error fetching data: {str(e)}")
return None
# Example usage
keywords = ['Nvidia', 'Tesla','Apple']
df = fetch_search_data(keywords)
Requirement already satisfied: pytrends in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (4.9.2) Requirement already satisfied: requests>=2.0 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pytrends) (2.31.0) Requirement already satisfied: pandas>=0.25 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pytrends) (2.1.1) Requirement already satisfied: lxml in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pytrends) (5.3.0) Requirement already satisfied: numpy>=1.23.2 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (1.26.0) Requirement already satisfied: python-dateutil>=2.8.2 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (2023.3) Requirement already satisfied: charset-normalizer<4,>=2 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (2.2.1) Requirement already satisfied: certifi>=2017.4.17 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (2024.12.14) Requirement already satisfied: six>=1.5 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=0.25->pytrends) (1.16.0)
In [8]:
import sqlite3
from datetime import datetime
class SVIDatabase:
def __init__(self, db_name='svi_data.db'):
"""Initialize database connection"""
self.conn = sqlite3.connect(db_name)
self.create_tables()
def create_tables(self):
"""Create necessary database tables"""
cursor = self.conn.cursor()
# Search terms table - stores unique search terms with their categories
cursor.execute('''
CREATE TABLE IF NOT EXISTS search_terms (
term_id INTEGER PRIMARY KEY,
term TEXT UNIQUE,
category TEXT,
first_collected DATETIME
)
''')
# Search volumes table - stores the actual search volume data
cursor.execute('''
CREATE TABLE IF NOT EXISTS search_volumes (
volume_id INTEGER PRIMARY KEY,
term_id INTEGER,
date DATETIME,
volume REAL,
FOREIGN KEY (term_id) REFERENCES search_terms(term_id)
)
''')
self.conn.commit()
def store_data(self, df, category='general'):
"""Store search volume data in database"""
cursor = self.conn.cursor()
for column in df.columns:
# Insert search term if it doesn't exist
cursor.execute('''
INSERT OR IGNORE INTO search_terms (term, category, first_collected)
VALUES (?, ?, ?)
''', (column, category, datetime.now()))
# Get term_id for the current search term
cursor.execute('SELECT term_id FROM search_terms WHERE term = ?', (column,))
term_id = cursor.fetchone()[0]
# Insert volume data for each date
for date, value in df[column].items():
cursor.execute('''
INSERT INTO search_volumes (term_id, date, volume)
VALUES (?, ?, ?)
''', (term_id, date, value))
self.conn.commit()
In [9]:
import sqlite3
from datetime import datetime
class SVIDatabase:
def __init__(self, db_name='svi_data.db'):
"""Initialize database connection"""
self.conn = sqlite3.connect(db_name)
self.create_tables()
def create_tables(self):
"""Create necessary database tables"""
cursor = self.conn.cursor()
# Search terms table
cursor.execute('''
CREATE TABLE IF NOT EXISTS search_terms (
term_id INTEGER PRIMARY KEY,
term TEXT UNIQUE,
category TEXT,
first_collected DATETIME
)
''')
# Search volumes table
cursor.execute('''
CREATE TABLE IF NOT EXISTS search_volumes (
volume_id INTEGER PRIMARY KEY,
term_id INTEGER,
date DATETIME,
volume REAL,
FOREIGN KEY (term_id) REFERENCES search_terms(term_id)
)
''')
self.conn.commit()
def store_data(self, df, category='general'):
"""Store search volume data in database"""
cursor = self.conn.cursor()
for column in df.columns:
# Insert search term
cursor.execute('''
INSERT OR IGNORE INTO search_terms (term, category, first_collected)
VALUES (?, ?, ?)
''', (column, category, datetime.now()))
# Get term_id
cursor.execute('SELECT term_id FROM search_terms WHERE term = ?', (column,))
term_id = cursor.fetchone()[0]
# Insert volumes
for date, value in df[column].items():
cursor.execute('''
INSERT INTO search_volumes (term_id, date, volume)
VALUES (?, ?, ?)
''', (term_id, date, value))
self.conn.commit()
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
def analyze_svi_data(df):
"""
Perform basic exploratory analysis of SVI data
Parameters:
df (DataFrame): Search volume data
"""
# Basic statistics
print("Basic Statistics:")
print(df.describe())
# Correlation analysis
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Search Terms')
plt.show()
# Time series decomposition
for column in df.columns:
decomposition = seasonal_decompose(df[column], period=52)
plt.figure(figsize=(15, 10))
plt.subplot(411)
plt.plot(df[column])
plt.title(f'Time Series Components for {column}')
plt.subplot(412)
plt.plot(decomposition.trend)
plt.title('Trend')
plt.subplot(413)
plt.plot(decomposition.seasonal)
plt.title('Seasonal')
plt.subplot(414)
plt.plot(decomposition.resid)
plt.title('Residual')
plt.tight_layout()
plt.show()
# Example usage
analyze_svi_data(df)
Basic Statistics:
Nvidia Tesla Apple
count 262.000000 262.000000 262.000000
mean 5.709924 13.767176 62.393130
std 2.652227 2.298423 8.670315
min 3.000000 7.000000 49.000000
25% 4.000000 12.000000 56.000000
50% 5.000000 14.000000 61.000000
75% 6.000000 15.000000 66.000000
max 28.000000 22.000000 100.000000
Useful link