Question 5.3

Pick one of the funds.

Download its data.

Pick 1 of the easier metrics (e.g. Correlation, or Dispersion ratio are easier than LASSO).

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

ewz = yf.download("EWZ", start="2020-01-01", end="2023-10-27")

def calculate_rsi(data, window):
    delta = data.diff()
    up = delta.clip(lower=0)
    down = -1*delta.clip(upper=0)
    avg_up = up.rolling(window=window).mean()
    avg_down = down.rolling(window=window).mean()
    rsi = 100 - (100 / (1 + avg_up / avg_down))
    return rsi

ewz['SMA_5'] = ewz['Close'].rolling(window=5).mean()
ewz['SMA_20'] = ewz['Close'].rolling(window=20).mean()
ewz['RSI_14'] = calculate_rsi(ewz['Close'], 14)
ewz['Volatility'] = ewz['Close'].rolling(window=10).std()

ewz.dropna(inplace=True)

ewz['Target'] = (ewz['Close'].shift(-1) > ewz['Close']).astype(int)
ewz.dropna(inplace=True)

X = ewz.drop('Target', axis=1)
y = ewz['Target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

correlation = ewz.corr()

correlation.head()
[*********************100%***********************]  1 of 1 completed
Out[1]:
Price Adj Close Close High Low Open Volume SMA_5 SMA_20 RSI_14 Volatility Target
Ticker EWZ EWZ EWZ EWZ EWZ EWZ
Price Ticker
Adj Close EWZ 1.000000 0.764202 0.747732 0.770245 0.752870 -0.132706 0.739613 0.623218 0.224921 -0.277076 -0.071891
Close EWZ 0.764202 1.000000 0.996332 0.996773 0.992819 -0.005895 0.980084 0.879476 0.207611 -0.144382 -0.066260
High EWZ 0.747732 0.996332 1.000000 0.995304 0.997008 0.029978 0.985462 0.893800 0.184472 -0.116321 -0.057554
Low EWZ 0.770245 0.996773 0.995304 1.000000 0.995921 -0.033448 0.979841 0.872775 0.207374 -0.171283 -0.059249
Open EWZ 0.752870 0.992819 0.997008 0.995921 1.000000 0.004716 0.984368 0.887459 0.182375 -0.138835 -0.056075

Implement the cross-fold validation or k-cross-fold validation.

In [2]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Now your cross-validation code
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

k = 10  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)
best_accuracy = 0
best_model = None
fold_data = []

for fold, (train_index, val_index) in enumerate(kf.split(X_train_val)):
    print(f"Training fold {fold + 1}/{k}")
    
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val[train_index], y_train_val[val_index]
    
    mlp = MLPClassifier(
        hidden_layer_sizes=(100,),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        batch_size=32,
        learning_rate_init=0.001,
        max_iter=500,
        random_state=42
    )
    
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Fold {fold + 1} Validation Accuracy: {accuracy:.4f}")
    
    fold_data.append({
        'Fold': fold + 1,
        'Accuracy': accuracy,
        'Model': mlp
    })
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = mlp

print(f"\nBest Model Validation Accuracy: {best_accuracy:.4f}")
Training fold 1/10
Fold 1 Validation Accuracy: 0.5395
Training fold 2/10
Fold 2 Validation Accuracy: 0.4868
Training fold 3/10
Fold 3 Validation Accuracy: 0.3553
Training fold 4/10
Fold 4 Validation Accuracy: 0.4211
Training fold 5/10
Fold 5 Validation Accuracy: 0.5733
Training fold 6/10
Fold 6 Validation Accuracy: 0.4000
Training fold 7/10
Fold 7 Validation Accuracy: 0.4933
Training fold 8/10
Fold 8 Validation Accuracy: 0.4933
Training fold 9/10
Fold 9 Validation Accuracy: 0.5867
Training fold 10/10
Fold 10 Validation Accuracy: 0.5200

Best Model Validation Accuracy: 0.5867
In [3]:
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy}")
Test Accuracy: 0.4708994708994709

Reproduce the table

In [4]:
results_table = pd.DataFrame(fold_data)
print("\nCross-Validation Results:")
print(results_table)
Cross-Validation Results:
   Fold  Accuracy                                              Model
0     1  0.539474  MLPClassifier(batch_size=32, max_iter=500, ran...
1     2  0.486842  MLPClassifier(batch_size=32, max_iter=500, ran...
2     3  0.355263  MLPClassifier(batch_size=32, max_iter=500, ran...
3     4  0.421053  MLPClassifier(batch_size=32, max_iter=500, ran...
4     5  0.573333  MLPClassifier(batch_size=32, max_iter=500, ran...
5     6  0.400000  MLPClassifier(batch_size=32, max_iter=500, ran...
6     7  0.493333  MLPClassifier(batch_size=32, max_iter=500, ran...
7     8  0.493333  MLPClassifier(batch_size=32, max_iter=500, ran...
8     9  0.586667  MLPClassifier(batch_size=32, max_iter=500, ran...
9    10  0.520000  MLPClassifier(batch_size=32, max_iter=500, ran...

Reproduce the graphs.

In [5]:
plt.figure(figsize=(8, 6))
plt.plot(results_table['Fold'], results_table['Accuracy'], marker='o', linestyle='-')
plt.title('Validation Accuracy Across Folds (MLP)')
plt.xlabel('Fold')
plt.ylabel('Validation Accuracy')
plt.xticks(results_table['Fold'])
plt.grid(True)
plt.show()
No description has been provided for this image
In [6]:
plt.figure(figsize=(8,6))
sns.boxplot(y=results_table['Accuracy'])
plt.title('Distribution of Validation Accuracy Across Folds (MLP)')
plt.ylabel('Validation Accuracy')
plt.show()
No description has been provided for this image

Part 2

In [7]:
! pip install pytrends
# Code to import and structure data
import pandas as pd
import numpy as np
from pytrends.request import TrendReq
import datetime as dt

def initialize_pytrends():
    """Initialize PyTrends with default parameters"""
    return TrendReq(hl='en-US', tz=360)

def fetch_search_data(keywords, timeframe='today 5-y'):
    """
    Fetch search volume data for given keywords

    Parameters:
    keywords (list): List of search terms
    timeframe (str): Time period for data collection

    Returns:
    DataFrame: Search volume data
    """
    pytrends = initialize_pytrends()

    try:
        # Build payload
        pytrends.build_payload(keywords,
                             cat=0,
                             timeframe=timeframe,
                             geo='',
                             gprop='')

        # Get interest over time
        df = pytrends.interest_over_time()

        # Basic cleaning
        df = df.drop(columns=['isPartial'])
        df.index = pd.to_datetime(df.index)

        return df

    except Exception as e:
        print(f"Error fetching data: {str(e)}")
        return None

# Example usage
keywords = ['Nvidia', 'Tesla','Apple']
df = fetch_search_data(keywords)
Requirement already satisfied: pytrends in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (4.9.2)
Requirement already satisfied: requests>=2.0 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pytrends) (2.31.0)
Requirement already satisfied: pandas>=0.25 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pytrends) (2.1.1)
Requirement already satisfied: lxml in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pytrends) (5.3.0)
Requirement already satisfied: numpy>=1.23.2 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (1.26.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from pandas>=0.25->pytrends) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (2.2.1)
Requirement already satisfied: certifi>=2017.4.17 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from requests>=2.0->pytrends) (2024.12.14)
Requirement already satisfied: six>=1.5 in /Users/bekay/anaconda3/envs/MsFE/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=0.25->pytrends) (1.16.0)
In [8]:
import sqlite3
from datetime import datetime

class SVIDatabase:
    def __init__(self, db_name='svi_data.db'):
        """Initialize database connection"""
        self.conn = sqlite3.connect(db_name)
        self.create_tables()
    
    def create_tables(self):
        """Create necessary database tables"""
        cursor = self.conn.cursor()
        
        # Search terms table - stores unique search terms with their categories
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS search_terms (
                term_id INTEGER PRIMARY KEY,
                term TEXT UNIQUE,
                category TEXT,
                first_collected DATETIME
            )
        ''')
        
        # Search volumes table - stores the actual search volume data
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS search_volumes (
                volume_id INTEGER PRIMARY KEY,
                term_id INTEGER,
                date DATETIME,
                volume REAL,
                FOREIGN KEY (term_id) REFERENCES search_terms(term_id)
            )
        ''')
        self.conn.commit()
    
    def store_data(self, df, category='general'):
        """Store search volume data in database"""
        cursor = self.conn.cursor()
        
        for column in df.columns:
            # Insert search term if it doesn't exist
            cursor.execute('''
                INSERT OR IGNORE INTO search_terms (term, category, first_collected)
                VALUES (?, ?, ?)
            ''', (column, category, datetime.now()))
            
            # Get term_id for the current search term
            cursor.execute('SELECT term_id FROM search_terms WHERE term = ?', (column,))
            term_id = cursor.fetchone()[0]
            
            # Insert volume data for each date
            for date, value in df[column].items():
                cursor.execute('''
                    INSERT INTO search_volumes (term_id, date, volume)
                    VALUES (?, ?, ?)
                ''', (term_id, date, value))
        
        self.conn.commit()
In [9]:
import sqlite3
from datetime import datetime

class SVIDatabase:
    def __init__(self, db_name='svi_data.db'):
        """Initialize database connection"""
        self.conn = sqlite3.connect(db_name)
        self.create_tables()

    def create_tables(self):
        """Create necessary database tables"""
        cursor = self.conn.cursor()

        # Search terms table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS search_terms (
                term_id INTEGER PRIMARY KEY,
                term TEXT UNIQUE,
                category TEXT,
                first_collected DATETIME
            )
        ''')

        # Search volumes table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS search_volumes (
                volume_id INTEGER PRIMARY KEY,
                term_id INTEGER,
                date DATETIME,
                volume REAL,
                FOREIGN KEY (term_id) REFERENCES search_terms(term_id)
            )
        ''')

        self.conn.commit()

    def store_data(self, df, category='general'):
        """Store search volume data in database"""
        cursor = self.conn.cursor()

        for column in df.columns:
            # Insert search term
            cursor.execute('''
                INSERT OR IGNORE INTO search_terms (term, category, first_collected)
                VALUES (?, ?, ?)
            ''', (column, category, datetime.now()))

            # Get term_id
            cursor.execute('SELECT term_id FROM search_terms WHERE term = ?', (column,))
            term_id = cursor.fetchone()[0]

            # Insert volumes
            for date, value in df[column].items():
                cursor.execute('''
                    INSERT INTO search_volumes (term_id, date, volume)
                    VALUES (?, ?, ?)
                ''', (term_id, date, value))

        self.conn.commit()
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

def analyze_svi_data(df):
    """
    Perform basic exploratory analysis of SVI data

    Parameters:
    df (DataFrame): Search volume data
    """
    # Basic statistics
    print("Basic Statistics:")
    print(df.describe())

    # Correlation analysis
    plt.figure(figsize=(10, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix of Search Terms')
    plt.show()

    # Time series decomposition
    for column in df.columns:
        decomposition = seasonal_decompose(df[column], period=52)

        plt.figure(figsize=(15, 10))
        plt.subplot(411)
        plt.plot(df[column])
        plt.title(f'Time Series Components for {column}')
        plt.subplot(412)
        plt.plot(decomposition.trend)
        plt.title('Trend')
        plt.subplot(413)
        plt.plot(decomposition.seasonal)
        plt.title('Seasonal')
        plt.subplot(414)
        plt.plot(decomposition.resid)
        plt.title('Residual')
        plt.tight_layout()
        plt.show()

# Example usage
analyze_svi_data(df)
Basic Statistics:
           Nvidia       Tesla       Apple
count  262.000000  262.000000  262.000000
mean     5.709924   13.767176   62.393130
std      2.652227    2.298423    8.670315
min      3.000000    7.000000   49.000000
25%      4.000000   12.000000   56.000000
50%      5.000000   14.000000   61.000000
75%      6.000000   15.000000   66.000000
max     28.000000   22.000000  100.000000
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Useful link

https://finance.yahoo.com/