-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_data_ti.py
163 lines (136 loc) · 5.78 KB
/
load_data_ti.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import yfinance as yf
import pandas as pd
import numpy as np
import talib
import sys
# Date bounds as ISO strings.
# NOTE(review): neither `start` nor `end` is read by the visible module-level
# code — presumably left over from a download step; confirm before removing.
start = "2010-01-04"
end = "2020-05-08"
# Turn off pandas' chained-assignment warning for the whole process.
pd.options.mode.chained_assignment = None # default='warn'
# pd.set_option('display.max_rows', None)
# tickers = ['NVDA', 'QCOM']
# Symbols to process; the loading code below reads data/<ticker>.csv for each.
tickers = ['NVDA']
def add_turbulence(data):
    """
    Return the non-zero turbulence scores for a price frame.

    Runs ``calculate_turbulence`` on a copy of ``data``, re-indexes the
    result by its ``date`` column and discards every row whose
    ``turbulence`` value is exactly 0.

    :param data: (df) pandas dataframe handed to ``calculate_turbulence``
    :return: (df) pandas dataframe indexed by 'date', zero-score rows removed
    """
    scores = calculate_turbulence(data.copy())
    # Move the dates out of the column and into the index.
    scores = scores.set_index('date')
    # Keep only rows that carry an actual (non-zero) score.
    has_score = scores['turbulence'] != 0
    return scores[has_score]
def calculate_turbulence(data, window=252):
    """Compute a turbulence score for every row from the ``Close`` column.

    Prices are converted to simple returns with ``pct_change``. For each row
    after the first ``window`` rows, the score is the squared Mahalanobis
    distance of that row's return from the mean of the trailing window of
    returns, using the pseudo-inverse of the window's covariance.

    The first ``window`` rows, the first two positive scores, and any
    non-positive score are all reported as 0.

    :param data: (df) pandas dataframe with a ``Close`` column; its index
        labels are used as the dates
    :param window: (int) number of leading rows skipped and length (in rows)
        of the trailing look-back; defaults to 252
    :return: (df) pandas dataframe with columns ``date`` and ``turbulence``,
        one row per input row
    :raises Exception: if the result frame cannot be assembled, e.g. when
        ``data`` has fewer than ``window`` rows; the underlying ``ValueError``
        is attached as ``__cause__``
    """
    # Use returns, not price levels, to calculate turbulence.
    returns = data[['Close']].pct_change()
    dates = returns.index.tolist()

    # No look-back is available for the first `window` rows: score them 0.
    scores = [0] * window
    positive_seen = 0
    for i in range(window, len(dates)):
        today = returns[returns.index == dates[i]]
        # Trailing window: rows strictly before the current label and no
        # earlier than the label `window` rows back.
        history = returns[
            (returns.index < dates[i]) & (returns.index >= dates[i - window])
        ]
        # Skip as many leading rows as the smallest per-column NaN count,
        # then drop any column that still contains a missing value.
        usable = history.iloc[history.isna().sum().min():].dropna(axis=1)
        covariance = usable.cov()
        deviation = today[list(usable.columns)] - usable.mean(axis=0)
        distance = deviation.values.dot(np.linalg.pinv(covariance)).dot(
            deviation.values.T
        )

        score = 0
        if distance > 0:
            positive_seen += 1
            # Avoid large outliers while the calculation has only just begun:
            # the first two positive values are suppressed.
            if positive_seen > 2:
                score = distance[0][0]
        scores.append(score)

    try:
        result = pd.DataFrame({"date": returns.index, "turbulence": scores})
    except ValueError as err:
        # Raised by pandas when the score list and the index differ in length.
        raise Exception("Turbulence information could not be added.") from err
    return result
def back_252(date):
    """Return the 252nd-from-last business day of a range ending at ``date``.

    A range of 400 business days (pandas ``'B'`` frequency) ending at
    ``date`` is generated, and its 252nd entry counted from the end is
    returned, i.e. the entry 251 positions before the last one.

    :param date: anything ``pd.Timestamp`` accepts
    :return: (Timestamp) the selected business day
    """
    anchor = pd.Timestamp(date)
    # 400 periods is more than the 252 entries needed to index from the end.
    business_days = pd.date_range(end=anchor, periods=400, freq='B')
    return business_days[len(business_days) - 252]
# Read each configured ticker's price history from data/<ticker>.csv, using
# the parsed 'Date' column as the index.
stock_data = {}
for ticker in tickers:
    stock_data[ticker] = pd.read_csv(
        f'data/{ticker}.csv', index_col='Date', parse_dates=True
    )

# (first, last) date labels bounding the training, validation and test sets.
training_data_time_range = ('2008-11-13', '2015-12-31')
validation_data_time_range = ('2016-01-01', '2016-12-31')
test_data_time_range = ('2017-01-01', '2020-05-08')

# Cut every ticker's frame into the three label-based date windows.
training_data, validation_data, test_data = {}, {}, {}
for ticker, df in stock_data.items():
    training_data[ticker] = df.loc[slice(*training_data_time_range)]
    validation_data[ticker] = df.loc[slice(*validation_data_time_range)]
    test_data[ticker] = df.loc[slice(*test_data_time_range)]

# Report the size of each split for one ticker.
ticker = 'NVDA'
print(f'Training data shape for {ticker}: {training_data[ticker].shape}')
print(f'Validation data shape for {ticker}: {validation_data[ticker].shape}')
print(f'Test data shape for {ticker}: {test_data[ticker].shape}')

# Show the first 5 rows of the unsplit data.
print(stock_data['NVDA'].head())
print("###################")
def add_technical_indicators(df, ticker):
    """Return a copy of ``df`` extended with indicator and turbulence columns.

    MACD, MACD signal, RSI, CCI and ADX are computed with TA-Lib using its
    default parameters. The turbulence column (``TINDEX``) is computed from
    ``data/<ticker>.csv``, restricted to the span from ``back_252`` of the
    first date in ``df`` up to the last date in ``df``, and is aligned to
    ``df`` by index.

    Rows with a missing value in any column are dropped. That covers rows
    where an indicator is NaN and rows for which ``add_turbulence`` returned
    no (non-zero) score.

    The input frame is not modified.

    :param df: (df) price frame with Open, High, Low, Close and Volume
        columns and a date index
    :param ticker: (str) symbol used to locate ``data/<ticker>.csv``
    :return: (df) new frame with columns Open, High, Low, Close, Volume,
        MACD, MACD_Signal, RSI, CCI, ADX, TINDEX
    """
    # Work on a private copy so the caller's frame (typically a slice of a
    # larger one) is never written to.
    enriched = df.copy()
    close = enriched['Close']
    high = enriched['High']
    low = enriched['Low']

    # talib.MACD returns three outputs; the third (histogram) is not kept.
    enriched['MACD'], enriched['MACD_Signal'], _ = talib.MACD(close)
    enriched['RSI'] = talib.RSI(close)
    enriched['CCI'] = talib.CCI(high, low, close)
    enriched['ADX'] = talib.ADX(high, low, close)

    # Reload the full history so the turbulence calculation has look-back
    # data from before the first row of `df`.
    history = pd.read_csv(f'data/{ticker}.csv', index_col='Date', parse_dates=True)
    history = history.loc[back_252(enriched.index.min()):enriched.index.max()]
    # Select the score column explicitly instead of assigning a whole frame;
    # assignment aligns on the date index, leaving NaN where no score exists.
    enriched['TINDEX'] = add_turbulence(history)['turbulence']

    # drop NaN values
    enriched = enriched.dropna()
    # keep Open, High, Low, Close, Volume, MACD, Signal, RSI, CCI, ADX, TINDEX
    return enriched[['Open', 'High', 'Low', 'Close', 'Volume', 'MACD', 'MACD_Signal', 'RSI', 'CCI', 'ADX', 'TINDEX']]
# Add technical indicators to the training data for each stock.
# Only existing keys are rebound while iterating; no key is added or removed.
for ticker, df in training_data.items():
    training_data[ticker] = add_technical_indicators(df, ticker)
# NOTE: the validation/test enrichment below is currently disabled, so only
# the training set is passed through add_technical_indicators here.
# add technical indicators to the validation data for each stock
# for ticker, df in validation_data.items():
# validation_data[ticker] = add_technical_indicators(df, ticker)
# # add technical indicators to the test data for each stock
# for ticker, df in test_data.items():
# test_data[ticker] = add_technical_indicators(df, ticker)
# Print the shape of each split for NVDA.
print('Shape of training data for NVDA:', training_data['NVDA'].shape)
print('Shape of validation data for NVDA:', validation_data['NVDA'].shape)
print('Shape of test data for NVDA:', test_data['NVDA'].shape)
print("###################")
# Print the first 5 rows of the enriched training data.
print(training_data['NVDA'].head())