This repository has been archived by the owner on Oct 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path預測日數和準確率繪圖2.py
288 lines (223 loc) · 12.1 KB
/
預測日數和準確率繪圖2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import pandas as pd
import talib
DataFolder = 'Ori_Data/'
Currency_data = pd.read_excel(DataFolder + 'USDtoTWD_Currency_data.xlsx',
index_col = 'Date') # 讀取匯率資料
Currency_data.drop(columns = ['Adj Close'], inplace = True)
df_open = Currency_data['Open']
df_close = Currency_data['Close']
df_high = Currency_data['High']
df_low = Currency_data['Low']
# 處理 x 資料
Currency_data['MA_5'] = talib.SMA(df_close, 5) # 計算 MA5
Currency_data['MA_10'] = talib.SMA(df_close, 10) # 計算 MA10
Currency_data['MA_20'] = talib.SMA(df_close, 20) # 計算 MA20
Currency_data['RSI_14'] = talib.RSI(df_close, 14) # 計算 RSI
macd, macdsignal, macdhist = talib.MACD(df_close, fastperiod = 12,
slowperiod = 26,
signalperiod = 9) # 計算 MACD
Currency_data['MACD'] = macd # 將 MACD 計算結果存回資料中
Currency_data['K'], Currency_data['D'] = \
talib.STOCH(df_high, df_low, df_close, fastk_period = 14,
slowk_period = 14, slowd_period = 3) # 計算 KD
upperband, middleband, lowerband = talib.BBANDS(df_close,
timeperiod=5,
nbdevup=2, nbdevdn=2,
matype=0)
Currency_data['Bollinger Bands Upper'] = upperband
Currency_data['Bollinger Bands Middle'] = middleband
Currency_data['Bollinger Bands lower'] = lowerband
Currency_data['CCI'] = talib.CCI(df_high, df_low, df_close, timeperiod = 14)
Currency_data['MOM'] = talib.MOM(df_close, timeperiod = 10)
Currency_data['BOP'] = talib.BOP(df_open, df_high, df_low, df_close)
Currency_data['WILLR'] = talib.WILLR(df_high, df_low, df_close,
timeperiod = 14)
Currency_data['SAR'] = talib.SAR(df_high, df_low)
Currency_data['AVGPRICE'] = talib.AVGPRICE(df_open, df_high, df_low, df_close)
Currency_data['WCLPRICE'] = talib.WCLPRICE(df_high, df_low, df_close)
Currency_data['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df_close, 14)
Currency_data['WMA'] = talib.WMA(df_close, 30) # 計算 MA5
Currency_data['STDDEV'] = talib.STDDEV (df_close, timeperiod = 5, nbdev = 1)
Currency_data['CDL3BLACKCROWS'] = talib.CDL3BLACKCROWS (df_open, df_high,
df_low, df_close)
# 讀入其他資料進行合併
Fed_Funds_Rate = pd.read_excel(DataFolder + 'Fed_Funds_Rate.xlsx')
USA_CPI = pd.read_excel(DataFolder + 'USA_CPI_Data.xlsx')
USA_Unemployment_Rate = pd.read_excel(DataFolder + 'USA_Unemployment_Rate.xlsx')
TW_CPI = pd.read_excel(DataFolder + 'TW_CPI.xlsx')
USA_GDP = pd.read_excel(DataFolder + 'USA_GDP.xlsx')
TW_Rate = pd.read_excel(DataFolder + 'TW_Rate.xlsx')
DXY_NYB = pd.read_excel(DataFolder + 'Dx-y_data.xlsx')
GOLD_data = pd.read_excel(DataFolder + 'Gold_data.xlsx')
# =============================================================================
# merge_asof,用於合併兩個數據框,其中一個數據框的時間戳(或排序列)可能在另一個數據框中找不到完全對應的記錄。這時,可以根據時間戳的前向或後向對齊進行合併。
# 參數說明
# left: 左側數據欄。
#
# right: 右側數據欄。
#
# on: 要合併的列名(必須在兩個數據框中都有),這通常是時間戳列。
#
# by: 可選,按指定列進行額外分組(例如,根據產品 ID 分組合併)。
#
# left_by 和 right_by: 可選,指定按這些列進行合併的分組。
#
# direction: 合併方向,有三個選項:
# 'backward'(預設):從右側數據框中選擇小於等於左側數據框的最接近值。
# 'forward':從右側數據框中選擇大於等於左側數據框的最接近值。
# 'nearest':從右側數據框中選擇最接近的值。
#
# tolerance: 可選,指定容忍度(以時間間隔為單位),用來限制合併的時間差。
# =============================================================================
df_merge = pd.merge_asof(Fed_Funds_Rate.sort_values('DATE'),
USA_CPI.sort_values('DATE'), on = 'DATE') # 合併資料
df_merge = pd.merge_asof(df_merge.sort_values('DATE'),
USA_Unemployment_Rate.sort_values('DATE'),
on = 'DATE') # 合併資料
TW_CPI = TW_CPI.rename(columns = {'CPI': 'TW_CPI'}) # 欄位名稱調整
df_merge = pd.merge_asof(df_merge.sort_values('DATE'),
TW_CPI.sort_values('DATE'), on = 'DATE') # 合併資料
USA_GDP = USA_GDP.rename(columns = {'GDP': 'US_GDP'}) # 欄位名稱調整
df_merge = pd.merge_asof(df_merge.sort_values('DATE'),
USA_GDP.sort_values('DATE'), on = 'DATE') # 合併資料
df_merge = pd.merge_asof(df_merge.sort_values('DATE'),
TW_Rate.sort_values('DATE'), on = 'DATE') # 合併資料
DXY_NYB = DXY_NYB.rename(columns = {'Date': 'DATE', 'Close': 'USD_Index',
'Growth Rate': 'USD_Index_Growth_Rate'}) # 美元指數小寫改大寫
df_merge = pd.merge_asof(df_merge.sort_values('DATE'),
DXY_NYB.sort_values('DATE'), on = 'DATE') # 合併資料
GOLD_data = GOLD_data.rename(columns = {'Date': 'DATE', 'Open': 'Gold_Open',
'High': 'Gold_High',
'Low': 'Gold_Low',
'Close': 'Gold_Close',
'Adj Close': 'Gold_Adj_Close',
'Volume': 'Gold_Volume',
'Growth Rate': 'Gold_Growth_Rate'}) # 黃金改大寫
df_merge = pd.merge_asof(df_merge.sort_values('DATE'),
GOLD_data.sort_values('DATE'), on = 'DATE') # 合併資料
Currency_data = Currency_data.reset_index()
Currency_data = Currency_data.rename(columns = {'Date': 'DATE'})
df_merge = pd.merge_asof(Currency_data.sort_values('DATE'),
df_merge.sort_values('DATE'), on = 'DATE') # 合併資料
# 計算兩筆資料間差距 (前後或者上下之間)
df_merge['FEDFUNDS_Delta'] = df_merge['FEDFUNDS'].pct_change(periods = 21)
# 因資料特定欄位計算有回朔需求而向前推進抓取時間,設定要排除的期間
end_date = '2004-12-31'
# 排除特定期間內的數據
df_merge.set_index('DATE', inplace = True)
df_merge.drop(df_merge.loc[:end_date].index, inplace = True)
print(df_merge.head())
# 計算差距欄位 (欄位之間)
df_merge['CPI_Delta'] = df_merge['CPIAUCNS'] - df_merge['TW_CPI'] # 兩國 CPI 差距
# 移除無用欄位
df_merge = df_merge.drop(columns = ['Volume', 'BOP', 'CDL3BLACKCROWS',
'Gold_Adj_Close'])
# 讀入近期數據
df_L = pd.read_excel('data_Latest.xlsx')
def classify_return(x):
return 1 if x > 0 else 0 # 標示漲跌,大於0標示為漲(1),小於0標示為跌(0)
def function(num): # 預測日數
# 處理 y 資料
pre_day = num
df_merge[f'Next_{pre_day}Day_Return'] = \
df_merge['Close'].diff(pre_day).shift(-pre_day) # 計算價格變化
# =============================================================================
# diff 函數,計算列中相鄰元素之間的差異。計算當前值與前指定時間點的值(pre_day)的差
# shift 函數﹑移動要指定哪個目標資料,負數表示向上移動,反之向下
# =============================================================================
df_merge['LABEL'] = \
df_merge[f'Next_{pre_day}Day_Return'].apply(
classify_return) # 創造新的一列 LABEL 來記錄漲跌
# 處理 y 資料
pre_day = num
df_L[f'Next_{pre_day}Day_Return'] = \
df_L['Close'].diff(pre_day).shift(-pre_day) # 計算價格變化
df_L['LABEL'] = \
df_L[f'Next_{pre_day}Day_Return'].apply(
classify_return) # 創造新的一列 LABEL 來記錄漲跌
feature_names = ['Gold_Close', 'Gold_High', 'CPIAUCNS', 'Gold_Open', 'UNRATE',
'MA_20', 'MA_10', 'USD_Index_Growth_Rate', 'TW_CPI_Rate',
'WILLR', 'Open', 'K', 'RSI_14', 'Gold_Volume',
'Gold_Growth_Rate', 'FEDFUNDS', 'Bollinger Bands lower',
'Bollinger Bands Upper', 'USA_GDP_Rate']
def split_stock_data(stock_data, label_column,
test_size = 0.3, random_state = 42):
X = stock_data[feature_names].values
y = stock_data[label_column].values
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size = test_size,
random_state = random_state)
return X_train, X_test, y_train, y_test, feature_names
test_list = []
latest_test_scores = []
for i in range(100):
function(i + 1)
# 移除過去新增的欄位
if (i + 1) > 1:
df_merge = df_merge.drop(columns = [f'Next_{i}Day_Return'])
df_L = df_L.drop(columns = [f'Next_{i}Day_Return'])
#print(df_merge.head())
df_merge.to_excel("data.xlsx", index = False) # 將整理好的資料存成 excel
ones_count = (df_merge['LABEL'] == 1).sum()
zero_count = (df_merge['LABEL'] == 0).sum()
df_L.to_excel("data_Latest.xlsx", index = False) # 將整理好的資料存成 excel
ones_count = (df_L['LABEL'] == 1).sum()
zero_count = (df_L['LABEL'] == 0).sum()
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import time
# 讀取數據
df = pd.read_excel('data.xlsx')
df_L = pd.read_excel('data_Latest.xlsx')
label_column = 'LABEL'
# 分割資料
trainX, testX, trainY, testY, feature_names = \
split_stock_data(df, label_column)
Xgboost = XGBClassifier()
start_time = time.time()
Xgboost.fit(trainX, trainY)
training_time = time.time() - start_time
test_predic = Xgboost.predict(testX) # 取得預測的結果
test_acc = Xgboost.score(testX, testY)
print(f'Xgboost({i + 1}) 測試集準確率 %.3f' % test_acc)
# print(f"訓練時間: {training_time // 60:.2f} 分 {training_time % 60:.2f} 秒")
# 0.821
test_list.append(test_acc)
# 最新資料測試
X_Latest = df_L[feature_names].values
y_Latest = df_L['LABEL'].values
latest_data_test_acc = Xgboost.score(X_Latest, y_Latest)
latest_test_scores.append(latest_data_test_acc)
print(f'Xgboost({i + 1}) 近期數據準確率 %.3f' % latest_data_test_acc)
import matplotlib.pyplot as plt
# 顯示每批準確度變化
plt.rcParams['font.family'] = 'Microsoft JhengHei' # 設置中文字體
# 設置背景顏色
plt.gcf().patch.set_facecolor('black') # 設置整個圖表背景為黑色
plt.gca().set_facecolor('black') # 設置坐標軸背景為黑色
# =============================================================================
# 同效果方法二
# fig, ax = plt.subplots()
# fig.patch.set_facecolor('black') # 設置整個圖表背景為黑色
# ax.set_facecolor('black') # 設置坐標軸背景為黑色
# =============================================================================
plt.plot(range(1, len(test_list) + 1), test_list,
label = '測試集準確率', color = 'blue')
plt.plot(range(1, len(test_list) + 1), latest_test_scores,
label = '預測年初至近期準確率', color = 'lime')
plt.plot(range(1, len(test_list) + 1),
[i / len(df_L)for i in range(1, len(test_list) + 1)],
label = '無效資料占比', color = 'r')
plt.xticks([1, 5] + [i for i in range(10, len(test_list), 5)] + \
[len(test_list)],
['1'] + [str(i) for i in range(5, len(test_list), 5)] + \
[len(test_list)],
color = 'white')
plt.xlabel("預測日數", color = 'white', fontsize = 11)
ytick = [i / 100 for i in range(0, 105, 10)]
plt.yticks(ytick, [str(int(i * 100)) + ' %' for i in ytick], color = 'white')
plt.ylabel("準確率", color = 'white', fontsize = 11)
plt.title('預測日數準確率變化', color = 'white', fontsize = 14)
plt.grid(True)
plt.legend(loc = 'lower right', facecolor = 'black', labelcolor = 'w',
fontsize = 10)