# lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
# Apply matplotlib's 'tableau-colorblind10' style sheet to all later figures.
# This runs after sns.set(), so it overrides the colours seaborn configured.
plt.style.use('tableau-colorblind10')
# different models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score
# homemade
from feature_engineering import dimension_reduction as DR
from features import tautil
from labeling import labeling
from backtest import round_trip
from triple_barrier import make_rt
from mlutil.pkfold import PKFold
머신러닝을 이용한 트레이딩: (7) 매매 신뢰도 측정과 전략 강화
Enhancing the Strategy and Bet Confidence
trading
enhancing
machine learning
전략 강화 모형
inputs
- 라벨: 매매 규칙에 따른 전략의 결과 = 각 매매 성공/실패 여부
models: SVM, Random Forest, Gradient Boosting, LSTM
outputs
- 매매 신뢰도 (bet confidence)
매매 신뢰도를 이용한 전략 강화
import warnings

# Suppress every warning category for the rest of the session so that
# library deprecation chatter does not clutter the notebook output.
warnings.filterwarnings(action='ignore')
get X,y
# Load the market data and index it by date.
# NOTE(review): 'C:data/...' is a drive-relative Windows path (no slash after
# the drive letter) -- confirm this is the intended location.
market_df = pd.read_csv('C:data/market_samsung.csv')
# The first CSV column carries the dates; give it an explicit name.
market_df = market_df.rename(columns={market_df.columns[0]: 'Date'})
market_df.index = pd.to_datetime(market_df.Date)
market_df.drop(columns='Date', inplace=True)
market_df.dropna(inplace=True)
# Closing prices for 2010 through 2020 (label-based slice on the date index).
close = market_df['close'].loc['2010':'2020']
# Load the pre-computed feature table and index it by date.
feature_df = pd.read_csv('C:data/features_samsung.csv')
# The first CSV column carries the dates; give it an explicit name.
feature_df = feature_df.rename(columns={feature_df.columns[0]: 'Date'})
feature_df.index = pd.to_datetime(feature_df.Date)
feature_df.drop(columns='Date', inplace=True)
# Drop every row that has at least one missing feature value.
feature_df.dropna(inplace=True)
# Names of the features to keep: the header of selected_features.csv minus
# its first column.
selected_features = pd.read_csv('C:data/selected_features.csv').columns[1:]
feature = feature_df.dropna()
feature = feature[selected_features]

# Standardise each selected feature, then restore the original index and
# column labels that fit_transform() discards.
sc = StandardScaler()
X_sc = sc.fit_transform(feature)
X_sc = pd.DataFrame(X_sc, index=feature.index, columns=feature.columns)
# Benchmark barrier table: indexed by its 'Date' column, with the 'exit'
# column parsed to timestamps.
barrier_bm = pd.read_csv('C:data/barrier_bm.csv')
barrier_bm.index = pd.to_datetime(barrier_bm.Date)
# Assign through [] so the existing column is always the target.
barrier_bm['exit'] = pd.to_datetime(barrier_bm['exit'])
barrier_bm.drop(columns='Date', inplace=True)
# Labeling: barrier table of the primary strategy, indexed by its 'Date'
# column, with the 'exit' column parsed to timestamps.
barrier = pd.read_csv('C:data/barrier.csv')
barrier.index = pd.to_datetime(barrier.Date)
# Assign through [] so the existing column is always the target.
barrier['exit'] = pd.to_datetime(barrier['exit'])
barrier.drop(columns='Date', inplace=True)

# Build round trips from the closing prices and the complete barrier rows.
# NOTE(review): make_rt is a project helper; it is assumed to return a frame
# with 'rt_returns' and 'open_dt' columns -- confirm against triple_barrier.
rts = make_rt(close, barrier.dropna())
outcome = rts.rt_returns
# Re-index each round-trip return by the time the trade was opened.
outcome.index = rts.open_dt
# Meta-label: map each trade outcome to a binary win/lose flag.
# sign(outcome) is -1/0/+1; adding 1 gives 0/1/2; the outer sign() collapses
# that to 0 for a negative return and 1 for a zero or positive return.
wl = np.sign(np.sign(outcome) + 1)
y_ = wl
y_.value_counts()  # notebook display: class balance of the meta-label
1.0 608
0.0 421
Name: rt_returns, dtype: int64
# Count each class once; default to 0 if a class never occurs.
label_counts = wl.value_counts()
loss = label_counts.get(0, 0)
win = label_counts.get(1, 0)

# Mark every trade on the price axis: dots for wins, crosses for losses.
plt.figure(figsize=(10, 3))
plt.scatter(wl[wl == 1].index, close.loc[wl[wl == 1].index], alpha=0.5)
plt.scatter(wl[wl == 0].index, close.loc[wl[wl == 0].index], marker='x', alpha=0.5)
plt.legend(['win 1', 'lose 0'])
plt.title('y (meta-label): win {}, lose {}'.format(win, loss))
plt.show()
# Align features with the meta-label: join on the date index and keep only
# rows where both the features and the label are present.
raw_X = X_sc.copy()
tmp = raw_X.join(y_).dropna()
# The label was joined last, so it is the final column.
X = tmp.iloc[:, :-1]
y = tmp.iloc[:, -1]
Model Construction
# Choose model
# Cross Validation (k-fold)
n_cv = 4
# Exit timestamp of each sample's trade, aligned to the rows of X.
t1 = pd.to_datetime(barrier.exit.loc[X.index])
# NOTE(review): PKFold is a project helper; the arguments are presumably
# (n_splits, label end times, embargo) -- confirm against mlutil.pkfold.
cv = PKFold(n_cv, t1, 0)
# Choose model (SVM-rbf)
# Grid-search the regularisation strength C, scoring by precision.
C = [0.1, 1, 10]
param_grid_rbf = dict(C=C)
# probability=True is required so that predict_proba() is available later.
svc_rbf = SVC(kernel='rbf', probability=True)
gs_svc_rbf = GridSearchCV(estimator=svc_rbf, param_grid=param_grid_rbf, cv=cv, scoring='precision')

gs_svc_rbf.fit(X, y)
svc_best = gs_svc_rbf.best_estimator_
svc_best  # notebook display of the selected estimator
SVC(C=10, probability=True)
# Random forest: grid-search the number of trees, scoring by precision.
n_estimators = [200, 1000]
#max_depth = [3,7]
param_grid_rfc = dict(n_estimators=n_estimators)
rfc = RandomForestClassifier()
gs_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, cv=cv, scoring='precision')

gs_rfc.fit(X, y)
rfc_best = gs_rfc.best_estimator_
rfc_best  # notebook display of the selected estimator
RandomForestClassifier(n_estimators=200)
# AdaBoost: grid-search the number of estimators and the learning rate,
# scoring by precision.
n_estimators_ab = [50, 100]
learning_rate = [1, 0.1]
param_grid_abc = dict(n_estimators=n_estimators_ab, learning_rate=learning_rate)

abc = AdaBoostClassifier()
gs_abc = GridSearchCV(estimator=abc, param_grid=param_grid_abc, cv=cv, scoring='precision')

gs_abc.fit(X, y)
ada_best = gs_abc.best_estimator_
ada_best  # notebook display of the selected estimator
AdaBoostClassifier(learning_rate=1, n_estimators=100)
# Gradient boosting: grid-search the number of estimators and the learning
# rate, scoring by precision.
n_estimators_gb = [100, 200]
learning_rate = [0.1, 0.01]
param_grid_gbc = dict(n_estimators=n_estimators_gb, learning_rate=learning_rate)
gbc = GradientBoostingClassifier()
gs_gbc = GridSearchCV(estimator=gbc, param_grid=param_grid_gbc, cv=cv, scoring='precision')

gs_gbc.fit(X, y)
gbc_best = gs_gbc.best_estimator_
gbc_best  # notebook display of the selected estimator
GradientBoostingClassifier(learning_rate=0.01, n_estimators=200)
Model
# Tuned classifiers and the display name of each, in matching order.
clf_list = [svc_best, rfc_best, ada_best, gbc_best]
estimators = ['SVM_best', 'RF_best', 'AdaBoost_best', 'GradientBoost_best']
scores_list = []    # one [accuracy, f1, roc auc, precision] row per model
y_preds_list = []   # out-of-fold class predictions per model
y_probs_list = []   # out-of-fold predicted probabilities per model

# for ML model prediction
for clf in clf_list:
    y_preds_ = []
    y_probs_ = []

    # Refit on each training fold and collect predictions for the held-out
    # fold, so every sample is scored by a model that did not train on it.
    for train, test in cv.split(X, y):
        clf.fit(X.iloc[train], y.iloc[train])
        # `train`/`test` are positions, so use .iloc; plain y[test] would be
        # treated as a label lookup on the date index.
        y_true = y.iloc[test]
        y_pred = clf.predict(X.iloc[test])
        # Keep only the second probability column (classes_[1]).
        y_probs = clf.predict_proba(X.iloc[test])[:, 1]
        y_preds_.append(pd.Series(y_pred, index=y_true.index))
        y_probs_.append(pd.Series(y_probs, index=y_true.index))

    # Stitch the folds back together and score against the matching labels.
    y_preds__ = pd.concat(y_preds_)
    y_probs__ = pd.concat(y_probs_)
    y_true__ = y.loc[y_preds__.index]
    accs = accuracy_score(y_true__, y_preds__)
    f1 = f1_score(y_true__, y_preds__)
    roc = roc_auc_score(y_true__, y_probs__)
    prec = precision_score(y_true__, y_preds__)
    score = [accs, f1, roc, prec]

    scores_list.append(score)
    y_preds_list.append(y_preds__)
    y_probs_list.append(y_probs__)
# One row per model, one column per metric; show the most precise model first.
results = pd.DataFrame(scores_list, columns=['accuracy', 'f1 score', 'roc auc score', 'precision score'], index=estimators)
result_show = results.sort_values('precision score', ascending=False)
result_show  # notebook display
 | accuracy | f1 score | roc auc score | precision score |
---|---|---|---|---|
AdaBoost_best | 0.567541 | 0.631927 | 0.552471 | 0.635607 |
SVM_best | 0.544218 | 0.585323 | 0.574228 | 0.632887 |
RF_best | 0.549077 | 0.657817 | 0.537073 | 0.596257 |
GradientBoost_best | 0.519922 | 0.609177 | 0.490364 | 0.586890 |
# Collect each model's out-of-fold win probability as one column per model.
y_probs_df = pd.DataFrame()
for name, probs in zip(estimators, y_probs_list):
    y_probs_df[name] = probs

# Average across models
pred_prob = pd.Series(y_probs_df.mean(axis=1), index=y_probs_df.index)

# Alternative: use a single model instead of the average
#y_probs_df_2 = y_probs_df[estimators[3]]
#pred_prob = pd.Series(y_probs_df_2,index=y_probs_df_2.index)

# normalize() works row-wise, so the series is transposed into a single row
# and the whole probability vector is scaled together.
pred_prob2 = pd.Series(normalize(pred_prob.to_frame().T).reshape(-1,), index=y_probs_df.index).rename('bet_confidence')
# Rescale to [0, 1] so the value can be compared against a threshold.
bet_confidence = pd.Series(MinMaxScaler().fit_transform(pred_prob2.to_frame()).reshape(-1,), index=y_probs_df.index).rename('bet_confidence')
# Histogram of the bet-confidence values (30 bins).
plt.title('Bet confidence distribution')
plt.hist(bet_confidence, bins=30)
plt.xlabel('Bet confidence')
plt.ylabel('counts')
Text(0, 0.5, 'counts')
# Closing price at each trade date that has a bet-confidence value.
c = close.loc[bet_confidence.index]

# Faint price line with the trades overlaid, coloured by bet confidence.
plt.figure(figsize=(10, 5))
plt.title('Bet confidence')
plt.plot(close, alpha=0.1)
plt.scatter(c.index, c, c=bet_confidence, s=20, cmap='vlag', vmin=0, vmax=1)
plt.colorbar()
plt.show()
Algo Trading Backtest
# Three trade sets to compare:
#   benchmark - every complete row of the benchmark barrier table
#   before    - primary-strategy trades that received a bet confidence
#   enhanced  - the subset of those trades with confidence above 0.5
barrier_bm = barrier_bm.dropna()
barrier_before = barrier.loc[bet_confidence.index].dropna()
barrier_enhanced = barrier_before.loc[bet_confidence.loc[bet_confidence > 0.5].index]

# Round trips for each trade set, built from the same closing prices.
rts_bm = make_rt(close, barrier_bm)
rts_before = make_rt(close, barrier_before)
rts_enhanced = make_rt(close, barrier_enhanced)
# Side-by-side summary of benchmark, primary and enhanced strategies.
# NOTE(review): get_df_ann_sr is a project helper; it is assumed to return a
# one-column frame named by its second argument -- confirm against backtest.
result1 = pd.concat(
    [
        round_trip.get_df_ann_sr(rts_bm, 'Benchmark', years=11),
        round_trip.get_df_ann_sr(rts_before, 'Trading Strategy (Primary)', years=11),
    ],
    axis=1,
)

df_sr = round_trip.get_df_ann_sr(rts_enhanced, 'Enhanced Trading Strategy (Second)', years=11)
result1 = result1.join(df_sr)
result1  # notebook display
 | Benchmark | Trading Strategy (Primary) | Enhanced Trading Strategy (Second) |
---|---|---|---|
avg_n_bets_per_year | 246.272727 | 93.545455 | 49.636364 |
win_ratio | 0.520506 | 0.590467 | 0.612844 |
annualized_sharpe_ratio | 0.538232 | 1.525995 | 1.623284 |
result2 = pd.concat(
    [
        round_trip.get_df_ann_sr(rts_bm, 'Benchmark', years=11),
        round_trip.get_df_ann_sr(rts_before, 'Trading Strategy (Primary)', years=11),
    ],
    axis=1,
)

# Sweep the bet-confidence threshold from 0.1 to 0.9 in nine steps and
# record the win ratio of the trades kept at each threshold.
thresholds = np.linspace(0.1, 0.9, 9)
winr = []
for i in thresholds:
    barrier_enhanced_ = barrier_before.loc[bet_confidence.loc[bet_confidence >= i].index]
    rts_enhanced_ = make_rt(close, barrier_enhanced_)
    df_sr = round_trip.get_df_ann_sr(rts_enhanced_, 'b', years=11)
    # .iloc[0] takes the single strategy's value by position; a bare [0]
    # would be treated as a label lookup on a non-integer index.
    winr.append(df_sr.T.win_ratio.iloc[0])

# Threshold (rounded to 2 decimals) -> win ratio.
dict_ = dict(zip(thresholds.round(2), winr))
# One row per threshold; the single column (labelled 0) holds the win ratio.
df_res = pd.DataFrame.from_dict(dict_, orient='index')

# Bars and a connecting line showing win ratio against the threshold.
plt.figure(figsize=(10, 5))
plt.title("Hit-ratio of different thresholds strategy")
plt.bar(df_res.index, df_res[0], width=0.05)
plt.plot(df_res)
plt.ylabel('win ratio')
plt.xlabel('bet confidence threshold')
plt.ylim(0.5, 0.8)
plt.show()
매매 신뢰도를 이용한 전략 강화 결과 win ratio가 상승했으며, 신뢰도의 임계치에 따라 결과가 다른데, 임계치를 높일수록 결과가 좋다.