# lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
plt.style.use('tableau-colorblind10')  # was split across two reversed lines in the export

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.cluster import OPTICS, KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_selection import RFECV, SequentialFeatureSelector

# homemade
from feature_engineering import cluster
from feature_importance import importance
from labeling import labeling
from mlutil.pkfold import PKFold
머신러닝을 이용한 트레이딩: (4) 피쳐 선정
Feature Selection
trading
feature selection
- Input
- Label: 트렌드 스캐닝 기준 up-trend vs. (down- or no-trend)
- 기간 : 2005 - 2010
- 피쳐: market data features
- Model: 랜덤포레스트
- 5가지 피쳐 선정 기법 비교: original, mda-kmeans, mda-optics, mda-onc, rfecv, sbfs
- Output
- 최상의 방법으로 선정한 피쳐 사용
# Load Samsung market (price) data; the unnamed first CSV column is the date.
market_df = pd.read_csv('C:data/market_samsung.csv')
market_df = market_df.rename(columns={market_df.columns[0]: 'Date'})
market_df['Date'] = pd.to_datetime(market_df['Date'])
market_df.set_index('Date', inplace=True)
# NOTE(review): the export also contained a truncated `drop(columns=...)`
# fragment whose column list was lost — confirm no extra column drop is needed.
market_df.dropna(inplace=True)
# Load the engineered market-data features; same Date-index treatment as market_df.
feature_df = pd.read_csv('C:data/features_samsung.csv')
feature_df = feature_df.rename(columns={feature_df.columns[0]: 'Date'})
feature_df['Date'] = pd.to_datetime(feature_df['Date'])
feature_df.set_index('Date', inplace=True)
# NOTE(review): a truncated `drop(columns=...)` fragment was lost in the export here too.
X = feature_df.dropna()
X.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3873 entries, 2005-11-04 to 2021-10-15
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 momentum_rsi_15 3873 non-null float64
1 momentum_wr_15 3873 non-null float64
2 trend_adx_15 3873 non-null float64
3 trend_aroon_ind_20 3873 non-null float64
4 trend_dpo_20 3873 non-null float64
5 trend_macd_diff_25_10_9 3873 non-null float64
6 trend_mass_index_10_25 3873 non-null float64
7 trend_trix_15 3873 non-null float64
8 volatility_atr_10 3873 non-null float64
9 volatility_ui_15 3873 non-null float64
10 volume_cmf_20 3873 non-null float64
11 volume_fi_15 3873 non-null float64
12 volume_mfi_15 3873 non-null float64
13 volume_sma_em_15 3873 non-null float64
14 volume_vpt 3873 non-null float64
15 ret_10 3873 non-null float64
16 ret_20 3873 non-null float64
17 ret_5 3873 non-null float64
18 std_30 3873 non-null float64
19 individual sma_5 3873 non-null float64
20 individual sma_20 3873 non-null float64
21 foreign sma_5 3873 non-null float64
22 foreign sma_20 3873 non-null float64
23 institutional sma_5 3873 non-null float64
24 institutional sma_20 3873 non-null float64
25 trend_back_scan_20 3873 non-null float64
26 trend_back_scan_60 3873 non-null float64
27 kyle_lambda 3873 non-null float64
28 amihud_lambda 3873 non-null float64
29 hasbrouck_lambda 3873 non-null float64
30 bekker_parkinson_vol 3873 non-null float64
31 corwin_schultz_estimator 3873 non-null float64
dtypes: float64(32)
memory usage: 998.5 KB
피쳐 선정(Feature selection)
클러스터 기반 방법
clustering
from sklearn.preprocessing import StandardScaler

# Standardize features (zero mean, unit variance) and keep the DataFrame
# index/columns, then restrict to data up to end of 2020.
sc = StandardScaler()
X_sc = sc.fit_transform(X)
X_sc = pd.DataFrame(X_sc, index=X.index, columns=X.columns)
X_sc = X_sc[:'2020']
# Cluster the FEATURES (columns, hence the .T) with KMeans; scan k by
# silhouette score to pick the number of feature clusters.
silhouette_coefficients = []
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
for k in range(2, 30):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # NOTE(review): the scan fits on X.T while the final model below fits on
    # X_sc.T — presumably both should use the scaled X_sc; confirm.
    kmeans.fit(X.T)
    score = silhouette_score(X.T, kmeans.labels_)
    silhouette_coefficients.append(score)

# NOTE(review): argmin picks the WORST silhouette; argmax is the usual choice
# — preserved as in the original, but worth confirming.
n_clusters = np.argmin(silhouette_coefficients) + 2
kmeans = KMeans(
    init="random",
    n_clusters=n_clusters,
    n_init=10,
    max_iter=300,
    random_state=42)
kmeans.fit(X_sc.T)
# Map cluster id -> list of feature names assigned to that cluster.
clusters_kmeans = {i: X_sc.columns[np.where(kmeans.labels_ == i)[0]].tolist()
                   for i in np.unique(kmeans.labels_)}
# Alternative feature clustering with OPTICS (density-based, no preset k).
optics = OPTICS(min_cluster_size=2)
optics.fit(X.T)
# Cluster id -> feature names; OPTICS may emit -1 for noise points.
clusters_optics = {i: X_sc.columns[np.where(optics.labels_ == i)[0]].tolist()
                   for i in np.unique(optics.labels_)}
# Slow: ONC-style feature clustering on distance correlation (project helper).
clusters_onc_dist = cluster.get_feature_clusters(X_sc, dependence_metric='distance_correlation')
No feature/s found with low silhouette score. All features belongs to its respective clusters
mda - selection
# labeling: trend-scanning on close prices; binarize to up-trend vs. others.
trend_scanning_window = 60
trend_scanning_q = 3
ts_out = labeling.trend_scanning_label(market_df.close,
                                       window=trend_scanning_window,
                                       q=trend_scanning_q)
mom_label = ts_out[0]
# Maps label 2 -> 2 (up-trend) and labels 0/1 -> 0/1; effectively up-trend vs others.
y = np.sign(mom_label - 1) + 1

# Align scaled features with the label series and drop unmatched rows.
raw_X = X_sc.copy()
tmp = raw_X.join(y).dropna()
X = tmp.iloc[:, :-1]
y = tmp.iloc[:, -1]
# train & test split
# use previous data (2005-2010) for feature selection only
X = X.loc['2005':'2010']
y = y.loc['2005':'2010']

# CV: purged k-fold using the trend-scanning event end times (ts_out[1]).
n_cv = 4
t1 = ts_out[1].loc[X.index]
cv = PKFold(n_cv, t1, 0.01)

# Materialize the three clusterings as lists of feature-name lists.
clusters = [clusters_kmeans[i] for i in range(n_clusters)]
clusters2 = [clusters_optics[i] for i in clusters_optics.keys()]
clusters3 = clusters_onc_dist
# Clustered MDA (mean decrease accuracy) importance for each clustering scheme.
clf = RandomForestClassifier(n_estimators=1000, class_weight='balanced')
mda_cluster = importance.mean_decrease_accuracy(clf, X, y, cv, clustered_subsets=clusters)
mda_cluster2 = importance.mean_decrease_accuracy(clf, X, y, cv, clustered_subsets=clusters2)
mda_cluster3 = importance.mean_decrease_accuracy(clf, X, y, cv, clustered_subsets=clusters3)

# Keep the features belonging to the best (max mean-importance) cluster.
features_mda_kmeans = mda_cluster.loc[mda_cluster['mean'] == mda_cluster['mean'].max()].index
features_mda_optics = mda_cluster2.loc[mda_cluster2['mean'] == mda_cluster2['mean'].max()].index
features_mda_onc_dist = mda_cluster3.loc[mda_cluster3['mean'] == mda_cluster3['mean'].max()].index

# Changed from 0 to the min value. (translated from Korean comment)
new_X1 = X[features_mda_kmeans]
new_X2 = X[features_mda_optics]
new_X3 = X[features_mda_onc_dist]
비-클러스터링 방법
- RFECV (Recursive Feature Elimination with CV)
# 오래걸림
# Slow: recursive feature elimination with purged CV.
rf = RandomForestClassifier(class_weight='balanced')
min_features_to_select = 2  # minimum number of features to consider
rfecv = RFECV(
    estimator=rf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
)
new_X5_ = rfecv.fit_transform(X, y)
new_X5 = pd.DataFrame(new_X5_, index=X.index, columns=rfecv.get_feature_names_out())
# Compare the original feature set against the four selected subsets with
# purged-CV accuracy / f1 / ROC-AUC under a balanced random forest.
X_list = [X, new_X1, new_X2, new_X3, new_X5]

clf = RandomForestClassifier(class_weight='balanced')
score_list = []
for X_ in X_list:
    accs = []
    f1 = []
    roc_auc = []
    for train, test in cv.split(X_, y):
        clf.fit(X_.iloc[train], y.iloc[train])
        y_true = y.iloc[test]
        y_pred = clf.predict(X_.iloc[test])
        y_probs = clf.predict_proba(X_.iloc[test])
        y_probs = y_probs[:, 1]  # probability of the positive class
        accs.append(accuracy_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))
        roc_auc.append(roc_auc_score(y_true, y_probs))
    # Average each metric over the CV folds.
    scores = [np.mean(accs), np.mean(f1), np.mean(roc_auc)]
    score_list.append(scores)

result_df = pd.DataFrame(score_list,
                         columns=['accuracy', 'f1 score', 'roc auc score'],
                         index=['original', 'mda_kmeans', 'mda_optics', 'mda_onc', 'rfecv'])
result_df['mean_'] = result_df.mean(axis=1)
result_df.sort_values('mean_', ascending=False)
accuracy | f1 score | roc auc score | mean_ | |
---|---|---|---|---|
rfecv | 0.641745 | 0.279223 | 0.535976 | 0.485648 |
original | 0.630062 | 0.243987 | 0.510087 | 0.461379 |
mda_kmeans | 0.596573 | 0.251614 | 0.504474 | 0.450887 |
mda_onc | 0.592679 | 0.196722 | 0.464315 | 0.417905 |
mda_optics | 0.591900 | 0.156097 | 0.479004 | 0.409001 |
RFECV 기법으로 선정한 피쳐가 가장 좋은 성능을 보인다.
# best features: take the feature set with the highest mean score.
# NOTE(review): .iloc[0:2] keeps only the first two ROWS — presumably just a
# sample to inspect/export the column set; confirm intent.
selected_features = X_list[result_df['mean_'].argmax()].iloc[0:2]
selected_features.columns
Index(['trend_adx_15', 'trend_mass_index_10_25', 'trend_trix_15',
'volatility_atr_10', 'volatility_ui_15', 'volume_cmf_20',
'volume_mfi_15', 'std_30', 'individual sma_20', 'foreign sma_20',
'institutional sma_20', 'trend_back_scan_60', 'kyle_lambda',
'amihud_lambda', 'hasbrouck_lambda'],
dtype='object')
# Persist the selected features for the downstream modeling step.
selected_features.to_csv('C:data/selected_features.csv')