머신러닝을 이용한 트레이딩: (4) 피쳐 선정

Feature Selection

trading
feature selection
Author

Cheonghyo Cho

# lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns;sns.set()
plt.style.use('tableau-colorblind10')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

from sklearn.cluster import OPTICS, KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_selection import RFECV, SequentialFeatureSelector

# homemade
from feature_engineering import cluster
from feature_importance import importance
from labeling import labeling
from mlutil.pkfold import PKFold
def _load_dated_csv(path):
    """Load a CSV whose first column is a date string.

    Returns a DataFrame with a DatetimeIndex (named 'Date') built from that
    column, the raw date column dropped, and NaN rows removed.
    """
    df = pd.read_csv(path)
    df = df.rename(columns={df.columns[0]: 'Date'})
    df.index = pd.to_datetime(df.Date)
    df.drop(columns='Date', inplace=True)
    df.dropna(inplace=True)
    return df

# Market (price) data and the engineered features share the same on-disk layout.
market_df = _load_dated_csv('C:data/market_samsung.csv')
feature_df = _load_dated_csv('C:data/features_samsung.csv')

# feature_df is already NaN-free after loading; dropna() here only serves to
# give X an independent copy, so feature_df itself is never mutated later.
X = feature_df.dropna()
X.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3873 entries, 2005-11-04 to 2021-10-15
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   momentum_rsi_15           3873 non-null   float64
 1   momentum_wr_15            3873 non-null   float64
 2   trend_adx_15              3873 non-null   float64
 3   trend_aroon_ind_20        3873 non-null   float64
 4   trend_dpo_20              3873 non-null   float64
 5   trend_macd_diff_25_10_9   3873 non-null   float64
 6   trend_mass_index_10_25    3873 non-null   float64
 7   trend_trix_15             3873 non-null   float64
 8   volatility_atr_10         3873 non-null   float64
 9   volatility_ui_15          3873 non-null   float64
 10  volume_cmf_20             3873 non-null   float64
 11  volume_fi_15              3873 non-null   float64
 12  volume_mfi_15             3873 non-null   float64
 13  volume_sma_em_15          3873 non-null   float64
 14  volume_vpt                3873 non-null   float64
 15  ret_10                    3873 non-null   float64
 16  ret_20                    3873 non-null   float64
 17  ret_5                     3873 non-null   float64
 18  std_30                    3873 non-null   float64
 19  individual sma_5          3873 non-null   float64
 20  individual sma_20         3873 non-null   float64
 21  foreign sma_5             3873 non-null   float64
 22  foreign sma_20            3873 non-null   float64
 23  institutional sma_5       3873 non-null   float64
 24  institutional sma_20      3873 non-null   float64
 25  trend_back_scan_20        3873 non-null   float64
 26  trend_back_scan_60        3873 non-null   float64
 27  kyle_lambda               3873 non-null   float64
 28  amihud_lambda             3873 non-null   float64
 29  hasbrouck_lambda          3873 non-null   float64
 30  bekker_parkinson_vol      3873 non-null   float64
 31  corwin_schultz_estimator  3873 non-null   float64
dtypes: float64(32)
memory usage: 998.5 KB

피쳐 선정(Feature selection)

클러스터 기반 방법

clustering

from sklearn.preprocessing import StandardScaler

# Standardize the features so distance-based clustering is not dominated by
# the features with the largest raw scale.
sc = StandardScaler()
X_sc = sc.fit_transform(X)
X_sc = pd.DataFrame(X_sc, index=X.index, columns=X.columns)
X_sc = X_sc[:'2020']  # use pre-2021 data only for feature selection

# --- choose k for KMeans via the silhouette score --------------------------
silhouette_coefficients = []
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

for k in range(2, 30):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    # BUG FIX: cluster the *scaled* matrix (features as samples via .T),
    # consistent with the final KMeans fit below which also uses X_sc.T.
    kmeans.fit(X_sc.T)
    score = silhouette_score(X_sc.T, kmeans.labels_)
    silhouette_coefficients.append(score)

# BUG FIX: a *higher* silhouette score means better-separated clusters, so
# pick the k that maximizes it — np.argmin selected the worst k.
n_clusters = np.argmax(silhouette_coefficients) + 2  # +2: the scan started at k=2

kmeans = KMeans(
    init="random",
    n_clusters=n_clusters,
    n_init=10,
    max_iter=300,
    random_state=42)
kmeans.fit(X_sc.T)
# Map each cluster label to the list of feature names in that cluster.
clusters_kmeans = {i: X_sc.columns[np.where(kmeans.labels_ == i)[0]].tolist() for i in np.unique(kmeans.labels_)}

# Density-based alternative: OPTICS on the same scaled matrix (the original
# fit unscaled X.T here, inconsistent with the KMeans fit above).
optics = OPTICS(min_cluster_size=2)
optics.fit(X_sc.T)
clusters_optics = {i: X_sc.columns[np.where(optics.labels_ == i)[0]].tolist() for i in np.unique(optics.labels_)}

# Slow: ONC-style clustering on the distance-correlation dependence metric.
clusters_onc_dist = cluster.get_feature_clusters(X_sc, dependence_metric= 'distance_correlation')
No feature/s found with low silhouette score. All features belongs to its respective clusters

mda - selection

# Labeling: trend-scanning labels computed from the close-price series.
trend_scanning_window, trend_scanning_q = 60, 3
ts_out = labeling.trend_scanning_label(
    market_df.close,
    window=trend_scanning_window,
    q=trend_scanning_q,
)
mom_label = ts_out[0]
y = np.sign(mom_label - 1) + 1  # 1 = up-trend, 0 = everything else

raw_X = X_sc.copy()

# Align features and labels on a common index, dropping rows missing either.
tmp = raw_X.join(y).dropna()
X, y = tmp.iloc[:, :-1], tmp.iloc[:, -1]

# Restrict to the early sample so feature selection never sees later data.
X = X.loc['2005':'2010']
y = y.loc['2005':'2010']

# Purged k-fold CV keyed on label end-times (t1), with a 1% embargo.
n_cv = 4
t1 = ts_out[1].loc[X.index]
cv = PKFold(n_cv, t1, 0.01)

# Candidate feature groupings from the three clustering schemes above.
clusters = [clusters_kmeans[i] for i in range(n_clusters)]
clusters2 = list(clusters_optics.values())
clusters3 = clusters_onc_dist

clf = RandomForestClassifier(n_estimators=1000, class_weight='balanced')
mda_cluster = importance.mean_decrease_accuracy(clf, X, y, cv, clustered_subsets=clusters)
mda_cluster2 = importance.mean_decrease_accuracy(clf, X, y, cv, clustered_subsets=clusters2)
mda_cluster3 = importance.mean_decrease_accuracy(clf, X, y, cv, clustered_subsets=clusters3)

# Keep every cluster whose mean MDA importance ties the maximum.
features_mda_kmeans = mda_cluster.loc[mda_cluster['mean'] == mda_cluster['mean'].max()].index
features_mda_optics = mda_cluster2.loc[mda_cluster2['mean'] == mda_cluster2['mean'].max()].index
features_mda_onc_dist = mda_cluster3.loc[mda_cluster3['mean'] == mda_cluster3['mean'].max()].index

# Reduced design matrices: one per clustering-based selection method.
new_X1 = X[features_mda_kmeans]
new_X2 = X[features_mda_optics]
new_X3 = X[features_mda_onc_dist]

비-클러스터링 방법

  • RFECV(Recursive Feature Elimination with CV)
# NOTE: this step is slow.

rf = RandomForestClassifier(class_weight='balanced')

# Recursive feature elimination with purged CV; keep at least 2 features.
min_features_to_select = 2  # Minimum number of features to consider
rfecv = RFECV(estimator=rf,
              step=1,
              cv=cv,
              scoring="accuracy",
              min_features_to_select=min_features_to_select)
new_X5_ = rfecv.fit_transform(X, y)
new_X5 = pd.DataFrame(new_X5_,
                      index=X.index,
                      columns=rfecv.get_feature_names_out())

# Candidate design matrices: the original plus each method's selected subset.
X_list = [X, new_X1, new_X2, new_X3, new_X5]

results

clf = RandomForestClassifier(class_weight='balanced')

# Cross-validated comparison of every candidate feature set.
score_list = []
for X_ in X_list:
    fold_scores = []  # one (accuracy, f1, roc_auc) tuple per CV fold

    for train, test in cv.split(X_, y):
        clf.fit(X_.iloc[train], y.iloc[train])
        y_true = y.iloc[test]
        y_pred = clf.predict(X_.iloc[test])
        y_probs = clf.predict_proba(X_.iloc[test])[:, 1]  # P(positive class)
        fold_scores.append((accuracy_score(y_true, y_pred),
                            f1_score(y_true, y_pred),
                            roc_auc_score(y_true, y_probs)))

    # Average each metric over the folds.
    score_list.append(list(np.mean(fold_scores, axis=0)))

result_df = pd.DataFrame(
    score_list,
    columns=['accuracy', 'f1 score', 'roc auc score'],
    index=['original', 'mda_kmeans', 'mda_optics', 'mda_onc', 'rfecv'])
result_df['mean_'] = result_df.mean(axis=1)
result_df.sort_values('mean_', ascending=False)
accuracy f1 score roc auc score mean_
rfecv 0.641745 0.279223 0.535976 0.485648
original 0.630062 0.243987 0.510087 0.461379
mda_kmeans 0.596573 0.251614 0.504474 0.450887
mda_onc 0.592679 0.196722 0.464315 0.417905
mda_optics 0.591900 0.156097 0.479004 0.409001

RFECV 기법으로 선정한 피쳐가 가장 좋은 성능을 보인다.

# Best-performing feature set. result_df rows are still in X_list order
# (sort_values above was displayed, not assigned back), so argmax maps
# directly to an X_list position.
best_idx = result_df['mean_'].argmax()

# BUG FIX: the original sliced `.iloc[0:2]`, so only the first two ROWS of
# the winning feature matrix were written to disk; persist the full matrix.
selected_features = X_list[best_idx]
selected_features.columns
selected_features.to_csv('C:data/selected_features.csv')