Feature Importances
ㄴ A measure of how much each feature contributes to a machine learning model's predictions
ㄴ Useful during modeling for feature selection and for understanding which features matter (a SelectFromModel sketch follows the iris example below)
=> selecting the important features or removing unnecessary ones --> reduces model complexity and helps prevent overfitting
ㄴ Typically available from tree-based algorithms such as Decision Tree, Random Forest, and Gradient Boosting
ㄴ Each feature's importance measures how much that feature contributes to the trees' split decisions
ㄴ Values fall between 0 and 1, and the importances of all features sum to 1
ㄴ Visualizing the importances makes it easy to compare the relative importance of the features
ㄴ Caution: feature importances do not imply causation --> interpret them alongside feature correlations and domain knowledge (a permutation-based alternative is sketched right below)
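One alternative worth knowing when interpreting these values: impurity-based importances from trees can be biased toward high-cardinality features, and sklearn.inspection.permutation_importance instead measures how much the test score drops when one feature's values are shuffled. A minimal sketch on iris (my own addition, not part of the course files; variable names are illustrative):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42)
model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
# shuffle each feature n_repeats times on the test set and record the score drop
result = permutation_importance(model, x_test, y_test,
                                n_repeats=10, random_state=42)
print(result.importances_mean)  # one mean score drop per feature

Unlike feature_importances_, these values are not normalized to sum to 1; they are score drops, so a value near 0 means the model barely relies on that feature.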
RandomForestClassifier and DecisionTreeClassifier
ml09_feature_importance_iris.py
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77) # fix TensorFlow's random seed (note: this does not affect the sklearn models below)
# 1. Data
datasets = load_iris() # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train,y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11 # an odd number of splits is commonly used
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train) # fit the scaler on the training set only
x_train = scaler.transform(x_train) # transform train with the fitted scaler
x_test = scaler.transform(x_test) # transform test with the same scaler (never refit on test)
# 2. Model
model = RandomForestClassifier()
# 3. Train
model.fit(x_train, y_train)
# 4. Evaluate, predict
score = cross_val_score(model,
x_train, y_train,
cv=kfold) # cv: cross-validation
# print('cv acc : ', score) # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)
# cv pred : [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0] # predictions for the 20% test split (the other 80% went to training)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# cv pred acc : 0.9333333333333333
# StratifiedKFold cv pred acc : 0.9333333333333333
###### feature importance ##########
print(model, " : ", model.feature_importances_)
# cv pred acc : 0.9666666666666667
# RandomForestClassifier() : [0.10919274 0.0294921 0.41943273 0.44188242]
# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('iris Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)
plt.show()
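As noted at the top, the importances can also drive feature selection automatically. A minimal sketch (my own addition, not part of the course file) using sklearn.feature_selection.SelectFromModel on the model fitted above to keep only features whose importance is above the mean:

from sklearn.feature_selection import SelectFromModel

# prefit=True reuses the already-fitted model; threshold='mean' keeps above-average features
selector = SelectFromModel(model, threshold='mean', prefit=True)
x_train_sel = selector.transform(x_train)
x_test_sel = selector.transform(x_test)
print(selector.get_support()) # boolean mask of the kept features
print(x_train_sel.shape)      # fewer columns than the original x_train

A new model can then be fitted on x_train_sel and its score compared against the full-feature run.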
ml09_feature_importance_cancer.py
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77) # fix TensorFlow's random seed (note: this does not affect the sklearn models below)
# 1. Data
datasets = load_breast_cancer() # binary classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train,y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11 # an odd number of splits is commonly used
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train) # fit the scaler on the training set only
x_train = scaler.transform(x_train) # transform train with the fitted scaler
x_test = scaler.transform(x_test) # transform test with the same scaler (never refit on test)
# 2. Model
model = RandomForestClassifier()
# model = DecisionTreeClassifier()
# 3. Train
model.fit(x_train, y_train)
# 4. Evaluate, predict
score = cross_val_score(model,
x_train, y_train,
cv=kfold) # cv: cross-validation
# print('cv acc : ', score) # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
###### feature importance ##########
print(model, " : ", model.feature_importances_)
# cv pred acc : 0.9333333333333333
# RandomForestClassifier() : [0.07901659 0.0336995 0.44851384 0.43877007]
# cv pred acc : 0.9210526315789473
# DecisionTreeClassifier() : [0. 0.05847766 0. 0. 0. 0.
# 0. 0.69141955 0. 0. 0.01198257 0.
# 0. 0. 0.00123678 0. 0.01855447 0.01593081
# 0. 0. 0.05229927 0.00937631 0.05149396 0.
# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('cancer Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)
plt.show()
# cv pred acc : 0.956140350877193
# RandomForestClassifier() : [0.02443992 0.01231809 0.04860259 0.03692516 0.00602853 0.01530975
# 0.04929247 0.12535733 0.00450328 0.00613796 0.00773211 0.0045192
# 0.01451287 0.0626458 0.00414944 0.0043145 0.00862259 0.00404682
# 0.00417373 0.00526797 0.09041171 0.01994775 0.07992037 0.10940668
# 0.0124072 0.02641721 0.03291273 0.1608525 0.01100217 0.00782157]
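With 30 features, the bar chart above gets crowded. A minimal sketch (my own addition, reusing model and datasets from the cancer script above) that sorts the importances and plots only the top 10 features:

import numpy as np
import matplotlib.pyplot as plt

importances = model.feature_importances_
top10 = np.argsort(importances)[-10:] # indices of the 10 largest importances, ascending
plt.barh(range(10), importances[top10], align='center')
plt.yticks(np.arange(10), np.array(datasets.feature_names)[top10])
plt.title('cancer Feature Importances (top 10)')
plt.xlabel('Importances')
plt.show()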
ml09_feature_importance_wine.py
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77) # fix TensorFlow's random seed (note: this does not affect the sklearn models below)
# 1. Data
datasets = load_wine() # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train,y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11 # an odd number of splits is commonly used
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train) # fit the scaler on the training set only
x_train = scaler.transform(x_train) # transform train with the fitted scaler
x_test = scaler.transform(x_test) # transform test with the same scaler (never refit on test)
# 2. Model
# model = RandomForestClassifier()
model = DecisionTreeClassifier()
# 3. Train
model.fit(x_train, y_train)
# 4. Evaluate, predict
score = cross_val_score(model,
x_train, y_train,
cv=kfold) # cv: cross-validation
# print('cv acc : ', score) # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
###### feature importance ##########
print(model, " : ", model.feature_importances_)
# cv pred acc : 0.9166666666666666
# DecisionTreeClassifier() : [0.01899507 0. 0.02094206 0. 0. 0.
# 0.41105326 0. 0. 0.38493424 0. 0.
# 0.16407537]
# cv pred acc : 0.9444444444444444
# RandomForestClassifier() : [0.10694432 0.02804456 0.01501634 0.04239458 0.03297987 0.05527102
# 0.14608004 0.01357275 0.02125219 0.19027345 0.07083705 0.13506052
# 0.1422733 ]
# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('wine Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)
plt.show()
RandomForestRegressor and DecisionTreeRegressor
ml09_feature_importance_california.py
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77) # fix TensorFlow's random seed (note: this does not affect the sklearn models below)
# 1. Data
datasets = fetch_california_housing() # regression
x = datasets['data']
y = datasets.target
x_train, x_test, y_train,y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11 # an odd number of splits is commonly used
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train) # fit the scaler on the training set only
x_train = scaler.transform(x_train) # transform train with the fitted scaler
x_test = scaler.transform(x_test) # transform test with the same scaler (never refit on test)
# 2. Model
model = RandomForestRegressor()
# model = DecisionTreeRegressor()
# 3. Train
model.fit(x_train, y_train)
# 4. Evaluate, predict
score = cross_val_score(model,
x_train, y_train,
cv=kfold) # cv: cross-validation
# print('cv acc : ', score) # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)
r2 = r2_score(y_test, y_predict)
print('cv pred acc : ', r2) # note: despite the label, this value is the R² score, not accuracy
###### feature importance ##########
print(model, " : ", model.feature_importances_)
# cv pred acc : 0.7471223874826161
# RandomForestRegressor() : [0.5239796 0.05301973 0.04321174 0.02901859 0.03021195 0.13861994
# 0.09079173 0.09114671]
# cv pred acc : 0.5193726837904202
# DecisionTreeRegressor() : [0.52823158 0.05252938 0.0532782 0.02808679 0.0309845 0.12997447
# 0.09318208 0.083733 ]
# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('california Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)
plt.show()
drop_features
ㄴ Remove features to improve model performance or make data processing more efficient (a sketch after this snippet shows how to pick which columns to drop from the importances)
# drop_features
x = np.delete(x, 1, axis=1) # delete original feature 1
# cv pred acc : 0.9333333333333333
x = np.delete(x, 0, axis=1) # then delete column 0 of the shifted array (original feature 0)
# cv pred acc : 0.9666666666666667
# x = np.delete(x, [0, 1], axis=1) # equivalent: delete both columns at once
# cv pred acc : 1.0
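Rather than hard-coding which indices to delete, the weakest columns can be read off the importances. A minimal sketch (my own addition; model is any fitted tree-based model from the scripts above, and k = 2 is an illustrative choice):

import numpy as np

k = 2 # how many of the weakest features to drop
importances = model.feature_importances_
weakest = np.argsort(importances)[:k] # indices of the k smallest importances
print('dropping feature indices:', weakest)
x_reduced = np.delete(x, weakest, axis=1)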
ml10_fi_iris_delete.py
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
tf.random.set_seed(77) # fix TensorFlow's random seed (note: this does not affect the sklearn models below)
# 1. Data
datasets = load_iris() # multiclass classification
x = datasets['data']
y = datasets.target
# print(datasets.DESCR)
# drop_features
x = np.delete(x, 1, axis=1) # delete original feature 1
# cv pred acc : 0.9333333333333333
x = np.delete(x, 0, axis=1) # then delete column 0 of the shifted array (original feature 0)
# cv pred acc : 0.9666666666666667
# x = np.delete(x, [0, 1], axis=1) # equivalent: delete both columns at once
# cv pred acc : 1.0
print(x.shape) # (150, 2) after both deletes ((150, 3) if only one column is deleted)
x_train, x_test, y_train,y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11 # an odd number of splits is commonly used
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train) # fit the scaler on the training set only
x_train = scaler.transform(x_train) # transform train with the fitted scaler
x_test = scaler.transform(x_test) # transform test with the same scaler (never refit on test)
# 2. Model
model = RandomForestClassifier()
# 3. Train
model.fit(x_train, y_train)
# 4. Evaluate, predict
score = cross_val_score(model,
x_train, y_train,
cv=kfold) # cv: cross-validation
# print('cv acc : ', score) # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
###### feature importance ##########
print(model, " : ", model.feature_importances_)
# before delete
# cv pred acc : 0.9333333333333333
# RandomForestClassifier() : [0.07901659 0.0336995 0.44851384 0.43877007]
# after deleting features 0 and 1
# cv pred acc : 1.0
# RandomForestClassifier() : [0.49936052 0.50063948]
'''
# Visualization (left disabled: datasets.feature_names no longer lines up with x after the deletes)
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('iris Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)
plt.show()
'''
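To avoid rerunning the script by hand for each candidate subset, the drop-and-score experiment can be automated. A minimal sketch (my own addition, same iris setup as above) that scores a fresh model after dropping the k weakest features for each k. Note that ranking on the full dataset before cross-validating leaks a little information, so for a strict comparison the ranking should be redone inside each fold:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

x, y = load_iris(return_X_y=True)
kfold = StratifiedKFold(n_splits=11, shuffle=True, random_state=42)

# rank features once with a forest fitted on the full data (weakest first)
base = RandomForestClassifier(random_state=42).fit(x, y)
order = np.argsort(base.feature_importances_)

for k in range(x.shape[1]): # drop the k weakest features, keeping at least one
    x_k = np.delete(x, order[:k], axis=1)
    score = cross_val_score(RandomForestClassifier(random_state=42),
                            x_k, y, cv=kfold).mean()
    print(f'dropped {k} weakest feature(s): mean cv acc = {score:.4f}')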