
AI Day 6 (2023-05-15) AI Basics _ Machine Learning - Feature Importances

by prometedor 2023. 5. 15.

Feature Importances

ㄴ A measure of how much each feature contributes to a machine-learning model's predictions

ㄴ Useful during modeling for feature selection and for gauging which features matter

      => important features can be kept and unnecessary ones dropped --> reduces model complexity and helps prevent overfitting
ㄴ Typically available from tree-based algorithms such as Decision Tree, Random Forest, and Gradient Boosting

ㄴ Each feature's importance measures how much that feature contributes to the trees' split decisions

ㄴ Values generally lie between 0 and 1, and the importances of all features sum to 1

ㄴ The importances can be visualized to compare the relative importance of each feature
ㄴ Caution: feature importances do not indicate causation --> interpret them together with feature correlations and domain knowledge (a cross-check with permutation importance is sketched below)
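
Because importances do not imply causation and impurity-based values can be biased (for example toward high-cardinality or continuous features), a common cross-check is permutation importance, which shuffles one feature at a time and measures the score drop. A minimal sketch, assuming scikit-learn >= 0.22 (where sklearn.inspection.permutation_importance was added); variable names and n_repeats are illustrative choices:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, train_size=0.8, shuffle=True, random_state=42
)

model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# Impurity-based importances, computed from the training data at fit time
print('impurity-based :', model.feature_importances_)

# Permutation importance: shuffle each feature on the held-out set, n_repeats
# times, and record the mean drop in accuracy
result = permutation_importance(model, x_test, y_test,
                                n_repeats=10, random_state=42)
print('permutation    :', result.importances_mean)

If the two rankings disagree sharply, that is usually a sign of correlated features or an importance artifact rather than a real effect.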

 

 

RandomForestClassifier and DecisionTreeClassifier

ml09_feature_importance_iris.py

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix TensorFlow's random seed (note: this does not seed the scikit-learn models)

# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target

x_train, x_test, y_train,y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)


# kfold
n_splits = 11    # usually chosen as an odd number
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit the scaler on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only, reusing the train fit


# 2. Model
model = RandomForestClassifier()


# 3. Train
model.fit(x_train, y_train)


# 4. Evaluate, predict
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold)   # cv: cross-validation
# print('cv acc : ', score)   # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# cv pred :  [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0] # predictions for the 20% test split (the remainder after the 80/20 split)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# cv pred acc :  0.9333333333333333
# stratifiedKFold cv pred acc :  0.9333333333333333


###### feature importance ##########
print(model, " : ", model.feature_importances_)
# cv pred acc :  0.9666666666666667
# RandomForestClassifier()  :  [0.10919274 0.0294921  0.41943273 0.44188242]


# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('iris Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)

plt.show()

 

 

 

ml09_feature_importance_cancer.py

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix TensorFlow's random seed (note: this does not seed the scikit-learn models)

# 1. Data
datasets = load_breast_cancer()  # binary classification
x = datasets['data']
y = datasets.target

x_train, x_test, y_train,y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# kfold
n_splits = 11    # usually chosen as an odd number
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)

# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit the scaler on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only, reusing the train fit


# 2. Model
model = RandomForestClassifier()
# model = DecisionTreeClassifier()


# 3. Train
model.fit(x_train, y_train)


# 4. Evaluate, predict
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold)   # cv: cross-validation
# print('cv acc : ', score)   # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)


###### feature importance ##########
print(model, " : ", model.feature_importances_)


# cv pred acc :  0.9210526315789473
# DecisionTreeClassifier()  :  [0.         0.05847766 0.         0.         0.         0.
# 0.         0.69141955 0.         0.         0.01198257 0.
# 0.         0.         0.00123678 0.         0.01855447 0.01593081
# 0.         0.         0.05229927 0.00937631 0.05149396 0.

# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('cancer Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)

plt.show()

# cv pred acc :  0.956140350877193
# RandomForestClassifier()  :  [0.02443992 0.01231809 0.04860259 0.03692516 0.00602853 0.01530975
# 0.04929247 0.12535733 0.00450328 0.00613796 0.00773211 0.0045192
# 0.01451287 0.0626458  0.00414944 0.0043145  0.00862259 0.00404682
# 0.00417373 0.00526797 0.09041171 0.01994775 0.07992037 0.10940668
# 0.0124072  0.02641721 0.03291273 0.1608525  0.01100217 0.00782157]

 


 

 

 

ml09_feature_importance_wine.py

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix TensorFlow's random seed (note: this does not seed the scikit-learn models)

# 1. Data
datasets = load_wine()  # multiclass classification
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11    # usually chosen as an odd number
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit the scaler on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only, reusing the train fit


# 2. Model
# model = RandomForestClassifier()
model = DecisionTreeClassifier()


# 3. Train
model.fit(x_train, y_train)


# 4. Evaluate, predict
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold)   # cv: cross-validation
# print('cv acc : ', score)   # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)


###### feature importance ##########
print(model, " : ", model.feature_importances_)

# cv pred acc :  0.9166666666666666
# DecisionTreeClassifier()  :  [0.01899507 0.         0.02094206 0.         0.         0.
# 0.41105326 0.         0.         0.38493424 0.         0.
# 0.16407537]

# cv pred acc :  0.9444444444444444
# RandomForestClassifier()  :  [0.10694432 0.02804456 0.01501634 0.04239458 0.03297987 0.05527102
# 0.14608004 0.01357275 0.02125219 0.19027345 0.07083705 0.13506052
# 0.1422733 ]

# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('wine Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)

plt.show()

 


 

 

 

RandomForestRegressor and DecisionTreeRegressor

ml09_feature_importance_california.py

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix TensorFlow's random seed (note: this does not seed the scikit-learn models)

# 1. Data
datasets = fetch_california_housing()  # regression
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11    # usually chosen as an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit the scaler on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only, reusing the train fit



# 2. Model
model = RandomForestRegressor()
# model = DecisionTreeRegressor()

# 3. Train
model.fit(x_train, y_train)


# 4. Evaluate, predict
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold)   # cv: cross-validation
# print('cv acc : ', score)   # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)


###### feature importance ##########
print(model, " : ", model.feature_importances_)

# cv pred r2 :  0.7471223874826161
# RandomForestRegressor()  :  [0.5239796  0.05301973 0.04321174 0.02901859 0.03021195 0.13861994
# 0.09079173 0.09114671]
 
# cv pred r2 :  0.5193726837904202
# DecisionTreeRegressor()  :  [0.52823158 0.05252938 0.0532782  0.02808679 0.0309845  0.12997447
# 0.09318208 0.083733  ]


# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('california Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)

plt.show()

 


 

 

 

drop_features

ㄴ Remove features to improve model performance or make data processing more efficient (an importance-driven alternative is sketched after the snippet below)

# drop_features
x = np.delete(x, 1, axis=1)
# cv pred acc :  0.9333333333333333
x = np.delete(x, 0, axis=1)
# cv pred acc :  0.9666666666666667

# x = np.delete(x, [0, 1], axis=1)
# cv pred acc :  1.0
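
Instead of hard-coding the column indices for np.delete, the selection can also be driven by the importances themselves. A minimal sketch using scikit-learn's SelectFromModel; the 'median' threshold is an illustrative choice that keeps only features whose importance is at or above the median of the fitted feature_importances_:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

x, y = load_iris(return_X_y=True)

# Fit a forest, then keep only the features whose importance reaches the
# threshold ('median' of the fitted feature_importances_)
selector = SelectFromModel(RandomForestClassifier(random_state=42),
                           threshold='median')
x_selected = selector.fit_transform(x, y)

print(x.shape, '->', x_selected.shape)            # typically (150, 4) -> (150, 2) on iris
print('kept features :', selector.get_support())  # boolean mask per feature

For illustration this fits on the full data; in practice the selector, like the scaler in the scripts above, should be fit on the train split only.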

 

ml10_fi_iris_delete.py

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix TensorFlow's random seed (note: this does not seed the scikit-learn models)

# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target
# print(datasets.DESCR)

# drop_features
x = np.delete(x, 1, axis=1)
# cv pred acc :  0.9333333333333333
x = np.delete(x, 0, axis=1)
# cv pred acc :  0.9666666666666667
# x = np.delete(x, [0, 1], axis=1)
# cv pred acc :  1.0

print(x.shape)  # (150, 2) after both deletes above

x_train, x_test, y_train,y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)


# kfold
n_splits = 11    # usually chosen as an odd number
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit the scaler on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only, reusing the train fit


# 2. Model
model = RandomForestClassifier()


# 3. Train
model.fit(x_train, y_train)


# 4. Evaluate, predict
score = cross_val_score(model, 
                        x_train, y_train, 
                        cv=kfold)   # cv: cross-validation
# print('cv acc : ', score)   # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)

acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)


###### feature importance ##########
print(model, " : ", model.feature_importances_)
# before delete
# cv pred acc :  0.9333333333333333
# RandomForestClassifier()  :  [0.07901659 0.0336995  0.44851384 0.43877007]

# after deleting features 0 and 1
# cv pred acc :  1.0
# RandomForestClassifier()  :  [0.49936052 0.50063948]

'''
# Visualization (left commented out: n_features and datasets.feature_names below
# still describe the original 4 features, which no longer match the reduced x)
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('iris Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)

plt.show()
'''