
AI Day 8 (2023-05-17) AI Fundamentals _ Machine Learning - Feature Importances + SelectFromModel

by prometedor 2023. 5. 18.

Feature Importances + SelectFromModel

ㄴ A technique for selecting important features based on a trained model: each feature's importance is compared against a threshold, and features below it are dropped
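Conceptually, SelectFromModel just compares each feature's importance against a threshold and keeps the features at or above it. A toy sketch of that comparison (the importance values are made-up numbers for illustration):

import numpy as np

importances = np.array([0.05, 0.40, 0.15, 0.40])  # hypothetical feature importances
threshold = 0.15

kept = importances >= threshold  # SelectFromModel keeps features with importance >= threshold
print(kept)  # [False  True  True  True]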

 

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# (assumes x_train, x_test, y_train, y_test are already prepared)

#2. Model
model = XGBRegressor(random_state=123, n_estimators=1000,
                     learning_rate=0.1, max_depth=6, gamma=1)


#3. Training
model.fit(x_train, y_train, early_stopping_rounds=200,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')


#4. Evaluation, prediction
result = model.score(x_test, y_test)
print('r2 : ', result)

y_predict = model.predict(x_test)
r2 = r2_score(y_test, y_predict)
print("final test score : ", r2)
print(model.feature_importances_)

thresholds = model.feature_importances_

print("=========== SelectFromModel ===============")
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape, select_x_test.shape)

    selection_model = XGBRegressor(n_jobs=-1,
                                   random_state=123,
                                   n_estimators=1000,
                                   learning_rate=0.1,
                                   max_depth=6,
                                   gamma=1)

    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2:%.2f%%"
          % (thresh, select_x_train.shape[1], score*100))
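One small refinement worth noting: the loop above visits the importances in whatever order the model stores them, so the printed feature counts jump around. Iterating over the sorted values makes the log monotone. A minimal sketch, reusing model, thresholds, and x_train from the snippet above:

import numpy as np

for thresh in np.sort(thresholds):
    # as the threshold grows, the number of surviving features can only shrink
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    print("Thresh=%.3f -> n=%d" % (thresh, select_x_train.shape[1]))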

 

 

Practice

ml22_selectfromModel_iris.py

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed for weight initialization

# 1. 데이터
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)


# kfold
n_splits = 11    # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only


# 2. 모델
from xgboost import XGBClassifier
model = XGBClassifier()


# 3. 훈련
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='merror')  # multiclass: merror


# 4. 평가, 예측
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)   # cv : cross validation
# print('cv acc : ', score)   # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# cv pred :  [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0] # predictions on the 20% test split (held out from the 80% train split)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)


# SelectFromModel
from sklearn.feature_selection import SelectFromModel  # the trained model's feature importances come from model.feature_importances_
thresholds = model.feature_importances_                # assign them to the thresholds variable


for thresh in thresholds:
    # Loop over each threshold value in thresholds.
    # Create a SelectFromModel feature-selection object with the current
    # value as its threshold parameter.
    # prefit=True: treat model as already fitted and reuse it;
    # only features with importance >= threshold are kept.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    # transform() keeps only the selected features of x_train and x_test,
    # assigned to select_x_train and select_x_test respectively.
    print(select_x_train.shape)
    print(select_x_test.shape)

    # Build a new model (selection_model) on the selected features only,
    # train it, predict on the test data, and report the accuracy.
    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    # Print the threshold (thresh), the number of selected features
    # (select_x_train.shape[1]), and the accuracy (score).
    print("Thresh=%.3f, n=%d, acc:%.2f%%"
        %(thresh, select_x_train.shape[1], score*100))

# cv pred acc :  0.8666666666666667
# (120, 4)
# (30, 4)
# Thresh=0.011, n=4, acc:100.00%
# (120, 3)
# (30, 3)
# Thresh=0.029, n=3, acc:100.00%
# (120, 1)
# (30, 1)
# Thresh=0.752, n=1, acc:93.33%
# (120, 2)
# (30, 2)
# Thresh=0.207, n=2, acc:100.00%

# (120, 4)
# (30, 4)
# Thresh=0.011, n=4, acc:100.00%

ㄴ Means that at a threshold (thresh) of 0.011, 4 features (n) were selected, and predicting with them gave an accuracy (acc) of 100%
ㄴ In other words, with 0.011 as the cutoff the top 4 features were kept, and a model built on those 4 features scored 100% accuracy on the test data
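Rather than reading the best line out of the log by eye, the loop can also track the best (threshold, feature count, accuracy) as it goes. A minimal sketch, reusing model, thresholds, and the train/test split from the script above:

best = (None, None, -1.0)  # (thresh, n_features, accuracy)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    acc = accuracy_score(y_test, selection_model.predict(select_x_test))
    if acc > best[2]:
        best = (thresh, select_x_train.shape[1], acc)

print("best: Thresh=%.3f, n=%d, acc=%.2f%%" % (best[0], best[1], best[2] * 100))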

 

 

ml22_selectfromModel_cancer.py

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed for weight initialization

# 1. 데이터
datasets = load_breast_cancer()  # binary classification
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11    # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only



# 2. 모델
from xgboost import XGBClassifier
model = XGBClassifier()

# 3. 훈련
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='error')  # binary classification: error


# 4. 평가, 예측
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)   # cv : cross validation
# print('cv acc : ', score)   # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)


# SelectFromModel
from sklearn.feature_selection import SelectFromModel
thresholds = model.feature_importances_

for thresh in thresholds:
    # prefit=True: reuse the fitted model; keep features with importance >= threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape)
    print(select_x_test.shape)

    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, acc:%.2f%%"
        %(thresh, select_x_train.shape[1], score*100))
    

# cv pred acc :  0.9473684210526315
# (455, 15)
# (114, 15)
# Thresh=0.008, n=15, acc:95.61%
# (455, 8)
# (114, 8)
# Thresh=0.024, n=8, acc:95.61%
# (455, 13)
# (114, 13)
# Thresh=0.014, n=13, acc:95.61%
# (455, 9)
# (114, 9)
# Thresh=0.020, n=9, acc:95.61%
# (455, 16)
# (114, 16)
# Thresh=0.006, n=16, acc:96.49%
# (455, 18)
# (114, 18)
# Thresh=0.005, n=18, acc:96.49%
# (455, 7)
# (114, 7)
# Thresh=0.028, n=7, acc:94.74%
# (455, 1)
# (114, 1)
# Thresh=0.401, n=1, acc:87.72%
# (455, 27)
# (114, 27)
# Thresh=0.002, n=27, acc:96.49%
# (455, 21)
# (114, 21)
# Thresh=0.004, n=21, acc:96.49%
# (455, 14)
# (114, 14)
# Thresh=0.013, n=14, acc:95.61%
# (455, 19)
# (114, 19)
# Thresh=0.005, n=19, acc:96.49%
# (455, 10)
# (114, 10)
# Thresh=0.018, n=10, acc:95.61%
# (455, 20)
# (114, 20)
# Thresh=0.004, n=20, acc:96.49%
# (455, 22)
# (114, 22)
# Thresh=0.004, n=22, acc:96.49%
# (455, 23)
# (114, 23)
# Thresh=0.004, n=23, acc:96.49%
# (455, 6)
# (114, 6)
# Thresh=0.029, n=6, acc:94.74%
# (455, 28)
# (114, 28)
# Thresh=0.002, n=28, acc:95.61%
# (455, 29)
# (114, 29)
# Thresh=0.001, n=29, acc:95.61%
# (455, 25)
# (114, 25)
# Thresh=0.002, n=25, acc:96.49%
# (455, 5)
# (114, 5)
# Thresh=0.055, n=5, acc:95.61%
# (455, 11)
# (114, 11)
# Thresh=0.016, n=11, acc:95.61%
# (455, 4)
# (114, 4)
# Thresh=0.057, n=4, acc:95.61%
# (455, 3)
# (114, 3)
# Thresh=0.067, n=3, acc:95.61%
# (455, 17)
# (114, 17)
# Thresh=0.005, n=17, acc:96.49%
# (455, 26)
# (114, 26)
# Thresh=0.002, n=26, acc:96.49%
# (455, 12)
# (114, 12)
# Thresh=0.015, n=12, acc:95.61%
# (455, 2)
# (114, 2)
# Thresh=0.184, n=2, acc:87.72%
# (455, 24)
# (114, 24)
# Thresh=0.003, n=24, acc:96.49%
# (455, 30)
# (114, 30)
# Thresh=0.000, n=30, acc:95.61%

# (455, 16)
# (114, 16)
# Thresh=0.006, n=16, acc:96.49%

ㄴ Means that at a threshold (thresh) of 0.006, 16 features (n) were selected, and predicting with them gave an accuracy (acc) of 96.49%
ㄴ In other words, with 0.006 as the cutoff the top 16 features were kept, and a model built on those 16 features scored 96.49% accuracy on the test data
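To see which of the 30 cancer features survive a given cut, SelectFromModel exposes get_support(). A minimal sketch, reusing model and datasets from the script above; the 0.006 threshold is simply the value picked out of the run log:

selection = SelectFromModel(model, threshold=0.006, prefit=True)
mask = selection.get_support()  # boolean mask over the 30 original columns
selected = np.array(datasets.feature_names)[mask]
print(len(selected), 'features kept :', selected)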

 

ml22_selectfromModel_wine.py

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed for weight initialization

# 1. 데이터
datasets = load_wine()  # multiclass classification
x = datasets['data']
y = datasets.target
feature_name = datasets.feature_names
print(feature_name)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11    # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)


# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                   # fit on the train set only
x_train = scaler.transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)     # test: transform only



# 2. 모델
from xgboost import XGBClassifier
model = XGBClassifier()

# 3. 훈련
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='merror')  # multiclass: merror


# 4. 평가, 예측
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)   # cv : cross validation
# print('cv acc : ', score)   # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)


# SelectFromModel
from sklearn.feature_selection import SelectFromModel
thresholds = model.feature_importances_

for thresh in thresholds:
    # prefit=True: reuse the fitted model; keep features with importance >= threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape)
    print(select_x_test.shape)

    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, acc:%.2f%%"
        %(thresh, select_x_train.shape[1], score*100))
    
# Print the selected column names
# (note: `selection` here comes from the last loop iteration only)
selected_feature_indices = selection.get_support(indices=True)
selected_feature_names = [feature_name[i] for i in selected_feature_indices]
print(selected_feature_names)
# selection.get_support(indices=True) returns the indices of the selected features;
# feature_name holds the feature names, so this prints the names of the selected columns


# cv pred acc :  0.8611111111111112
# (142, 7)
# (36, 7)
# Thresh=0.014, n=7, acc:94.44%
# (142, 10)
# (36, 10)
# Thresh=0.009, n=10, acc:97.22%
# (142, 8)
# (36, 8)
# Thresh=0.013, n=8, acc:94.44%
# (142, 11)
# (36, 11)
# Thresh=0.005, n=11, acc:97.22%
# (142, 5)
# (36, 5)
# Thresh=0.031, n=5, acc:94.44%
# (142, 6)
# (36, 6)
# Thresh=0.017, n=6, acc:94.44%
# (142, 4)
# (36, 4)
# Thresh=0.119, n=4, acc:94.44%
# (142, 13)
# (36, 13)
# Thresh=0.000, n=13, acc:97.22%
# (142, 12)
# (36, 12)
# Thresh=0.001, n=12, acc:97.22%
# (142, 2)
# (36, 2)
# Thresh=0.173, n=2, acc:88.89%
# (142, 9)
# (36, 9)
# Thresh=0.011, n=9, acc:94.44%
# (142, 1)
# (36, 1)
# Thresh=0.469, n=1, acc:61.11%
# (142, 3)
# (36, 3)
# Thresh=0.139, n=3, acc:91.67%

# (142, 10)
# (36, 10)
# Thresh=0.009, n=10, acc:97.22%

ㄴ Means that at a threshold (thresh) of 0.009, 10 features (n) were selected, and predicting with them gave an accuracy (acc) of 97.22%
ㄴ In other words, with 0.009 as the cutoff the top 10 features were kept, and a model built on those 10 features scored 97.22% accuracy on the test data
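Because the name printout in the script reflects only the last loop iteration, moving get_support() inside the loop pairs every threshold with its surviving columns. A minimal sketch using the same variables as the wine script:

for thresh in np.sort(thresholds):
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    kept = [feature_name[i] for i in selection.get_support(indices=True)]
    print("Thresh=%.3f keeps %d columns : %s" % (thresh, len(kept), kept))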

 

 

ml22_xgb_selectfromModel_california.py

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler

import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed for weight initialization

#1. 데이터
datasets = fetch_california_housing()
x = datasets.data
y = datasets.target


x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=100, shuffle=True
)

# kfold
n_splits = 11    # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)

# Scaler 적용
# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler = RobustScaler()
scaler.fit(x_train)                 # fit on the train set only
x_train = scaler.transform(x_train) # train: fit and transform
x_test = scaler.transform(x_test)   # test: transform only

# 2. 모델
from xgboost import XGBRegressor
model = XGBRegressor(random_state=123, n_estimators=1000,
                    learning_rate = 0.1, max_depth = 6, gamma= 1)

# 3. 훈련
model.fit(x_train, y_train, early_stopping_rounds=20,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')
          # eval_metric for regression : rmse, mae, rmsle...
          #             binary class.  : error, auc, logloss...
          #             multiclass     : merror, mlogloss...
          # eval_set : validation data
        

# 4. 평가, 예측
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)   # cv : cross validation
# print('cv acc : ', score)   # prints one score per fold (n_splits values)
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)

r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)


# SelectFromModel
from sklearn.feature_selection import SelectFromModel
thresholds = model.feature_importances_

for thresh in thresholds:
    # prefit=True: reuse the fitted model; keep features with importance >= threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    print(select_x_train.shape)
    print(select_x_test.shape)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    print("Thresh=%.3f, n=%d, R2:%.2f%%"
        %(thresh, select_x_train.shape[1], score*100))
    
# cv pred r2 :  0.8184110207883767
# (14447, 1)
# (6193, 1)
# Thresh=0.534, n=1, R2:48.35%
# (14447, 3)
# (6193, 3)
# Thresh=0.083, n=3, R2:62.49%
# (14447, 6)
# (6193, 6)
# Thresh=0.037, n=6, R2:84.35%
# (14447, 8)
# (6193, 8)
# Thresh=0.022, n=8, R2:84.47%
# (14447, 7)
# (6193, 7)
# Thresh=0.027, n=7, R2:83.89%
# (14447, 2)
# (6193, 2)
# Thresh=0.142, n=2, R2:58.01%
# (14447, 5)
# (6193, 5)
# Thresh=0.077, n=5, R2:83.24%
# (14447, 4)
# (6193, 4)
# Thresh=0.078, n=4, R2:73.51%

ㄴ At a threshold (thresh) of 0.022, 8 features were selected, and training on them gave an R2 score of 84.47%
ㄴ (14447, 8) : the training data with the selected features has 14447 samples and 8 features
ㄴ (6193, 8) : the test data with the selected features has 6193 samples and 8 features
ㄴ Thresh=0.022, n=8, R2:84.47% : at a threshold (thresh) of 0.022, 8 features (n) were selected, and the model trained on them reached an R2 score of 84.47%

ㄴ The R2 score measures the predictive performance of a regression model; the closer it is to 1, the better the predictions
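For reference, R2 is defined as 1 - SS_res/SS_tot, i.e. one minus the residual sum of squares relative to the total sum of squares around the mean. A quick sketch with made-up numbers, checking the hand computation against sklearn's r2_score:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([3.0, 2.5, 4.0, 5.1])   # toy targets, for illustration only
y_pred = np.array([2.8, 2.9, 4.2, 4.8])   # toy predictions

ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot, r2_score(y_true, y_pred))  # the two values match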