Boosting
ㄴ Boosting builds a strong classifier by combining weak classifiers
ㄴ The basic principle of ensemble algorithms: combine classifiers A, B, and C, each only modestly accurate on its own, to reach a higher combined accuracy, e.g., around 0.7
ㄴ Boosting carries this combination out sequentially
ㄴ Classifier A is trained first, then B is trained using information from A's results, and C is trained using information from B's results (a minimal sketch of this sequential idea follows below)
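To make the sequential idea concrete, here is a minimal sketch (not from the original post) in which each new weak learner is fit to the errors left by the learners before it; the toy data, tree depth, learning rate, and number of rounds are all illustrative assumptions:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(0, 6, size=(200, 1))          # toy 1-D regression data
y = np.sin(X).ravel() + rng.normal(scale=0.1, size=200)

learning_rate = 0.1
prediction = np.zeros_like(y)                 # the ensemble starts from zero
trees = []
for _ in range(50):                           # 50 weak learners, built one after another
    residual = y - prediction                 # what the ensemble still gets wrong
    tree = DecisionTreeRegressor(max_depth=2).fit(X, residual)
    prediction += learning_rate * tree.predict(X)
    trees.append(tree)

print('training MSE :', np.mean((y - prediction) ** 2))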
Adaptive Boosting (AdaBoost)
ㄴ Classifies by a (weighted) majority vote of the weak learners and puts extra weight on the samples each learner misclassifies (see the sketch below)
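A minimal AdaBoost sketch with scikit-learn on the iris data (the split and n_estimators value are illustrative assumptions; by default AdaBoostClassifier uses a depth-1 decision tree as its weak learner):

from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

model = AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
print('AdaBoost acc :', model.score(x_test, y_test))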
Gradient Boosting Machine (GBM)
ㄴ Uses the gradient of the loss function to put more weight on the errors made so far
ㄴ LightGBM, CatBoost, XGBoost - packages that implement the gradient boosting algorithm (see the sketch below)
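All three packages expose a scikit-learn-style fit/predict/score interface, so they can be swapped into the same pipeline. A minimal sketch on the California housing data with default hyperparameters (assumes xgboost, lightgbm, and catboost are installed):

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

x, y = fetch_california_housing(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, shuffle=True, random_state=100
)

# the three implementations share the same scikit-learn-style API
for model in (XGBRegressor(), LGBMRegressor(), CatBoostRegressor(verbose=0)):
    model.fit(x_train, y_train)
    print(type(model).__name__, 'r2 :', model.score(x_test, y_test))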
XGBoost
ml11_xgb_iris.py
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed used for weight initialization

# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# kfold
n_splits = 11  # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)                  # fit on the training data only
x_train = scaler.transform(x_train)  # train: fit and transform
x_test = scaler.transform(x_test)    # test: transform only

# 2. Model
from xgboost import XGBClassifier
model = XGBClassifier()

# 3. Training
model.fit(x_train, y_train)

# 4. Evaluation, prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # one score per fold, i.e. n_splits values

y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# cv pred : [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]  # predictions for the 20% test split
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# Results
# SVC() acc : 0.9777777777777777
# LinearSVC() acc : 0.9777777777777777
# my model acc : 1.0
# MinMaxScaler() acc : 0.9777777777777777
# ===============================================
# tree acc : 0.9555555555555556
# ===============================================
# ensemble acc : 0.9555555555555556
# ===============================================
# KFold acc : 1.0
# ====================================================================
# cv acc : [0.91666667 1. 0.91666667 0.83333333 1. ]
# ====================================================================
# cv pred acc : 0.8666666666666667
ㄴ cv pred acc : 0.8666666666666667 --> performance dropped
=> not really worth using here (see the note below)
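For reference, the drop likely comes from scoring cross_val_predict on the test set: it re-fits the model on small folds of the 30 test samples instead of using the model trained on the full training data. A minimal sketch of the more direct evaluation, reusing model, x_test, and y_test from the script above:

# score the already-fitted model on the held-out test data instead
y_pred = model.predict(x_test)
print('test acc : ', accuracy_score(y_test, y_pred))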
ml11_xgb_california.py
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed used for weight initialization

# 1. Data
datasets = fetch_california_housing()
x = datasets.data
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=100, shuffle=True
)

# kfold
n_splits = 11  # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Apply scaler
# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler = RobustScaler()
scaler.fit(x_train)                  # fit on the training data only
x_train = scaler.transform(x_train)  # train: fit and transform
x_test = scaler.transform(x_test)    # test: transform only

# 2. Model
from xgboost import XGBRegressor
model = XGBRegressor()

# 3. Training
model.fit(x_train, y_train)

# 4. Evaluation, prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # one score per fold, i.e. n_splits values

y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)
# cv pred r2 : 0.8161893201063641 --> performance improved
'''
##### feature importances #######
print(model, " : ", model.feature_importances_)

# Visualization with xgboost's built-in plot
import matplotlib.pyplot as plt
from xgboost.plotting import plot_importance
plot_importance(model)
plt.show()
'''
# Visualization
import matplotlib.pyplot as plt
n_features = datasets.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), datasets.feature_names)
plt.title('California Feature Importances')
plt.ylabel('Feature')
plt.xlabel('Importances')
plt.ylim(-1, n_features)
plt.show()
# SVR() r2 : -0.01663695941103427
# result r2 : 0.06830124384888547
# my r2 score : 0.5346585367965508
# r2 after applying RobustScaler : 0.6873119065345796
# ====================================================
# tree r2 : 0.612701922946608
# ====================================================
# ensemble r2 : 0.8114840410530733
# ====================================================
# kfold r2 : 0.8128621988883818
ㄴ cv pred r2 : 0.8161893201063641 --> performance improved
LightGBM (LGBM)
ml11_lgbm_california.py
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed used for weight initialization

# 1. Data
datasets = fetch_california_housing()
x = datasets.data
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=100, shuffle=True
)

# kfold
n_splits = 11  # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Apply scaler
# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler = RobustScaler()
scaler.fit(x_train)                  # fit on the training data only
x_train = scaler.transform(x_train)  # train: fit and transform
x_test = scaler.transform(x_test)    # test: transform only

# 2. Model
from lightgbm import LGBMRegressor
model = LGBMRegressor()

# 3. Training
model.fit(x_train, y_train)

# 4. Evaluation, prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # one score per fold, i.e. n_splits values

y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)
# cv pred r2 : 0.8327038050418941
# SVR() r2 : -0.01663695941103427
# result r2 : 0.06830124384888547
# my r2 score : 0.5346585367965508
# r2 after applying RobustScaler : 0.6873119065345796
# ====================================================
# tree r2 : 0.612701922946608
# ====================================================
# ensemble r2 : 0.8114840410530733
# ====================================================
# kfold r2 : 0.8128621988883818
ㄴ cv pred r2 : 0.8327038050418941 --> performance improved
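As a side note (not part of the original script), LightGBM is commonly trained with a validation set and early stopping so that the number of trees does not have to be tuned by hand. A minimal sketch under that assumption, reusing x_train, y_train, and LGBMRegressor from above; the split, n_estimators, learning_rate, and stopping_rounds values are illustrative:

import lightgbm as lgb
from sklearn.model_selection import train_test_split

# hold out part of the training data for validation
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train, y_train, train_size=0.8, random_state=42
)

model = LGBMRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(x_tr, y_tr,
          eval_set=[(x_val, y_val)],
          eval_metric='rmse',
          callbacks=[lgb.early_stopping(stopping_rounds=50)])  # stop adding trees when the validation score stops improving

print('best iteration :', model.best_iteration_)
print('val r2 :', model.score(x_val, y_val))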
CatBoost
ml11_catboost_california.py
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the random seed used for weight initialization

# 1. Data
datasets = fetch_california_housing()
x = datasets.data
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=100, shuffle=True
)

# kfold
n_splits = 11  # usually an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Apply scaler
# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler = RobustScaler()
scaler.fit(x_train)                  # fit on the training data only
x_train = scaler.transform(x_train)  # train: fit and transform
x_test = scaler.transform(x_test)    # test: transform only

# 2. Model
from catboost import CatBoostRegressor
model = CatBoostRegressor()  # best performance of the three boosting packages here

# 3. Training
model.fit(x_train, y_train)

# 4. Evaluation, prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # one score per fold, i.e. n_splits values

y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)
# cv pred r2 : 0.8460367991799119
# SVR() r2 : -0.01663695941103427
# result r2 : 0.06830124384888547
# my r2 score : 0.5346585367965508
# r2 after applying RobustScaler : 0.6873119065345796
# ====================================================
# tree r2 : 0.612701922946608
# ====================================================
# ensemble r2 : 0.8114840410530733
# ====================================================
# kfold r2 : 0.8128621988883818
ㄴ cv pred r2 : 0.8460367991799119 --> performance improved