XGBoost early stopping
ㄴ Early stopping : a technique that halts model training early. When the model stops improving on the validation data, training ends, which prevents overfitting and avoids wasted boosting rounds.
ㄴ In XGBoost, early stopping is enabled with the early_stopping_rounds parameter (a minimal sketch follows the metric list below).
metrics
eval_metric
Regression : rmse, mae, rmsle...
Binary classification : error, auc, logloss...
Multiclass classification : merror, mlogloss...
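The sketch below shows how early_stopping_rounds, eval_set, and eval_metric fit together, using the same fit-time keywords as the practice files that follow (XGBoost 1.x style; see the version note at the end of this post). The synthetic data, the 20-round patience, and the hyperparameter values are illustrative assumptions, not part of the practice files.

import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Illustrative synthetic regression data (assumption for this sketch)
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 8))
y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=500)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
# Stop when validation rmse fails to improve for 20 consecutive rounds
model.fit(X_train, y_train,
          early_stopping_rounds=20,
          eval_set=[(X_val, y_val)],
          eval_metric='rmse')
print(model.best_iteration)  # the boosting round with the best validation rmse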
Practice
ml21_xgb_earlystopping_california.py
import numpy as np
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the TensorFlow random seed (XGBoost itself does not use it)
# 1. Data
datasets = fetch_california_housing()
x = datasets.data
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
x, y, train_size=0.7, random_state=100, shuffle=True
)
# kfold
n_splits = 11  # usually set to an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Apply a scaler
# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = MaxAbsScaler()
scaler = RobustScaler()
scaler.fit(x_train)  # fit on the training set only
x_train = scaler.transform(x_train)  # the train set gets both fit and transform
x_test = scaler.transform(x_test)  # the test set gets transform only
# 2. Model
from xgboost import XGBRegressor
model = XGBRegressor(random_state=123, n_estimators=1000,
                     learning_rate=0.1, max_depth=6, gamma=1)
# 3. Training
model.fit(x_train, y_train, early_stopping_rounds=20,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='rmse')
# eval_set : validation data monitored during every boosting round
# fit() trains the model
# early_stopping_rounds=20 : stop training early if the validation metric shows
#   no improvement for 20 consecutive rounds
# eval_set lists the train and test sets whose metric is tracked
# eval_metric selects the evaluation metric
#   regression : rmse, mae, rmsle...
#   binary classification : error, auc, logloss...
#   multiclass classification : merror, mlogloss...
# 4. Evaluation and prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv : cross-validation
# print('cv acc : ', score)  # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)
# cv pred r2 : 0.8184110207883767
ㄴ fetch_california_housing is a regression dataset, so eval_metric = 'rmse' is used.
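Once fit() returns, the early-stopping outcome can be read off the fitted model. A minimal sketch, assuming the fit call above and an XGBoost version whose sklearn wrapper exposes best_iteration, best_score, and evals_result():

# Where early stopping actually stopped (assumes the fit() call above)
print('best_iteration :', model.best_iteration)  # round with the best validation rmse
print('best_score     :', model.best_score)      # validation rmse at that round

# Per-round metric history, one entry per eval_set tuple:
# 'validation_0' = (x_train, y_train), 'validation_1' = (x_test, y_test)
history = model.evals_result()
print(history['validation_1']['rmse'][-5:])  # last few test-set rmse values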
ml21_xgb_earlystopping_cancer.py
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the TensorFlow random seed (XGBoost itself does not use it)
# 1. Data
datasets = load_breast_cancer()  # binary classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11  # usually set to an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)  # fit on the training set only
x_train = scaler.transform(x_train)  # the train set gets both fit and transform
x_test = scaler.transform(x_test)  # the test set gets transform only
# 2. Model
from xgboost import XGBClassifier
model = XGBClassifier()
# 3. Training
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='error')  # binary classification : error
# 4. Evaluation and prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv : cross-validation
# print('cv acc : ', score)  # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)  # fold-wise predictions on the 20% test split
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# cv pred acc : 0.9473684210526315
ㄴ load_breast_cancer is a binary-classification dataset, so eval_metric = 'error' is used.
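For binary classification, 'error' is not the only choice; the same fit call accepts the other metrics from the list at the top. A minimal sketch under the same x_train/x_test split as above (the metric combination is an illustrative assumption):

from xgboost import XGBClassifier

model = XGBClassifier()
# Several metrics can be tracked at once; XGBoost uses the LAST one
# ('logloss' here) to decide when to stop early
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_test, y_test)],
          eval_metric=['auc', 'logloss'])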
ml21_xgb_earlystopping_iris.py
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the TensorFlow random seed (XGBoost itself does not use it)
# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11  # usually set to an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)  # fit on the training set only
x_train = scaler.transform(x_train)  # the train set gets both fit and transform
x_test = scaler.transform(x_test)  # the test set gets transform only
# 2. Model
from xgboost import XGBClassifier
model = XGBClassifier()
# 3. Training
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='merror')  # multiclass : merror
# 4. Evaluation and prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv : cross-validation
# print('cv acc : ', score)  # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)
# cv pred : [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]  # predictions on the 20% test split
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# cv pred acc : 0.8666666666666667
ㄴ load_iris is a multiclass dataset, so eval_metric = 'merror' is used.
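merror is simply the multiclass misclassification rate, i.e. the complement of accuracy (the 0.8666... accuracy above corresponds to merror of about 0.1333). A quick hand check with illustrative arrays:

import numpy as np

y_true = np.array([0, 1, 2, 2, 1])  # illustrative labels
y_pred = np.array([0, 2, 2, 2, 1])  # one of five is wrong

merror = np.mean(y_true != y_pred)  # fraction misclassified
print(merror)        # 0.2
print(1.0 - merror)  # 0.8 == accuracy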
ml21_xgb_earlystopping_wine.py
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
tf.random.set_seed(77)  # fix the TensorFlow random seed (XGBoost itself does not use it)
# 1. Data
datasets = load_wine()  # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
x, y, train_size=0.8, shuffle=True, random_state=42
)
# kfold
n_splits = 11  # usually set to an odd number
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
random_state=random_state)
# Scaler
scaler = MinMaxScaler()
scaler.fit(x_train)  # fit on the training set only
x_train = scaler.transform(x_train)  # the train set gets both fit and transform
x_test = scaler.transform(x_test)  # the test set gets transform only
# 2. Model
from xgboost import XGBClassifier
model = XGBClassifier()
# 3. Training
model.fit(x_train, y_train, early_stopping_rounds=100,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric='merror')  # multiclass : merror
# 4. Evaluation and prediction
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv : cross-validation
# print('cv acc : ', score)  # prints n_splits scores, one per fold
y_predict = cross_val_predict(model,
x_test, y_test,
cv=kfold)
# print('cv pred : ', y_predict)  # fold-wise predictions on the 20% test split
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# cv pred acc : 0.8611111111111112
ㄴ load_wine is a multiclass dataset, so eval_metric = 'merror' is used.
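One version caveat about all four files: passing early_stopping_rounds and eval_metric to fit() is the XGBoost 1.x sklearn API. The keywords were deprecated in 1.6 and removed in 2.0, where they move to the constructor. A minimal sketch of the newer style, under that version assumption and reusing the x_train/x_test split above:

from xgboost import XGBClassifier

# XGBoost >= 2.0 style: early stopping is configured on the estimator
model = XGBClassifier(n_estimators=1000,
                      early_stopping_rounds=100,
                      eval_metric='merror')
# eval_set is still passed to fit(); the last entry is the one monitored
model.fit(x_train, y_train,
          eval_set=[(x_train, y_train), (x_test, y_test)])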