K-Fold
ㄴ ML 모델에서 가장 보편적으로 사용되는 교차 검증 기법
ㄴ K개의 데이터 폴드 세트를 만들어서 K번만큼 각 폴드 세트에 학습과 검증 평가를 수행
KFold 와 StratifiedKFold
ㄴ scikit-learn에서 제공하는 교차 검증(Cross-validation) 전략
ㄴ KFold
ㄴ 데이터를 k개의 폴드(fold)로 나누어 각 폴드를 한 번씩 테스트 세트로 사용하고 나머지 폴드들을 훈련 세트로 사용하여 모델을 학습 및 평가
ㄴ 기본값(shuffle=False)에서는 데이터를 순서대로 나누고, shuffle=True 로 지정하면 무작위로 섞은 후에 나눔. 클래스 비율을 고려하지 않으므로 일반적으로 데이터셋이 충분히 큰 경우에 사용
ㄴ StratifiedKFold
ㄴ KFold와 유사하지만 클래스별 비율을 유지하는 데에 초점을 둔 교차 검증 전략
ㄴ 각 폴드에서 훈련 세트와 테스트 세트에 속하는 클래스의 비율이 전체 데이터셋과 동일하게 유지됨
ㄴ 이는 데이터의 클래스 분포가 불균형한 경우에 유용
ex) 이진 분류 문제에서 클래스 A와 클래스 B의 비율이 80:20이라고 가정
ㄴ KFold를 사용하면 특정 폴드에서 클래스 A의 샘플 비율이 80%, 클래스 B의 샘플 비율이 20%가 되지 않을 수 있음
ㄴ StratifiedKFold를 사용하면 각 폴드에서 클래스 A와 클래스 B의 비율을 유지하여 평가를 수행
=> KFold는 클래스 비율과 무관하게 데이터를 균등한 크기로 분할(shuffle=True 인 경우 무작위로 섞음), StratifiedKFold는 클래스 비율을 유지하면서 데이터 분할
KFold
ㄴ 회귀 문제에서의 교차 검증
ml07_kfold_iris.py
# 1. iris
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
# Local imports: the cross-validated evaluation below needs these two helpers
# and this script's import header does not provide them.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Seed TensorFlow's RNG (original note: controls the random weight values).
tf.random.set_seed(77)

# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target

# K-fold splitter
n_splits = 5  # author's note: usually an odd number is used
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# 2. Model
# The scaler is part of the pipeline so that, inside every fold, it is fitted
# on that fold's training part only instead of on the whole data set.
scaler = MinMaxScaler()
model = make_pipeline(scaler, RandomForestClassifier())

# 3. Train / 4. Evaluate
# Previously the model was fitted and scored on the same data and `kfold` was
# never used, so the printed value was training accuracy. Score each held-out
# fold instead and report the mean.
scores = cross_val_score(model, x, y, cv=kfold)  # one accuracy per fold
result = scores.mean()
print('결과 acc : ', result)

# cross_val_score works on clones; fit once more so `model` itself is left
# fitted, as it was before this change.
model.fit(x, y)
# SVC() 결과 acc : 0.9777777777777777
# LinearSVC() 결과 acc : 0.9777777777777777
# my tf keras 모델 결과 acc : 1.0
# MinMaxScaler() 결과 acc : 0.9777777777777777
# ===============================================
# tree 결과 acc : 0.9555555555555556
# ===============================================
# ensemble 결과 acc : 0.9555555555555556
# ===============================================
# KFold 결과 acc : 1.0
ml07_kfold_iris02.py
# 1. iris
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
# Seed TensorFlow's RNG (original note: controls the random weight values).
tf.random.set_seed(77)

# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# K-fold splitter
n_splits = 11  # author's note: usually an odd number is used
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Scaler: fit on the training split only, then transform both splits.
# The transformed arrays must be stored back into x_train / x_test; they were
# previously assigned to `x` and discarded, leaving the model inputs unscaled.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 2. Model
model = RandomForestClassifier()

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate / predict
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # prints one score per fold (n_splits values)

# NOTE(review): cross_val_predict refits the estimator on folds of the test
# split, so these predictions do not come from the model fitted on x_train
# above - confirm this is the intended evaluation.
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# cv pred : [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]
# (one prediction per sample of the 20% test split; the other 80% is train)
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# cv pred acc : 0.9333333333333333
# 결과
# SVC() 결과 acc : 0.9777777777777777
# LinearSVC() 결과 acc : 0.9777777777777777
# my tf keras 모델 결과 acc : 1.0
# MinMaxScaler() 결과 acc : 0.9777777777777777
# ===============================================
# tree 결과 acc : 0.9555555555555556
# ===============================================
# ensemble 결과 acc : 0.9555555555555556
# ===============================================
# KFold 결과 acc : 1.0
# ====================================================================
# cv acc : [0.91666667 1. 0.91666667 0.83333333 1. ]
ㄴ cross_val_score, cross_val_predict 이용
ml07_kfold_cancer.py
# 1. breast cancer
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
# Seed TensorFlow's RNG (original note: controls the random weight values).
tf.random.set_seed(77)

# 1. Data
datasets = load_breast_cancer()  # binary classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# K-fold splitter
n_splits = 11  # author's note: usually an odd number is used
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Scaler: fit on the training split only, then transform both splits.
# The transformed arrays must be stored back into x_train / x_test; they were
# previously assigned to `x` and discarded, leaving the model inputs unscaled.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 2. Model
model = RandomForestClassifier()

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate / predict
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # prints one score per fold (n_splits values)

# NOTE(review): cross_val_predict refits the estimator on folds of the test
# split, so these predictions do not come from the model fitted on x_train
# above - confirm this is the intended evaluation.
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# one prediction per sample of the 20% test split; the other 80% is train
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# 결과
# SVC() 결과 acc : 0.9777777777777777
# LinearSVC() 결과 acc : 0.9777777777777777
# my tf keras 모델 결과 acc : 1.0
# MinMaxScaler() 결과 acc : 0.9777777777777777
# ===============================================
# tree 결과 acc : 0.9555555555555556
# ===============================================
# ensemble 결과 acc : 0.9555555555555556
# ===============================================
# KFold 결과 acc : 1.0
# ====================================================================
# cv acc : [0.97619048 0.95238095 0.95238095 1. 0.95121951 1.
# 0.95121951 0.87804878 0.97560976 0.92682927 0.90243902]
# cv pred : [1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
# 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1
# 0
# 1 1 1 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1
# 0
# 1 1 0]
# cv pred acc : 0.9473684210526315
ml07_kfold_wine.py
# 1. wine
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
# Seed TensorFlow's RNG (original note: controls the random weight values).
tf.random.set_seed(77)

# 1. Data
datasets = load_wine()  # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# K-fold splitter
n_splits = 11  # author's note: usually an odd number is used
random_state = 42
kfold = KFold(n_splits=n_splits, shuffle=True,
              random_state=random_state)

# Scaler: fit on the training split only, then transform both splits.
# The transformed arrays must be stored back into x_train / x_test; they were
# previously assigned to `x` and discarded, leaving the model inputs unscaled.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 2. Model
model = RandomForestClassifier()

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate / predict
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # prints one score per fold (n_splits values)

# NOTE(review): cross_val_predict refits the estimator on folds of the test
# split, so these predictions do not come from the model fitted on x_train
# above - confirm this is the intended evaluation.
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# one prediction per sample of the 20% test split; the other 80% is train
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# 결과
# SVC() 결과 acc : 0.9777777777777777
# LinearSVC() 결과 acc : 0.9777777777777777
# my tf keras 모델 결과 acc : 1.0
# MinMaxScaler() 결과 acc : 0.9777777777777777
# ===============================================
# tree 결과 acc : 0.9555555555555556
# ===============================================
# ensemble 결과 acc : 0.9555555555555556
# ===============================================
# KFold 결과 acc : 1.0
# ====================================================================
# cv acc : [0.84615385 1. 1. 1. 0.92307692 0.92307692
# 1. 1. 1. 1. 1. ]
# cv pred : [0 0 2 0 1 0 1 2 1 2 0 2 0 2 0 1 1 1 0 1 0 0 1 2 2 2 1 1 1 0 0 1 2 0 0 0]
# cv pred acc : 0.9444444444444444
ml07_kfold_california.py
import numpy as np
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
import tensorflow as tf
# Seed TensorFlow's RNG (original note: controls the random weight values).
tf.random.set_seed(77)

# 1. Data: California housing (regression), split 70% train / 30% test.
datasets = fetch_california_housing()
x, y = datasets.data, datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, shuffle=True, random_state=100
)

# K-fold splitter
random_state = 42
n_splits = 11  # author's note: usually an odd number is used
kfold = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

# Scaler (alternatives tried by the author: MinMaxScaler, StandardScaler,
# MaxAbsScaler). Fitted on the training split only; the test split is only
# transformed.
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# 2. Model
model = RandomForestRegressor()

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate / predict
# One score per fold on the training split (cv: cross-validation).
score = cross_val_score(model, x_train, y_train, cv=kfold)
# print('cv acc : ', score)

# Out-of-fold predictions for every sample of the test split.
y_predict = cross_val_predict(model, x_test, y_test, cv=kfold)
# print('cv pred : ', y_predict)

r2 = r2_score(y_test, y_predict)
print('cv pred r2 : ', r2)
# SVR() 결과 r2 : -0.01663695941103427
# 결과 r2 : 0.06830124384888547
# my tf keras 모델 r2스코어 : 0.5346585367965508
# RobustScaler 적용 후 결과 r2 : 0.6873119065345796
# ====================================================
# tree 결과 r2 : 0.612701922946608
# ====================================================
# ensemble 결과 r2 : 0.8114840410530733
# ====================================================
# kfold 결과 r2 : 0.8128621988883818
# ====================================================
# cv acc :
# cv pred :
# cv pred r2 : 0.7822263447056246
StratifiedKFold
ㄴ 레이블 데이터가 왜곡되었을 경우
ㄴ 일반적으로 분류에서의 교차 검증
ml08_stratifiedKFold_iris.py
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
# Seed TensorFlow's RNG (original note: controls the random weight values).
tf.random.set_seed(77)

# 1. Data
datasets = load_iris()  # multiclass classification
x = datasets['data']
y = datasets.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

# Stratified K-fold splitter
n_splits = 11  # author's note: usually an odd number is used
random_state = 42
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                        random_state=random_state)

# Scaler: fit on the training split only, then transform both splits.
# The transformed arrays must be stored back into x_train / x_test; they were
# previously assigned to `x` and discarded, leaving the model inputs unscaled.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# 2. Model
model = RandomForestClassifier()

# 3. Train
model.fit(x_train, y_train)

# 4. Evaluate / predict
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv: cross-validation
# print('cv acc : ', score)  # prints one score per fold (n_splits values)

# NOTE(review): cross_val_predict refits the estimator on folds of the test
# split, so these predictions do not come from the model fitted on x_train
# above - confirm this is the intended evaluation.
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)
# print('cv pred : ', y_predict)
# one prediction per sample of the 20% test split; the other 80% is train
acc = accuracy_score(y_test, y_predict)
print('cv pred acc : ', acc)
# 결과
# SVC() 결과 acc : 0.9777777777777777
# LinearSVC() 결과 acc : 0.9777777777777777
# my tf keras 모델 결과 acc : 1.0
# MinMaxScaler() 결과 acc : 0.9777777777777777
# ===============================================
# tree 결과 acc : 0.9555555555555556
# ===============================================
# ensemble 결과 acc : 0.9555555555555556
# ===============================================
# KFold 결과 acc : 1.0
# ====================================================================
# cv acc : [0.91666667 1. 0.91666667 0.83333333 1. ]
# cv pred : [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]
# cv pred acc : 0.9333333333333333
# stratifiedKFold cv pred acc : 0.9333333333333333