Models Used
CatBoostClassifier, xgboost, LightGBM, DecisionTree, RandomForest, NaiveBayes
Dataset Used
https://www.kaggle.com/c/microsoft-malware-prediction
Common Code
import pandas as pd
import numpy as np
import lightgbm as lgb
#import xgboost as xgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from scipy import sparse
# Scalers
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
import gc
gc.enable()
#Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
dtypes = {
'MachineIdentifier': 'category',
'ProductName': 'category',
'EngineVersion': 'category',
'AppVersion': 'category',
'AvSigVersion': 'category',
'IsBeta': 'int8',
'RtpStateBitfield': 'float16',
'IsSxsPassiveMode': 'int8',
'DefaultBrowsersIdentifier': 'float16',
'AVProductStatesIdentifier': 'float32',
'AVProductsInstalled': 'float16',
'AVProductsEnabled': 'float16',
'HasTpm': 'int8',
'CountryIdentifier': 'int16',
'CityIdentifier': 'float32',
'OrganizationIdentifier': 'float16',
'GeoNameIdentifier': 'float16',
'LocaleEnglishNameIdentifier': 'int8',
'Platform': 'category',
'Processor': 'category',
'OsVer': 'category',
'OsBuild': 'int16',
'OsSuite': 'int16',
'OsPlatformSubRelease': 'category',
'OsBuildLab': 'category',
'SkuEdition': 'category',
'IsProtected': 'float16',
'AutoSampleOptIn': 'int8',
'PuaMode': 'category',
'SMode': 'float16',
'IeVerIdentifier': 'float16',
'SmartScreen': 'category',
'Firewall': 'float16',
'UacLuaenable': 'float32',
'Census_MDC2FormFactor': 'category',
'Census_DeviceFamily': 'category',
'Census_OEMNameIdentifier': 'float16',
'Census_OEMModelIdentifier': 'float32',
'Census_ProcessorCoreCount': 'float16',
'Census_ProcessorManufacturerIdentifier': 'float16',
'Census_ProcessorModelIdentifier': 'float16',
'Census_ProcessorClass': 'category',
'Census_PrimaryDiskTotalCapacity': 'float32',
'Census_PrimaryDiskTypeName': 'category',
'Census_SystemVolumeTotalCapacity': 'float32',
'Census_HasOpticalDiskDrive': 'int8',
'Census_TotalPhysicalRAM': 'float32',
'Census_ChassisTypeName': 'category',
'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
'Census_PowerPlatformRoleName': 'category',
'Census_InternalBatteryType': 'category',
'Census_InternalBatteryNumberOfCharges': 'float32',
'Census_OSVersion': 'category',
'Census_OSArchitecture': 'category',
'Census_OSBranch': 'category',
'Census_OSBuildNumber': 'int16',
'Census_OSBuildRevision': 'int32',
'Census_OSEdition': 'category',
'Census_OSSkuName': 'category',
'Census_OSInstallTypeName': 'category',
'Census_OSInstallLanguageIdentifier': 'float16',
'Census_OSUILocaleIdentifier': 'int16',
'Census_OSWUAutoUpdateOptionsName': 'category',
'Census_IsPortableOperatingSystem': 'int8',
'Census_GenuineStateName': 'category',
'Census_ActivationChannel': 'category',
'Census_IsFlightingInternal': 'float16',
'Census_IsFlightsDisabled': 'float16',
'Census_FlightRing': 'category',
'Census_ThresholdOptIn': 'float16',
'Census_FirmwareManufacturerIdentifier': 'float16',
'Census_FirmwareVersionIdentifier': 'float32',
'Census_IsSecureBootEnabled': 'int8',
'Census_IsWIMBootEnabled': 'float16',
'Census_IsVirtualDevice': 'float16',
'Census_IsTouchEnabled': 'int8',
'Census_IsPenCapable': 'int8',
'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
'Wdft_IsGamer': 'float16',
'Wdft_RegionIdentifier': 'float16',
'HasDetections': 'int8'
}
print('Download Train and Test Data.\n')
train = reduce_mem_usage(pd.read_csv('./train-tree.csv', dtype=dtypes, low_memory=True))
train['MachineIdentifier'] = train.index.astype('uint32')
test = reduce_mem_usage(pd.read_csv('./test-tree.csv', dtype=dtypes, low_memory=True))
test['MachineIdentifier'] = test.index.astype('uint32')
gc.collect()
#Feature Engineering
print('Transform all features to category.\n')
for usecol in train.columns.tolist()[1:-1]:
    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')

    # Fit LabelEncoder on the union of train and test values
    le = LabelEncoder().fit(
        np.unique(train[usecol].unique().tolist() +
                  test[usecol].unique().tolist()))

    # At the end 0 will be used for dropped values
    train[usecol] = le.transform(train[usecol]) + 1
    test[usecol] = le.transform(test[usecol]) + 1

    agg_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier': 'count'})
              .reset_index()
              .rename({'MachineIdentifier': 'Train'}, axis=1))
    agg_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier': 'count'})
              .reset_index()
              .rename({'MachineIdentifier': 'Test'}, axis=1))

    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)
    # Select values with more than 1000 observations
    agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)
    agg['Total'] = agg['Train'] + agg['Test']
    # Drop unbalanced values
    agg = agg[(agg['Train'] / agg['Total'] > 0.2) & (agg['Train'] / agg['Total'] < 0.8)]
    agg[usecol + 'Copy'] = agg[usecol]

    train[usecol] = (pd.merge(train[[usecol]],
                              agg[[usecol, usecol + 'Copy']],
                              on=usecol, how='left')[usecol + 'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))
    test[usecol] = (pd.merge(test[[usecol]],
                             agg[[usecol, usecol + 'Copy']],
                             on=usecol, how='left')[usecol + 'Copy']
                    .replace(np.nan, 0).astype('int').astype('category'))

    del le, agg_tr, agg_te, agg, usecol
    gc.collect()
# Fit OneHotEncoder (note: fitted here on the full frame, i.e. still including
# the HasDetections label and the MachineIdentifier id, and not applied below)
ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train)
print("OneHotEncoder==\n", ohe)

# Dense label vector and feature matrix for the sklearn-style models below
yt = np.array(train['HasDetections'])
xt = np.array(train.drop(['HasDetections'], axis=1))

del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()
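For comparison, the kernel this post is based on (see References) fits the encoder only at this point, after the label and id columns are dropped, and then actually applies it to produce sparse matrices. A minimal sketch of that usage; train_sparse and test_sparse are illustrative names, not variables used elsewhere in this post:

# As in the referenced kernel: fit on the feature-only frames, then convert
# both sets to sparse uint8 one-hot matrices for memory-friendly training.
ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train)
train_sparse = ohe.transform(train)  # scipy.sparse matrix
test_sparse = ohe.transform(test)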
The train dataset was split into training and test sets at an 8:2 ratio.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(xt, yt, test_size=0.2, random_state=42)
Summary by Model
1. CatBoostClassifier
Validation: train_test_split
Accuracy: 66.39%
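The CatBoost training code is not shown in this post; below is a minimal sketch of how the split above could be trained and scored. The hyperparameters (iterations, depth) and the float cast are assumptions, not the settings actually used:

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Hypothetical settings; the post does not record the real ones.
# The object-dtype feature array is cast to float for CatBoost.
cat_model = CatBoostClassifier(iterations=500, depth=6, verbose=0, random_state=42)
cat_model.fit(xtrain.astype(np.float32), ytrain)
cat_pred = cat_model.predict(xtest.astype(np.float32))
print('CatBoost Accuracy: %.2f' % (accuracy_score(ytest, cat_pred) * 100), '%')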
2. xgboost
Validation: train_test_split
Accuracy: 65.47%
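The xgboost training code is likewise not shown; a minimal sketch with hypothetical hyperparameters (n_estimators, max_depth are assumptions):

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Hypothetical settings; cast the object-dtype array to float for xgboost.
xgb_model = XGBClassifier(n_estimators=200, max_depth=6, random_state=42)
xgb_model.fit(xtrain.astype(np.float32), ytrain)
xgb_pred = xgb_model.predict(xtest.astype(np.float32))
print('XGBoost Accuracy: %.2f' % (accuracy_score(ytest, xgb_pred) * 100), '%')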
3. LightGBM
Validation: train_test_split
Accuracy: 64.83%
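The LightGBM training code is also omitted; a minimal sketch using the sklearn wrapper, with hypothetical hyperparameters (n_estimators, num_leaves are assumptions):

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Hypothetical settings; cast the object-dtype array to float for LightGBM.
lgb_model = LGBMClassifier(n_estimators=200, num_leaves=64, random_state=42)
lgb_model.fit(xtrain.astype(np.float32), ytrain)
lgb_pred = lgb_model.predict(xtest.astype(np.float32))
print('LightGBM Accuracy: %.2f' % (accuracy_score(ytest, lgb_pred) * 100), '%')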
4. DecisionTree
Concept
- A model that can handle classification, regression, and even multi-output tasks.
- Fundamentally, a decision tree learns by chaining yes/no questions until it arrives at a decision.
- The terminal nodes are called leaves; edges connect each answer to the next question.
Validation: train_test_split
Accuracy: 63.49%
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
# Imports below support optional tree visualization
from sklearn import tree
from IPython.display import Image
import pydot

# Decision tree limited to max_depth=20
print('DecisionTree-maxDepth(20)')
dTreeLimit4 = DecisionTreeClassifier(max_depth=20, random_state=0)

# Train, then report accuracy on both splits
dTreeLimit4.fit(xtrain, ytrain)
print("Train: training score: {:.2f}".format(dTreeLimit4.score(xtrain, ytrain)))
print("Test: test score: {:.2f}".format(dTreeLimit4.score(xtest, ytest)))
5. RandomForest
Concept: A random forest is an ensemble of decision trees. A single decision tree is enough to do machine learning on its own, but its weakness is a tendency to overfit the training data. Building a random forest out of many decision trees mitigates that overfitting.
Validation: train_test_split
Accuracy: 62.12%
# RandomForest model (1)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

random_forest_model1 = RandomForestClassifier(n_estimators=20,
                                              max_depth=5,
                                              random_state=42)
model1 = random_forest_model1.fit(xtrain, ytrain)
predict1 = model1.predict(xtest)
print('Accuracy1: %.2f' % (accuracy_score(ytest, predict1) * 100), '%')
6. NaiveBayes
Concept
A classification technique used widely for spam mail filtering, text classification, sentiment analysis, recommender systems, and more.
A statistical classification technique based on Bayes' theorem, as formalized below.
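For a class y and features x_1, ..., x_n, Bayes' theorem combined with the "naive" assumption that the features are conditionally independent given the class yields the decision rule

$$\hat{y} = \arg\max_{y} \; P(y) \prod_{i=1}^{n} P(x_i \mid y)$$

which is exactly why the independence requirement listed under the disadvantages matters in practice.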
Advantages
1) A simple, fast, and accurate model
2) Low computation cost (hence the speed)
3) Well suited to large datasets
4) Performs better on discrete data than on continuous data
5) Can also be used for multi-class prediction
Disadvantages
The features must be independent of one another, but in real-world data it is rare for all features to actually be independent.
Validation: train_test_split
Accuracy: 55.28%
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
#Create a Gaussian Classifier
gnb = GaussianNB()
#Train the model using the training sets
gnb.fit(xtrain, ytrain)
#Predict the response for test dataset
y_pred = gnb.predict(xtest)
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(ytest, y_pred))
Conclusion

| | CatBoostClassifier | xgboost | LightGBM | DecisionTree | RandomForest | NaiveBayes |
|---|---|---|---|---|---|---|
| Validation | train_test_split | train_test_split | train_test_split | train_test_split | train_test_split | train_test_split |
| Accuracy | 0.6639 | 0.6547 | 0.6483 | 0.634917 | 0.6212 | 0.5528 |
References
https://www.kaggle.com/bogorodvo/lightgbm-baseline-model-using-sparse-matrix
Naive Bayes