LGB + KFold 代码 (1)
生活随笔
收集整理的這篇文章主要介紹了
LGB + KFold 代码 (1)
小編覺得挺不錯的,現在分享給大家,供大家參考。
來自Kaggle的一個比賽。
原網址鏈接
目的是將此代碼保存起來,方便以后用。
代碼如下:
代碼有些復雜,不過能給我們提供一種思路。
"""LightGBM + 5-fold StratifiedKFold baseline for the Kaggle Microsoft
Malware Prediction competition.

Pipeline:
  1. Read train/test CSVs with explicit narrow dtypes to fit kernel memory.
  2. Label-encode every feature on the union of train+test values, then
     keep only frequent (>1000 rows) and train/test-balanced categories;
     everything else is mapped to 0.
  3. One-hot encode into a sparse matrix (chunked to cap peak memory) and
     cache to .npz.
  4. Train one LGBMClassifier per stratified fold with early stopping and
     average the 5 test-set probability predictions into a submission.
"""
import gc

import numpy as np
import pandas as pd
import lightgbm as lgb
# vstack: stack sparse matrices vertically (row-wise).
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold

gc.enable()

# Explicit per-column dtypes keep the very large competition CSVs within
# kernel memory limits (float16/float32 columns may contain NaN; int
# columns are NaN-free).
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}

print('Download Train and Test Data.\n')
train = pd.read_csv('../input/train.csv', dtype=dtypes, low_memory=True)
# Replace the string MachineIdentifier with the compact row index so the
# per-category row counts below are cheap.
train['MachineIdentifier'] = train.index.astype('uint32')
test = pd.read_csv('../input/test.csv', dtype=dtypes, low_memory=True)
test['MachineIdentifier'] = test.index.astype('uint32')

gc.collect()

print('Transform all features to category.\n')
# Every column except MachineIdentifier (first) and HasDetections (last).
for usecol in train.columns.tolist()[1:-1]:
    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')

    # Fit LabelEncoder on the union of train and test values so both
    # frames share one integer vocabulary.
    le = LabelEncoder().fit(
        np.unique(train[usecol].unique().tolist()
                  + test[usecol].unique().tolist()))

    # Shift codes by 1 so that 0 is free to mark dropped values below.
    train[usecol] = le.transform(train[usecol]) + 1
    test[usecol] = le.transform(test[usecol]) + 1

    # Per-category row counts in train and test.
    agg_tr = (train.groupby([usecol])
                   .aggregate({'MachineIdentifier': 'count'})
                   .reset_index()
                   .rename({'MachineIdentifier': 'Train'}, axis=1))
    agg_te = (test.groupby([usecol])
                  .aggregate({'MachineIdentifier': 'count'})
                  .reset_index()
                  .rename({'MachineIdentifier': 'Test'}, axis=1))

    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)
    # Keep only values with more than 1000 train observations...
    agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)
    agg['Total'] = agg['Train'] + agg['Test']
    # ...and drop values whose train/test split is badly unbalanced
    # (likely distribution drift between the two sets).
    agg = agg[(agg['Train'] / agg['Total'] > 0.2)
              & (agg['Train'] / agg['Total'] < 0.8)]
    agg[usecol + 'Copy'] = agg[usecol]

    # Left-merge maps surviving codes to themselves and everything else
    # to NaN -> 0 ("dropped").
    train[usecol] = (pd.merge(train[[usecol]],
                              agg[[usecol, usecol + 'Copy']],
                              on=usecol, how='left')[usecol + 'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))
    test[usecol] = (pd.merge(test[[usecol]],
                             agg[[usecol, usecol + 'Copy']],
                             on=usecol, how='left')[usecol + 'Copy']
                    .replace(np.nan, 0).astype('int').astype('category'))

    del le, agg_tr, agg_te, agg
    gc.collect()

y_train = np.array(train['HasDetections'])
train_ids = train.index
test_ids = test.index

del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier']
gc.collect()

print("If you don't want use Sparse Matrix choose Kernel Version 2 to get simple solution.\n")

print('--------------------------------------------------------------------------------------------------------')
print('Transform Data to Sparse Matrix.')
print('Sparse Matrix can be used to fit a lot of models, eg. XGBoost, LightGBM, Random Forest, K-Means and etc.')
print('To concatenate Sparse Matrices by column use hstack()')
print('Read more about Sparse Matrix https://docs.scipy.org/doc/scipy/reference/sparse.html')
print('Good Luck!')
print('--------------------------------------------------------------------------------------------------------')

# Fit OneHotEncoder on the full train frame (all columns are small
# categorical codes by now).
ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train)

# Transform in chunks of m rows to reduce peak memory usage.
m = 100000


def _n_chunks(n):
    # Ceil division; avoids a trailing empty chunk when n % m == 0
    # (the original `n // m + 1` emitted one).
    return -(-n // m)


train = vstack([ohe.transform(train[i * m:(i + 1) * m])
                for i in range(_n_chunks(train.shape[0]))])
test = vstack([ohe.transform(test[i * m:(i + 1) * m])
               for i in range(_n_chunks(test.shape[0]))])
# Cache the encoded matrices so each fold can reload them after freeing RAM.
save_npz('train.npz', train, compressed=True)
save_npz('test.npz', test, compressed=True)

del ohe, train, test
gc.collect()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sum of per-fold predicted probabilities; divided by fold count at the end.
lgb_test_result = np.zeros(test_ids.shape[0])
counter = 0

print('\nLightGBM\n')

# train_index / test_index are positional indices into train_ids / y_train;
# since train_ids is just the row index they also index the sparse matrix.
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter + 1))
    train = load_npz('train.npz')

    # Gather fold rows in chunks of m to cap peak memory.
    X_fit = vstack([train[train_index[i * m:(i + 1) * m]]
                    for i in range(_n_chunks(train_index.shape[0]))])
    X_val = vstack([train[test_index[i * m:(i + 1) * m]]
                    for i in range(_n_chunks(test_index.shape[0]))])
    X_fit = csr_matrix(X_fit, dtype='float32')
    X_val = csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]

    del train
    gc.collect()

    lgb_model = lgb.LGBMClassifier(
        max_depth=-1,            # no depth limit; lower if overfitting
        n_estimators=30000,      # upper bound; early stopping decides the real count
        learning_rate=0.05,
        num_leaves=2 ** 12 - 1,  # max leaves per tree
        colsample_bytree=0.28,   # feature subsampling ratio per tree
        objective='binary',
        n_jobs=-1)               # use all CPU cores

    lgb_model.fit(X_fit, y_fit,
                  eval_metric='auc',
                  eval_set=[(X_val, y_val)],
                  verbose=100,                # log validation AUC every 100 rounds
                  early_stopping_rounds=100)  # stop after 100 rounds without AUC gain

    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

    test = load_npz('test.npz')
    test = csr_matrix(test, dtype='float32')
    # Accumulate this fold's positive-class probabilities.
    lgb_test_result += lgb_model.predict_proba(test)[:, 1]
    counter += 1

    del test
    gc.collect()

submission = pd.read_csv('../input/sample_submission.csv')
# Average of the per-fold predicted probabilities.
submission['HasDetections'] = lgb_test_result / counter
submission.to_csv('lgb_submission.csv', index=False)

print('\nDone.')
總結
以上是生活随笔為你收集整理的《LGB + KFold 代码 (1)》的全部內容,希望本文能幫你解決所遇到的問題。
- 上一篇: 作者:姚前(1970-),男,中国人民银
- 下一篇: 作者:牛新(1983-),男,博士,国防