Summary of Commonly Used Kaggle Functions
Original post · 2017-07-03 21:47:34 · Tag: kaggle
I have taken part in quite a few Kaggle competitions, so I am collecting the functions I used while working on them here for easy reference later; of course the list will keep being updated.
1. Data Processing
# Data preprocessing
import pandas as pd

# 1. Read data
data_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'],
                         usecols=['timestamp'] + macro_cols)

# 2. Show the columns whose dtype is object
data_train.dtypes[data_train.dtypes == 'object']

# 3. Change a column's dtype
data_train['material'] = data_train['material'].astype('object')

# 4. Overview of the data
data_train.describe(include=['object'])

# 5. Concatenate two tables vertically (stack rows)
data_all = pd.concat([data_train, data_test], ignore_index=True)

# 6. Merge two tables horizontally (join columns)
data_all = pd.merge(data_all, data_macro, on='timestamp', how='left')

# 7. Separate numeric and object features
object_columns = data_all.columns[data_all.dtypes == 'object']
number_columns = data_all.columns[data_all.dtypes != 'object']

# 8. Group-wise mean of two features
sa_price = train_df.groupby('sub_area')[['work_share', 'price_doc']].mean()
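Once the concatenated data_all has been processed, it usually has to be split back into train and test parts before modelling. A minimal sketch, assuming data_all was built with pd.concat([data_train, data_test], ignore_index=True) as above and that the target column is price_doc:

# Split data_all back into train/test rows (row order is preserved by pd.concat)
n_train = data_train.shape[0]
train_processed = data_all.iloc[:n_train].copy()
test_processed = data_all.iloc[n_train:].copy()

# The target is only defined for the training rows ('price_doc' is an assumption here)
train_y = train_processed.pop('price_doc')
test_processed = test_processed.drop('price_doc', axis=1)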
2. Data Visualization
# Data visualization
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. seaborn plotting tips
# https://zhuanlan.zhihu.com/p/24464836
plt.figure(figsize=(8, 6))
sns.distplot(a=np.log1p(data_train['price_doc']), bins=50, kde=True)
plt.xlabel("price", fontsize=12)
plt.show()

# 2. Missing values per feature, sorted
missing_df = (data_train.isnull().sum(axis=0) / data_train.shape[0]).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.loc[missing_df['missing_count'] > 0]
missing_df = missing_df.sort_values('missing_count', axis=0, ascending=True)

width = 0.8
ind = np.arange(missing_df.shape[0])
fig, ax = plt.subplots(figsize=(12, 18))
ax.barh(ind, missing_df['missing_count'], color='y')
ax.set_yticks(ind)
ax.set_yticklabels(missing_df['column_name'], rotation='horizontal')
ax.set_xlabel("Count of missing values")
ax.set_title("Number of missing values in each column")
plt.show()

train_na = (train_df.isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)

f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
ax.set(title='Percent missing data by feature', ylabel='% missing')

# 3. Correlation heatmap
internal_chars = ['full_sq', 'life_sq', 'floor', 'max_floor', 'build_year',
                  'num_room', 'kitch_sq', 'state', 'price_doc']
corrmat = train_df[internal_chars].corr()

f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)

# 4. Scatter plot
f, ax = plt.subplots(figsize=(10, 7))
ind = train_df[train_df['full_sq'] > 2000].index
plt.scatter(x=train_df.drop(ind)['full_sq'], y=train_df.drop(ind)['price_doc'],
            c='r', alpha=0.5)
ax.set(title='Price by area in sq meters', xlabel='Area', ylabel='Price')

# 5. Count plot
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
sns.countplot(x=train_df['num_room'])
ax.set(title='Distribution of room count', xlabel='num_room')

# 6. Line plot plus fitted regression curve
f, ax = plt.subplots(figsize=(12, 6))
by_price = by_df.groupby('build_year')[['build_year', 'price_doc']].mean()
sns.regplot(x="build_year", y="price_doc", data=by_price, scatter=False,
            order=3, truncate=True)
plt.plot(by_price['build_year'], by_price['price_doc'], color='r')
ax.set(title='Mean price by year of build')

# 7. Violin plot
f, ax = plt.subplots(figsize=(12, 8))
ind = train_df[train_df['state'].isnull()].index
train_df['price_doc_log10'] = np.log10(train_df['price_doc'])
sns.violinplot(x="state", y="price_doc_log10", data=train_df.drop(ind), inner="box")
# sns.swarmplot(x="state", y="price_doc_log10", data=train_df.dropna(), color="w", alpha=.2)
ax.set(title='Log10 of median price by state of home', xlabel='state', ylabel='log10(price)')

# 8. Horizontal bar plot
ax = sns.barplot(x="count", y="sub_area", data=sa_vc, orient="h")
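Beyond the heatmap over a hand-picked column list, it can also help to look at how every numeric feature correlates with the target. A small sketch along the same lines (my own addition; the 'price_doc' target name is assumed):

# Correlation of each numeric feature with the target, as a sorted bar chart
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
corr_with_target = (train_df[numeric_cols]
                    .corrwith(train_df['price_doc'])   # assumed target column
                    .drop('price_doc')
                    .dropna()
                    .sort_values())

f, ax = plt.subplots(figsize=(8, 12))
corr_with_target.plot(kind='barh', ax=ax)
ax.set(title='Correlation with price_doc', xlabel='Pearson correlation')
plt.show()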
3. Feature Engineering
# Feature engineering

# 1. Clip outliers (winsorize the target at the 1st and 99th percentiles)
ulimit = np.percentile(data_train.price_doc.values, 99)
llimit = np.percentile(data_train.price_doc.values, 1)
data_train.loc[data_train['price_doc'] > ulimit, 'price_doc'] = ulimit
data_train.loc[data_train['price_doc'] < llimit, 'price_doc'] = llimit

# 2. Drop features with more than half of their values missing
drop_columns = missing_df.loc[missing_df['missing_count'] > 0.5, 'column_name'].values
data_train.drop(drop_columns, axis=1, inplace=True)
data_test.drop(drop_columns, axis=1, inplace=True)

# 3. Drop abnormal rows
data_all.drop(data_train[data_train["life_sq"] > 7000].index, inplace=True)

# 4. Extract time features
# week of year
# data_all["week_of_year"] = data_all["timestamp"].dt.weekofyear
# day of week
# data_all["day_of_week"] = data_all["timestamp"].dt.weekday
# yearmonth
data_all['yearmonth'] = pd.to_datetime(data_all['timestamp'])
data_all['yearmonth'] = data_all['yearmonth'].dt.year * 100 + data_all['yearmonth'].dt.month
data_all_groupby = data_all.groupby('yearmonth')

# 5. Discretize a continuous feature
data_all['floor_25'] = (data_all['floor'] > 25.0) * 1

# 6. Fill missing values with the group mean (grouped by yearmonth)
for num in number_columns:
    if data_all[num].isnull().sum() > 0:
        data_all[num] = data_all_groupby[num].transform(lambda x: x.fillna(x.mean()))

# 7. One-hot encoding with get_dummies
dummies = pd.get_dummies(data=data_all[ob], prefix="{}#".format(ob))
data_all.drop(ob, axis=1, inplace=True)
data_all = data_all.join(dummies)

# 8. Fill missing values using the median ratio
kitch_ratio = train_df['full_sq'] / train_df['kitch_sq']
train_df['kitch_sq'] = train_df['kitch_sq'].fillna(train_df['full_sq'] / kitch_ratio.median())

# 9. LabelEncoder for object features
from sklearn import preprocessing
for ob in object_columns:
    lbl = preprocessing.LabelEncoder()
    data_train[ob] = lbl.fit_transform(list(data_train[ob].values))

# 10. PCA: explained-variance plot and transformation
from sklearn.decomposition import PCA
components = 20
model = PCA(n_components=components)
model.fit(data_train)
ex_variance = pd.DataFrame({'ex_variance': model.explained_variance_ratio_[0:components],
                            'n_component': range(1, components + 1)})
ax = sns.barplot(x='n_component', y='ex_variance', data=ex_variance)
ax.set_title('PCA_variance_explained')
plt.show()
data_train = model.fit_transform(data_train)
# transform (do not refit) the test set with the PCA fitted on the training data
data_test = model.transform(data_test)
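Step 7 above only encodes a single column ob; a short sketch (not from the original post) of looping it over all of the object columns extracted in section 1:

# One-hot encode every object column in data_all with get_dummies
for ob in object_columns:
    dummies = pd.get_dummies(data_all[ob], prefix="{}#".format(ob))
    data_all = data_all.drop(ob, axis=1).join(dummies)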
4. Building Models
# Building models

# 1. XGBoost
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

train_X = data_train
test_X = data_test
dtrain = xgb.DMatrix(train_X, train_y)

xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Choose the number of boosting rounds with cross-validation and early stopping
cv_output = xgb.cv(dict(xgb_params, silent=0), dtrain, num_boost_round=1000,
                   early_stopping_rounds=20, verbose_eval=20)
num_boost_round = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_round)

# Predict; exp(x) - 1 inverts a log1p-transformed target
preds = np.exp(model.predict(xgb.DMatrix(test_X, feature_names=test_X.columns.values))) - 1

submission = pd.DataFrame()
submission['id'] = test_id
submission["price_doc"] = preds
submission.to_csv("sub.csv", index=False)

# Plot feature importance
%matplotlib inline
fig, ax = plt.subplots(1, 1, figsize=(8, 60))
xgb.plot_importance(model, height=0.5, ax=ax)

# Extract feature importance
importance = model.get_fscore()
df_importance = pd.DataFrame(sorted(importance.items(), key=lambda x: x[1], reverse=True),
                             columns=['feature', 'fscore'])
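GridSearchCV is imported in the block above but never actually used; a minimal sketch of how it could tune a couple of XGBoost parameters through the sklearn wrapper (the parameter values here are just illustrative assumptions):

# Illustrative grid search over two XGBoost hyperparameters
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.9],
}
grid = GridSearchCV(XGBRegressor(learning_rate=0.05, n_estimators=300),
                    param_grid,
                    scoring='neg_mean_squared_error',
                    cv=3)
grid.fit(train_X, train_y)
print(grid.best_params_, grid.best_score_)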
5. Others
# Others

# 1. Remove collinear features with the variance inflation factor (VIF)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor


class ReduceVIF(BaseEstimator, TransformerMixin):
    def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        # The statsmodels function will fail with NaN values, so we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = SimpleImputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        print('ReduceVIF transform')
        columns = X.columns.tolist()
        if hasattr(self, 'imputer'):
            X = pd.DataFrame(self.imputer.transform(X), columns=columns)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped = True
        while dropped:
            # Loop until every remaining column has an acceptable VIF.
            variables = X.columns
            dropped = False
            vif = []
            new_vif = 0
            for var in X.columns:
                new_vif = variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
                vif.append(new_vif)
                if np.isinf(new_vif):
                    break
            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print('Dropping {} with vif={}'.format(X.columns[maxloc], max_vif))
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped = True
        return X


transformer = ReduceVIF()
# Only use the first 50 columns for speed in this example
data_all = transformer.fit_transform(data_train[data_train.columns[0:50]], train_y)
data_all.head()
# 2. Stacking
# Stacking starter based on Allstate Faron's script
# https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867
# Preprocessing from Alexandru Papiu
# https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
import numpy as np
import pandas as pd
import xgboost as xgb
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

SEED = 1
NFOLDS = 3

ntrain = data_train.shape[0]
ntest = data_test.shape[0]
print(ntrain)
print(ntest)

x_train = np.array(data_train)
x_test = np.array(data_test)
y_train = train_y

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)


class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf):
    # Out-of-fold predictions for the training set; fold-averaged predictions for the test set
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


et_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'nrounds': 500
}

rd_params = {'alpha': 10}
ls_params = {'alpha': 0.005}

xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)

print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RD-CV: {}".format(sqrt(mean_squared_error(y_train, rd_oof_train))))
print("LS-CV: {}".format(sqrt(mean_squared_error(y_train, ls_oof_train))))

# Stack the out-of-fold predictions as features for the second-level model
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED,
             stratified=False, early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
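The script above only cross-validates the second-level model; a minimal sketch (not part of the original kernel) of training it with best_nrounds and writing a submission, assuming test_id is available and the target was log1p-transformed as in section 4:

# Fit the level-2 model on the stacked out-of-fold features and predict the test set
final_model = xgb.train(xgb_params, dtrain, num_boost_round=best_nrounds)
stacked_preds = np.exp(final_model.predict(dtest)) - 1   # inverts a log1p target (assumption)

submission = pd.DataFrame({'id': test_id, 'price_doc': stacked_preds})
submission.to_csv("sub_stacked.csv", index=False)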