Tianchi Beginner Competition: Steam Prediction
First, take a look at the data.
#coding:utf-8
"""
Created on Wed Jan 9 2019

@author: fzh
"""
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, cross_val_predict, KFold
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from scipy import stats
import os

# load dataset
with open("data/zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
    print('data_train.shape=', data_train.shape)

with open("data/zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")
    print('data_test.shape=', data_test.shape)

# merge train set and test set, tagging each row with its origin
data_train["oringin"] = "train"
data_test["oringin"] = "test"
data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
# view the merged data
print('data_all.shape=', data_all.shape)

# explore the feature distributions of train vs. test
fig = plt.figure(figsize=(6, 6))
for column in data_all.columns[0:-2]:
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "train")], color="Red", shade=True)
    g = sns.kdeplot(data_all[column][(data_all["oringin"] == "test")], color="Blue", shade=True)
    g.set(xlabel=column, ylabel='Frequency')
    g = g.legend(["train", "test"])
    plt.show()

The output shows that the training set has 2888 rows and the test set has 1925 rows. Next, compare the distribution of each column in train and test; the x-axis is the feature name, from V0 to V37. Taking V4 and V5 as examples: the train and test distributions of V4 are close, while those of V5 differ noticeably. "V9", "V11", "V17", "V22" and "V28" show similarly large gaps, so these column features can be dropped.
fig = plt.figure(figsize=(10, 10))
for i in range(len(data_all.columns) - 2):
    # build a facet grid split by origin (train vs. test)
    g = sns.FacetGrid(data_all, col='oringin')
    # plot the histogram of each feature with map
    g = g.map(sns.distplot, data_all.columns[i])
    plt.savefig('distplot' + str(i) + '.jpg')

V5 is the example shown here.
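The features to drop are picked by eye from the KDE and FacetGrid plots. As an optional cross-check that is not part of the original code, a two-sample Kolmogorov-Smirnov test can quantify how different each feature's train and test distributions are; the 0.01 p-value cut-off below is only an illustrative choice.

# Optional cross-check (not in the original): quantify train/test distribution shift per feature.
from scipy.stats import ks_2samp

shifted = []
for col in [c for c in data_all.columns if c not in ("oringin", "target")]:
    train_vals = data_all.loc[data_all["oringin"] == "train", col]
    test_vals = data_all.loc[data_all["oringin"] == "test", col]
    stat, p = ks_2samp(train_vals, test_vals)
    if p < 0.01:  # illustrative threshold, not tuned
        shifted.append((col, round(stat, 3)))
print('features whose train/test distributions differ noticeably:', shifted)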
刪掉"V5","V9","V11","V17","V22","V28"這些列特征。?
"""刪除特征"V5","V9","V11","V17","V22","V28",訓練集和測試集分布不一致""" data_all.drop(["V5","V9","V11","V17","V22","V28"],axis=1,inplace=True) print('drop after data_all.shape=',data_all.shape) """刪除特征"V5","V9","V11","V17","V22","V28",訓練集和測試集分布不一致""" data_all.drop(["V5","V9","V11","V17","V22","V28"],axis=1,inplace=True) print('drop after data_all.shape=',data_all.shape)# figure parameters data_train= data_all[data_all["oringin"] == "train"].drop("oringin", axis=1) print('drop after data_train.shape=',data_train.shape)"""找出相關程度""" plt.figure(figsize=(20, 16)) # 指定繪圖對象寬度和高度 colnm = data_train.columns.tolist() # 列表頭 mcorr = data_train.corr(method="spearman") # 相關系數矩陣,即給出了任意兩個變量之間的相關系數 print('mcorr.shape=',mcorr.shape) mask = np.zeros_like(mcorr, dtype=np.bool) # 構造與mcorr同維數矩陣 為bool型 #畫上三角相關系數矩陣 mask[np.triu_indices_from(mask)] = True # 上三角為1 g = sns.heatmap(mcorr, mask=mask, cmap=plt.cm.jet, square=True, annot=True, fmt='0.2f') # 熱力圖(看兩兩相似度) plt.savefig('mcorr.jpg')丟掉與target相關系數小于0.1的特征?
# threshold for removing weakly correlated variables
threshold = 0.1
# absolute-value correlation matrix
corr_matrix = data_train.corr().abs()
drop_col = corr_matrix[corr_matrix["target"] < threshold].index
print('drop_col=', drop_col)
data_all.drop(drop_col, axis=1, inplace=True)
print('data_all.shape=', data_all.shape)

Apply min-max normalisation to every feature column.
"""歸一化""" cols_numeric=list(data_all.columns) cols_numeric.remove("oringin") def scale_minmax(col):return (col-col.min())/(col.max()-col.min()) scale_cols = [col for col in cols_numeric if col!='target'] print('scale_cols=',scale_cols) data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax,axis=0) print('data_all[scale_cols].shape=',data_all[scale_cols].shape)劃分train,valid,和test
"""function to get training samples""" def get_training_data():df_train = data_all[data_all["oringin"]=="train"]print('df_train.shape=',df_train.shape)y = df_train.targetX = df_train.drop(["oringin","target"],axis=1)X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.3,random_state=100)return X_train,X_valid,y_train,y_valid"""extract test data (without SalePrice)""" def get_test_data():df_test = data_all[data_all["oringin"]=="test"].reset_index(drop=True)return df_test.drop(["oringin","target"],axis=1)"""metric for evaluation""" def rmse(y_true, y_pred):diff = y_pred - y_truesum_sq = sum(diff ** 2)n = len(y_pred)return np.sqrt(sum_sq / n)def mse(y_ture, y_pred):return mean_squared_error(y_ture, y_pred)利用Ridge回歸去掉邊界點,邊界點滿足誤差的正態分布方差大于3
"""function to detect outliers based on the predictions of a model""" def find_outliers(model, X, y, sigma=3):# predict y values using modeltry:y_pred = pd.Series(model.predict(X), index=y.index)# if predicting fails, try fitting the model firstexcept:model.fit(X, y)y_pred = pd.Series(model.predict(X), index=y.index)# calculate residuals between the model prediction and true y valuesresid = y - y_predmean_resid = resid.mean()std_resid = resid.std()# calculate z statistic, define outliers to be where |z|>sigmaz = (resid - mean_resid) / std_resid#找出方差大于3的數據的索引,然后丟掉outliers = z[abs(z) > sigma].index# print and plot the resultsprint('score=', model.score(X, y))print('rmse=', rmse(y, y_pred))print("mse=", mean_squared_error(y, y_pred))print('---------------------------------------')print('mean of residuals:', mean_resid)print('std of residuals:', std_resid)print('---------------------------------------')print(len(outliers), 'outliers:')print(outliers.tolist())plt.figure(figsize=(15, 5))plt.subplot(1, 3, 1)plt.plot(y, y_pred, '.')plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')plt.legend(['Accepted', 'Outlier'])plt.xlabel('y')plt.ylabel('y_pred')plt.subplot(1, 3, 2)plt.plot(y, y - y_pred, '.')plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')plt.legend(['Accepted', 'Outlier'])plt.xlabel('y')plt.ylabel('y - y_pred')plt.subplot(1, 3, 3)plt.hist(z,bins=50)plt.hist(z.loc[outliers],color='r', bins=50)plt.legend(['Accepted', 'Outlier'])plt.xlabel('normal res error')plt.ylabel('frequency')plt.savefig('outliers.png')return outliers# get training data from sklearn.linear_model import Ridge X_train, X_valid,y_train,y_valid = get_training_data() # find and remove outliers using a Ridge model outliers = find_outliers(Ridge(), X_train, y_train)將移除的點進行訓練
""" permanently remove these outliers from the data""" X_t=X_train.drop(outliers) y_t=y_train.drop(outliers) # def get_trainning_data_omitoutliers():y1=y_t.copy()X1=X_t.copy()return X1,y1from sklearn.preprocessing import StandardScalerdef train_model(model, param_grid,splits=5, repeats=5):X, y = get_trainning_data_omitoutliers()poly_trans=PolynomialFeatures(degree=2)X=poly_trans.fit_transform(X)X=MinMaxScaler().fit_transform(X)# create cross-validation methodrkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)# perform a grid search if param_grid givenif len(param_grid) > 0:# setup grid search parametersgsearch = GridSearchCV(model, param_grid, cv=rkfold,scoring="neg_mean_squared_error",verbose=1, return_train_score=True)# search the gridgsearch.fit(X, y)# extract best model from the gridmodel = gsearch.best_estimator_best_idx = gsearch.best_index_# get cv-scores for best modelgrid_results = pd.DataFrame(gsearch.cv_results_)cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])cv_std = grid_results.loc[best_idx, 'std_test_score']# no grid search, just cross-val score for given modelelse:grid_results = []cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)cv_mean = abs(np.mean(cv_results))cv_std = np.std(cv_results)# combine mean and std cv-score in to a pandas seriescv_score = pd.Series({'mean': cv_mean, 'std': cv_std})# predict y using the fitted modely_pred = model.predict(X)# print stats on model performanceprint('----------------------')print(model)print('----------------------')print('score=', model.score(X, y))print('rmse=', rmse(y, y_pred))print('mse=', mse(y, y_pred))print('cross_val: mean=', cv_mean, ', std=', cv_std)return model, cv_score, grid_results # # places to store optimal models and scores opt_models = dict() score_models = pd.DataFrame(columns=['mean','std'])# no. k-fold splits splits=5 # no. k-fold iterations repeats=5print('=========Ridge model========================') model = 'Ridge' opt_models[model] = Ridge() alph_range = np.arange(0.25,6,0.25) param_grid = {'alpha': alph_range} opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,splits=splits, repeats=repeats)cv_score.name = model score_models = score_models.append(cv_score)plt.figure() plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats)) plt.xlabel('alpha') plt.ylabel('score') plt.show() #print('===========RandomForest model================') #model = 'RandomForest' #opt_models[model] = RandomForestRegressor() #param_grid = {'n_estimators':[100,150,200],'max_features':[8,12,16,20,24],'min_samples_split':[2,4,6]}#opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid,splits=5, repeats=1) #cv_score.name = model #score_models = score_models.append(cv_score) #print('score_models=',score_models) #import pickle #with open("prediction.pkl", "wb") as f: # pickle.dump(opt_models[model], f)明顯score高了。
A RandomForest model was trained afterwards and saved to disk.
The full train.py:
#coding:utf-8
"""
Created on Wed Jan 9 2019

@author: fzh
"""
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score, cross_val_predict, KFold
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostClassifier
# from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from scipy import stats
from beautifultable import BeautifulTable

def del_feature(data_train, data_test):
    data_train["oringin"] = "train"
    data_test["oringin"] = "test"
    data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
    """drop the features "V5", "V9", "V11", "V17", "V22", "V28": their train and test distributions are inconsistent"""
    data_all.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1, inplace=True)
    # print('drop after data_all.shape=', data_all.shape)
    data_train = data_all[data_all["oringin"] == "train"].drop("oringin", axis=1)
    # print('drop after data_train.shape=', data_train.shape)
    # the correlation filter below ends up dropping 'V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'
    # threshold for removing weakly correlated variables
    threshold = 0.1
    # absolute-value correlation matrix
    corr_matrix = data_train.corr().abs()
    drop_col = corr_matrix[corr_matrix["target"] < threshold].index
    # print('drop_col=', drop_col)
    data_all.drop(drop_col, axis=1, inplace=True)
    # print('data_all.shape=', data_all.shape)
    return data_all

"""function to get training samples"""
def get_training_data(data_all):
    df_train = data_all[data_all["oringin"] == "train"]
    y = df_train.target
    X = df_train.drop(["oringin", "target"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=100)
    return X_train, X_valid, y_train, y_valid

"""extract test data (it has no target values)"""
def get_test_data(data_all):
    df_test = data_all[data_all["oringin"] == "test"].reset_index(drop=True)
    return df_test.drop(["oringin", "target"], axis=1)

"""metrics for evaluation"""
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff ** 2)
    n = len(y_pred)
    return np.sqrt(sum_sq / n)

def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

"""function to detect outliers based on the predictions of a model"""
def find_outliers(model, X, y, sigma=3):
    # predict y values using the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if predicting fails, try fitting the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # calculate residuals between the model prediction and the true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()

    # calculate the z statistic; collect the indices of points with |z| > sigma so they can be dropped
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index

    # print the results
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print("mse=", mean_squared_error(y, y_pred))
    print('---------------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('---------------------------------------')
    return outliers

def get_trainning_data_omitoutliers(X_t, y_t):
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1

def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

def normal(data_all):
    """min-max normalisation"""
    cols_numeric = list(data_all.columns)
    cols_numeric.remove("oringin")
    scale_cols = [col for col in cols_numeric if col != 'target']
    print('scale_cols=', scale_cols)
    data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)
    return data_all

if __name__ == '__main__':
    with open("data/zhengqi_train.txt") as fr:
        data_train = pd.read_table(fr, sep="\t")
    with open("data/zhengqi_test.txt") as fr_test:
        data_test = pd.read_table(fr_test, sep="\t")
    data_all = del_feature(data_train, data_test)
    print('clear data_all.shape', data_all.shape)
    data_all = normal(data_all)
    X_train, X_valid, y_train, y_valid = get_training_data(data_all)
    print('X_train.shape=', X_train.shape)
    print('X_valid.shape=', X_valid.shape)
    X_test = get_test_data(data_all)
    print('X_test.shape', X_test.shape)

    # find and remove outliers using a Ridge model
    outliers = find_outliers(Ridge(), X_train, y_train)
    """permanently remove these outliers from the data"""
    X_train, y_train = get_trainning_data_omitoutliers(X_train.drop(outliers), y_train.drop(outliers))
    X1 = pd.concat([X_train, y_train], axis=1)
    X2 = pd.concat([X_valid, y_valid], axis=1)
    X_all = pd.concat([X1, X2], axis=0)
    print(X_all)
    y = X_all['target']
    X = X_all.drop(["target"], axis=1)
    print(X.shape)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=100)
    poly_trans = PolynomialFeatures(degree=2)
    X_train = poly_trans.fit_transform(X_train)
    print(X_train.shape)
    X_valid = poly_trans.fit_transform(X_valid)
    print(X_valid.shape)

    print('==============forest_model========================')
    forest_model = RandomForestRegressor(n_estimators=500,
                                         criterion='mse',
                                         max_depth=20,
                                         min_samples_leaf=3,
                                         max_features=0.4,
                                         random_state=1,
                                         bootstrap=False,
                                         n_jobs=-1)
    forest_model.fit(X_train, y_train)
    importance = forest_model.feature_importances_
    table = BeautifulTable()
    # table.column_headers = ["feature", "importance"]
    print('RF feature importance:')
    # print(data_all)
    for i, cols in enumerate(X_all.iloc[:, :-1]):
        table.append_row([cols, round(importance[i], 3)])
    print(table)
    y_pred = forest_model.predict(X_valid)
    y_valid_rmse = rmse(y_valid, y_pred)
    print('y_valid_rmse=', y_valid_rmse)
    y_valid_mse = mse(y_valid, y_pred)
    print('y_valid_mse=', y_valid_mse)
    y_valid_score = forest_model.score(X_valid, y_valid)
    print('y_valid_score=', y_valid_score)

    # save the trained model, then reload it and confirm the scores match
    with open("forest_model.pkl", "wb") as f:
        pickle.dump(forest_model, f)
    with open("forest_model.pkl", "rb") as f:
        model = pickle.load(f)
    y_pred = model.predict(X_valid)
    y_valid_rmse = rmse(y_valid, y_pred)
    print('y_valid_rmse=', y_valid_rmse)
    y_valid_mse = mse(y_valid, y_pred)
    print('y_valid_mse=', y_valid_mse)
    y_valid_score = model.score(X_valid, y_valid)
    print('y_valid_score=', y_valid_score)

inference.py is as follows:
#coding:utf-8
"""
Created on Wed Jan 9 2019

@author: fzh
"""
import pickle
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

def del_feature(data_train, data_test):
    data_train["oringin"] = "train"
    data_test["oringin"] = "test"
    data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
    """drop the features "V5", "V9", "V11", "V17", "V22", "V28": their train and test distributions are inconsistent"""
    data_all.drop(["V5", "V9", "V11", "V17", "V22", "V28"], axis=1, inplace=True)
    # print('drop after data_all.shape=', data_all.shape)
    data_train = data_all[data_all["oringin"] == "train"].drop("oringin", axis=1)
    # print('drop after data_train.shape=', data_train.shape)
    # the correlation filter below ends up dropping 'V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'
    # threshold for removing weakly correlated variables
    threshold = 0.1
    # absolute-value correlation matrix
    corr_matrix = data_train.corr().abs()
    drop_col = corr_matrix[corr_matrix["target"] < threshold].index
    # print('drop_col=', drop_col)
    data_all.drop(drop_col, axis=1, inplace=True)
    # print('data_all.shape=', data_all.shape)
    return data_all

def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

def normal(data_all):
    """min-max normalisation"""
    cols_numeric = list(data_all.columns)
    cols_numeric.remove("oringin")
    scale_cols = [col for col in cols_numeric if col != 'target']
    print('scale_cols=', scale_cols)
    data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)
    return data_all

"""extract test data (it has no target values)"""
def get_test_data(data_all):
    df_test = data_all[data_all["oringin"] == "test"].reset_index(drop=True)
    return df_test.drop(["oringin", "target"], axis=1)

if __name__ == '__main__':
    with open("data/zhengqi_train.txt") as fr:
        data_train = pd.read_table(fr, sep="\t")
    with open("data/zhengqi_test.txt") as fr_test:
        data_test = pd.read_table(fr_test, sep="\t")
    data_all = del_feature(data_train, data_test)
    print('clear data_all.shape', data_all.shape)
    data_all = normal(data_all)
    X_test = get_test_data(data_all)
    print('X_test.shape', X_test.shape)
    poly_trans = PolynomialFeatures(degree=2)
    X_test = poly_trans.fit_transform(X_test)
    print(X_test.shape)

    with open("forest_model.pkl", "rb") as f:
        model = pickle.load(f)
    X_pre = model.predict(X_test)
    print(X_pre.shape)
    X_pre = list(map(lambda x: round(x, 3), X_pre))
    X_pre = np.reshape(X_pre, (-1, 1))
    print(X_pre.shape)
    X_pre = pd.DataFrame(X_pre)
    print(X_pre)
    X_pre.to_csv('result.txt', index=False, header=False)

This scored 0.1298 on the leaderboard, ranking a bit over 100th. Not bad for a first attempt.
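For anyone reproducing this, a quick sanity check of result.txt before uploading (not part of the original script): the submission should be a single headerless column with one float per test row and no missing values.

# Sanity check (not in the original): result.txt should have exactly as many rows as the test set (1925).
sub = pd.read_csv('result.txt', header=None)
print(sub.shape)                  # expected: (1925, 1)
print(sub.isnull().values.any())  # expected: False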