Big Data Challenge (Expert Edition)
Unlike my own scattershot approach, the experts split each stage of the competition into its own file, which is both easier to read and easier to modify. The stages are:
1. Data processing
2. Feature selection
3. Parameter tuning
4. Model ensembling
5. Handling overfitting
During feature extraction they computed 12 statistical features for every sequence they could extract, a hundred-odd features in total... perhaps that is the gap between me and the experts.
Here is the feature-extraction code:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 8 10:45:13 2017
@author: Yang  E-mail: xieear@qq.com
"""
# Runtime: roughly 30 min
import os
import json
import warnings

import pandas as pd
import numpy as np
# from sklearn.externals.joblib import Parallel, delayed

warnings.filterwarnings("ignore")


# Raw data processing
def data_process(data):
    # Each 'point' record is "x,y,t;x,y,t;...;" with a trailing ';', hence the [:-1]
    data['point'] = data['point'].apply(
        lambda x: [list(map(float, point.split(','))) for point in x.split(';')[:-1]])
    data['target'] = data['target'].apply(lambda x: list(map(float, x.split(","))))
    # Extract the x coordinates, y coordinates, t, and the target point's x/y coordinates
    df = pd.DataFrame()
    df['x'] = data['point'].apply(lambda x: np.array(x)[:, 0])
    df['y'] = data['point'].apply(lambda x: np.array(x)[:, 1])
    df['t'] = data['point'].apply(lambda x: np.array(x)[:, 2])
    df['target_x'] = np.array(data['target'].tolist())[:, 0]
    df['target_y'] = np.array(data['target'].tolist())[:, 1]
    return df


# First differences
def data_diff(data, name_list):
    for name in name_list:
        data['diff_' + name] = data[name].apply(lambda x: pd.Series(x).diff().dropna().tolist())
        # NB: a track with a single point produces an empty diff list
        data['diff_' + name] = data['diff_' + name].apply(lambda x: [0] if x == [] else x)
    return data


# Distance features
def get_dist(data):
    dist_target = []
    dist = []
    dist_x_target = []
    dist_y_target = []
    # Distance from each point to the target point
    for x, y, target_x, target_y in zip(data['x'], data['y'], data['target_x'], data['target_y']):
        dist_target.append(np.sqrt((x - target_x) ** 2 + (y - target_y) ** 2))
    # Distance between consecutive points
    for x, y in zip(data['diff_x'], data['diff_y']):
        dist.append(np.sqrt(np.array(x) ** 2 + np.array(y) ** 2))
    # Distance between each point's x coordinate and the target's x coordinate
    for x, target_x in zip(data['x'], data['target_x']):
        dist_x_target.append(np.sqrt((x - target_x) ** 2))
    # Distance between each point's y coordinate and the target's y coordinate
    for y, target_y in zip(data['y'], data['target_y']):
        dist_y_target.append(np.sqrt((y - target_y) ** 2))
    data['dist_target'] = dist_target
    data['dist'] = dist
    data['dist_x_target'] = dist_x_target
    data['dist_y_target'] = dist_y_target
    return data


# Velocity features
def get_v(data):
    v = []
    v_x = []
    v_y = []
    # Speed between consecutive points
    for dist, t in zip(data['dist'], data['diff_t']):
        v0 = dist / np.array(t)
        # NB: guard the divide-by-zero cases (inf and nan)
        v0 = list(map(lambda x: 0 if x == np.inf or x == -np.inf or np.isnan(x) else x, v0))
        v.append(v0)
    # Velocity along x
    for x, t in zip(data['diff_x'], data['diff_t']):
        v1 = np.array(x) / np.array(t)
        v1 = list(map(lambda x: 0 if x == np.inf or x == -np.inf or np.isnan(x) else x, v1))
        v_x.append(v1)
    # Velocity along y
    for y, t in zip(data['diff_y'], data['diff_t']):
        v2 = np.array(y) / np.array(t)
        v2 = list(map(lambda x: 0 if x == np.inf or x == -np.inf or np.isnan(x) else x, v2))
        v_y.append(v2)
    data['v'] = v
    data['v_x'] = v_x
    data['v_y'] = v_y
    return data


# Acceleration features
def get_a(data):
    a = []
    a_x = []
    a_y = []
    # Acceleration between consecutive points
    for v, t in zip(data['diff_v'], data['diff_t']):
        v = np.array(v)
        t = np.array(t)
        a_t = (t[:-1] + t[1:]) / 2
        a0 = v / a_t
        # NB: guard the divide-by-zero case
        a0 = list(map(lambda x: 0 if x == np.inf or x == -np.inf else x, a0))
        if a0 == []:  # NB: empty list
            a0 = [0]
        a.append(a0)
    # Acceleration along x
    for v_x_i, t in zip(data['diff_v_x'], data['diff_t']):
        v_x_i = np.array(v_x_i)
        t = np.array(t)
        a_t = (t[:-1] + t[1:]) / 2
        a1 = v_x_i / a_t
        a1 = list(map(lambda x: 0 if x == np.inf or x == -np.inf else x, a1))
        if a1 == []:
            a1 = [0]
        a_x.append(a1)
    # Acceleration along y
    for v_y_i, t in zip(data['diff_v_y'], data['diff_t']):
        v_y_i = np.array(v_y_i)
        t = np.array(t)
        a_t = (t[:-1] + t[1:]) / 2
        a2 = v_y_i / a_t
        a2 = list(map(lambda x: 0 if x == np.inf or x == -np.inf else x, a2))
        if a2 == []:
            a2 = [0]
        a_y.append(a2)
    data['a'] = a
    data['a_x'] = a_x
    data['a_y'] = a_y
    return data


# The statistical features computed for every sequence
def get_feature(data, name):
    dfGroup = pd.DataFrame()
    dfGroup[name + '_start'] = data.apply(lambda x: x[0])
    dfGroup[name + '_end'] = data.apply(lambda x: x[len(x) - 1])
    dfGroup[name + '_max'] = data.apply(lambda x: max(x))
    dfGroup[name + '_min'] = data.apply(lambda x: min(x))
    dfGroup[name + '_ptp'] = dfGroup[name + '_max'].sub(dfGroup[name + '_min'])
    dfGroup[name + '_mean'] = data.apply(lambda x: np.mean(x))
    dfGroup[name + '_std'] = data.apply(lambda x: np.std(x))
    # Coefficient of variation, with inf/nan mapped to 0
    dfGroup[name + '_cv'] = dfGroup[name + '_std'].div(dfGroup[name + '_mean'], fill_value=0)
    dfGroup[name + '_cv'] = dfGroup[name + '_cv'].replace([np.inf, -np.inf], [0, 0])
    dfGroup[name + '_cv'] = dfGroup[name + '_cv'].fillna(0)
    # np.percentile expects q in [0, 100]
    dfGroup[name + '_Q1'] = data.apply(lambda x: np.percentile(x, 25))
    dfGroup[name + '_Q2'] = data.apply(lambda x: np.percentile(x, 50))
    dfGroup[name + '_Q3'] = data.apply(lambda x: np.percentile(x, 75))
    dfGroup[name + '_interRan'] = dfGroup[name + '_Q3'].sub(dfGroup[name + '_Q1'])
    dfGroup[name + '_skew'] = data.apply(lambda x: pd.Series(x).skew()).fillna(0)
    dfGroup[name + '_kurt'] = data.apply(lambda x: pd.Series(x).kurt()).fillna(0)
    return dfGroup


def get_point_feature(df):
    point_x = get_feature(df['x'], 'x')
    point_y = get_feature(df['y'], 'y')
    point = pd.concat([point_x, point_y], axis=1)
    point['target_x'] = df['target_x'].values
    point['target_y'] = df['target_y'].values
    return point


def get_dist_feature(df):
    dist_target = get_feature(df['dist_target'], 'dist_target')
    dist_x_target = get_feature(df['dist_x_target'], 'dist_x_target')
    dist_y_target = get_feature(df['dist_y_target'], 'dist_y_target')
    diff = get_feature(df['dist'], 'dist')
    diff_x = get_feature(df['diff_x'], 'diff_x')
    diff_y = get_feature(df['diff_y'], 'diff_y')
    dist = pd.concat([dist_target, dist_x_target, dist_y_target, diff, diff_x, diff_y], axis=1)
    return dist


def get_time_feature(df):
    t = get_feature(df['t'], 't')
    t_diff = get_feature(df['diff_t'], 'diff_t')
    t = pd.concat([t, t_diff], axis=1)
    return t


def get_v_feature(df):
    v_x = get_feature(df['v_x'], 'v_x')
    v_y = get_feature(df['v_y'], 'v_y')
    v = get_feature(df['v'], 'v')
    v_diff_x = get_feature(df['diff_v_x'], 'diff_v_x')
    v_diff_y = get_feature(df['diff_v_y'], 'diff_v_y')
    v_diff = get_feature(df['diff_v'], 'diff_v')
    v = pd.concat([v_x, v_y, v, v_diff_x, v_diff_y, v_diff], axis=1)
    return v


def get_a_feature(df):
    a_x = get_feature(df['a_x'], 'a_x')
    a_y = get_feature(df['a_y'], 'a_y')
    a = get_feature(df['a'], 'a')
    a = pd.concat([a_x, a_y, a], axis=1)
    # Save the acceleration feature names for later reference
    with open('a_feature.json', 'w', encoding='utf-8') as f:
        json.dump(list(a.columns), f, ensure_ascii=False)
    return a


def get_other_feature(data):
    dfGroup = pd.DataFrame()
    dfGroup['point_count'] = data['x'].apply(lambda x: len(x))
    # Direction reversals: the smaller of the positive-step and negative-step counts
    dfGroup['x_back_num'] = data['diff_x'].apply(
        lambda x: min((np.array(x) > 0).sum(), (np.array(x) < 0).sum()))
    dfGroup['y_back_num'] = data['diff_y'].apply(
        lambda x: min((np.array(x) > 0).sum(), (np.array(x) < 0).sum()))
    # Counts of zero-length steps
    dfGroup['x_equal_0'] = data['diff_x'].apply(lambda x: (np.array(x) == 0).sum())
    dfGroup['y_equal_0'] = data['diff_y'].apply(lambda x: (np.array(x) == 0).sum())
    dfGroup['equal_0'] = data['dist'].apply(lambda x: (np.array(x) == 0).sum())
    return dfGroup


def make_df(df):
    df = data_process(df)
    df = data_diff(df, ['x', 'y', 't'])
    df = get_dist(df)
    df = get_v(df)
    df = data_diff(df, ['v', 'v_x', 'v_y'])
    df = get_a(df)
    point = get_point_feature(df[['x', 'y', 'target_x', 'target_y']])
    dist = get_dist_feature(df[['diff_x', 'diff_y', 'dist_target', 'dist',
                                'dist_x_target', 'dist_y_target']])
    t = get_time_feature(df[['t', 'diff_t']])
    v = get_v_feature(df[['v', 'v_x', 'v_y', 'diff_v', 'diff_v_x', 'diff_v_y']])
    a = get_a_feature(df[['a', 'a_x', 'a_y']])
    other = get_other_feature(df)
    df1 = pd.concat([point, dist, t, v, a, other], axis=1)
    return df1.fillna(0)


def save_df(df, name):
    global path, id_data, label, train_len, test_len
    df['id'] = id_data
    # .loc slicing on the default RangeIndex is end-inclusive
    train = df.loc[:train_len - 1, :]
    train['label'] = label
    test = df.loc[train_len:train_len + test_len - 1, :]
    testB = df.loc[train_len + test_len:, :]
    train.to_csv(os.path.join(path, name + "train.csv"), index=None)
    test.to_csv(os.path.join(path, name + "test.csv"), index=None)
    testB.to_csv(os.path.join(path, name + "testB.csv"), index=None)


def input_df():
    # Globals must be declared before they are assigned
    global path, id_data, label, train_len, test_len
    # Set paths
    path = r'G:\比賽分享\data'
    train_path = os.path.join(path, 'dsjtzs_txfz_training.txt')
    test_path = os.path.join(path, 'dsjtzs_txfz_test1.txt')
    testB_path = os.path.join(path, 'dsjtzs_txfz_testB.txt')
    # Load data; remove the .loc[:100] slices for a real run
    train = pd.read_csv(train_path, sep=' ', names=['id', 'point', 'target', 'label']).loc[:100]
    test = pd.read_csv(test_path, sep=' ', names=['id', 'point', 'target']).loc[:100]
    testB = pd.read_csv(testB_path, sep=' ', names=['id', 'point', 'target']).loc[:100]
    # Merge the datasets
    label = train['label'].copy()
    train.drop('label', axis=1, inplace=True)
    df = pd.concat([train, test, testB], ignore_index=True)
    id_data = df['id'].copy()
    df.drop('id', axis=1, inplace=True)
    train_len = len(train)
    test_len = len(test)
    return df


if __name__ == '__main__':
    df = input_df()
    df = make_df(df)
    save_df(df, 'all')

This is the most basic and also the most important stage. After it, feature selection is needed to weed out the useless features: LDA and PCA can be used for feature extraction, then filter methods and wrapper methods for feature selection. Related reading (a small sketch follows the links):
http://www.dataivy.cn/blog/%E7%BA%BF%E6%80%A7%E5%88%A4%E5%88%AB%E5%88%86%E6%9E%90linear-discriminant-analysis_lda/
1.13. Feature selection — scikit-learn 0.17.1 documentation: http://scikit-learn.org/0.17/modules/feature_selection.html#rfe
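As a rough illustration (and not the code actually used in the competition), here is a minimal scikit-learn sketch of both routes; the file name alltrain.csv, the feature counts, and the estimators are assumptions of mine:

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression

# Assumed to be the file written by save_df(df, 'all') above.
train = pd.read_csv('alltrain.csv')
y = train.pop('label')
X = train.drop('id', axis=1)

# Filter: score each feature independently (ANOVA F-test), keep the top 50.
X_filter = SelectKBest(f_classif, k=50).fit_transform(X, y)

# Wrapper: recursive feature elimination around a simple estimator.
X_wrapper = RFE(LogisticRegression(), n_features_to_select=50).fit_transform(X, y)

# Extraction: PCA keeps the directions of maximal variance;
# LDA keeps directions of maximal class separation (2 classes -> 1 component).
X_pca = PCA(n_components=20).fit_transform(X)
X_lda = LinearDiscriminantAnalysis(n_components=1).fit_transform(X, y)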
A description of PCA and the filter methods may (possibly) come in a later update; depends on my mood~
The two steps above account for roughly half the work of the competition; what remains is parameter tuning, model ensembling, and handling overfitting.
Parameter tuning is a dark art I have yet to master; at most I brute-force it with grid search, so I will skip that part for now.
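Still, for completeness, here is a minimal grid-search sketch; the estimator and the parameter grid are placeholders I picked, not settings from the competition:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# X, y: the feature matrix and labels from the feature-selection step.
param_grid = {'n_estimators': [100, 300, 500], 'max_depth': [5, 10, None]}
search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid,
                      scoring='f1', cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)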
Model ensembling means fitting the data with a battery of heavy hitters such as xgboost, which the previous post covered in detail.
Below are a few ways to ensemble models by hand:
1. Voting (VOTE)
Several models each make a prediction and the results are fused together:
for a classification problem this amounts to taking the mode;
for a regression problem, taking the mean.
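A hand-rolled sketch of both cases, assuming models is a list of already-fitted models and X_test holds the prepared test features:

import numpy as np
from scipy import stats

# Stack each model's predictions into a (n_models, n_samples) array.
preds = np.array([m.predict(X_test) for m in models])

# Classification: majority vote, i.e. the per-sample mode across models.
vote_pred = stats.mode(preds, axis=0).mode.ravel()

# Regression: the per-sample mean across models.
mean_pred = preds.mean(axis=0)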
2. Stacking
Compared with Blending, Stacking makes better use of the training data. Take 5-fold stacking as an example; its basic principle (illustrated with a diagram in the original post) is as follows:
The whole process is much like cross-validation. First split the training data into 5 folds, then run 5 iterations: in each one, train every base model on 4 of the folds and predict on the remaining hold-out fold, while also saving each model's predictions on the test data. Over the 5 iterations, each base model therefore predicts exactly once on every training row and 5 times on the whole test set. This yields a matrix of shape (#training rows × #base models), which becomes the training data for the second-level model. Once the second-level model is trained, take the saved base-model predictions on the test data (each base model was trained 5 times and predicted on the full test set 5 times, so average those 5 predictions to get a matrix with the same shape as the second-level training data), feed it to the second-level model, and its prediction is the final output.
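A compact sketch of that procedure; the base models, the second-level model, and the use of predict_proba as the stacked prediction are all illustrative choices of mine:

import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

base_models = [RandomForestClassifier(random_state=0),
               GradientBoostingClassifier(random_state=0)]

def stack(X_train, y_train, X_test, n_folds=5):
    # X_train, y_train, X_test are numpy arrays.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    S_train = np.zeros((len(X_train), len(base_models)))  # (#training rows, #base models)
    S_test = np.zeros((len(X_test), len(base_models)))
    for j, model in enumerate(base_models):
        fold_test_preds = np.zeros((len(X_test), n_folds))
        for i, (tr_idx, ho_idx) in enumerate(kf.split(X_train)):
            m = clone(model).fit(X_train[tr_idx], y_train[tr_idx])
            # Each iteration fills the hold-out rows of the level-2 training matrix...
            S_train[ho_idx, j] = m.predict_proba(X_train[ho_idx])[:, 1]
            # ...and predicts the full test set.
            fold_test_preds[:, i] = m.predict_proba(X_test)[:, 1]
        # Average this base model's n_folds test-set predictions.
        S_test[:, j] = fold_test_preds.mean(axis=1)
    # The second-level model is trained on the out-of-fold predictions.
    meta = LogisticRegression().fit(S_train, y_train)
    return meta.predict_proba(S_test)[:, 1]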
Next comes dealing with overfitting. Plotting the learning curve (sklearn's learning_curve) and similar score curves shows how badly the model overfits, and we take different countermeasures depending on what the curves reveal (a small sketch follows the link):
http://blog.csdn.net/heyongluoyao8/article/details/49429629
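A minimal learning-curve sketch with scikit-learn (the estimator and scoring metric are placeholders):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve

# Training vs. cross-validation score as the training set grows;
# a gap that stays wide as data is added points to overfitting.
sizes, train_scores, cv_scores = learning_curve(
    RandomForestClassifier(random_state=0), X, y,
    train_sizes=np.linspace(0.1, 1.0, 5), cv=5, scoring='f1')

plt.plot(sizes, train_scores.mean(axis=1), 'o-', label='training score')
plt.plot(sizes, cv_scores.mean(axis=1), 'o-', label='cross-validation score')
plt.xlabel('training examples')
plt.ylabel('F1')
plt.legend()
plt.show()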
And that is roughly the whole process.
coding the new world
may we spur each other on