【算法竞赛学习】资金流入流出预测-挑战Baseline_特征工程
生活随笔
收集整理的這篇文章主要介紹了
【算法竞赛学习】资金流入流出预测-挑战Baseline_特征工程
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
賽題簡介
螞蟻金服擁有上億會員,並且業務場景中每天都涉及大量的資金流入和流出,面對如此龐大的用戶群,資金管理壓力會非常大。在既保證資金流動性風險最小,又滿足日常業務運轉的情況下,精準地預測資金的流入流出情況變得尤為重要。此屆大賽以《資金流入流出預測》為題,期望參賽者能夠通過對例如餘額寶用戶的申購贖回數據的把握,精準預測未來每日的資金流入流出情況。對貨幣基金而言,資金流入意味著申購行為,資金流出則為贖回行為。
賽題與數據
競賽中使用的數據主要包含四個部分,分別為用戶基本信息數據、用戶申購贖回數據、收益率表和銀行間拆借利率表。數據下載地址:https://tianchi.aliyun.com/competition/entrance/231573/information
特征工程
import datetime
import warnings
from typing import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Optional packages that the original notebook left disabled:
# import shap
# import eli5
# from mvtpy import mvtest
# from wordcloud import WordCloud
# from eli5.sklearn import PermutationImportance

warnings.filterwarnings('ignore')

# Global column groups reused by the rest of the script.
labels = ['total_purchase_amt', 'total_redeem_amt']
date_indexs = ['week', 'year', 'month', 'weekday', 'day']


# Load the balance data
def load_data(path: str = 'user_balance_table.csv') -> pd.DataFrame:
    """Read the CSV at *path* and return it with a fresh 0..n-1 index."""
    data_balance = pd.read_csv(path)
    return data_balance.reset_index(drop=True)


# Add timestamp columns to the data set
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date') -> pd.DataFrame:
    """Return a copy of *data* with calendar columns derived from *time_index*.

    Adds ``date`` (parsed with the ``%Y%m%d`` format) plus ``day``, ``month``,
    ``year``, ``week`` (ISO week of the year) and ``weekday`` (Monday == 0).
    The input frame is not modified.
    """
    data_balance = data.copy()
    data_balance['date'] = pd.to_datetime(data_balance[time_index], format="%Y%m%d")
    date_accessor = data_balance['date'].dt
    data_balance['day'] = date_accessor.day
    data_balance['month'] = date_accessor.month
    data_balance['year'] = date_accessor.year
    # `Series.dt.week` was removed in pandas 2.0; isocalendar().week yields the
    # same ISO week number.  The cast assumes there are no missing dates.
    data_balance['week'] = date_accessor.isocalendar().week.astype('int64')
    data_balance['weekday'] = date_accessor.weekday
    return data_balance.reset_index(drop=True)


# Total amount per day
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31') -> pd.DataFrame:
    """Sum purchase and redeem amounts per day, keeping days on or after *date*.

    Returns a frame with the columns ``date``, ``total_purchase_amt`` and
    ``total_redeem_amt`` and a fresh 0..n-1 index.
    """
    # Selecting several columns from a groupby needs a list; the tuple form
    # used originally is no longer accepted by current pandas.
    daily_total = data.groupby(['date'])[['total_purchase_amt', 'total_redeem_amt']].sum()
    daily_total = daily_total.reset_index()
    return daily_total[daily_total['date'] >= date].reset_index(drop=True)


# Generate the test data
def generate_test_data(data: pd.DataFrame) -> pd.DataFrame:
    """Append one placeholder row per day from 2014-09-01 to 2014-10-14.

    The placeholder rows hold the date in the first column and NaN in the
    other two, so *data* must have exactly three columns (date, purchase,
    redeem); their names are reused for the appended rows.
    """
    test_days = pd.date_range('2014-09-01', '2014-10-14', freq='D')
    testdata = pd.DataFrame({0: test_days, 1: np.nan, 2: np.nan})
    testdata.columns = data.columns
    total_balance = pd.concat([data.copy(), testdata], axis=0)
    return total_balance.reset_index(drop=True)


# Load user's information
def load_user_information(path: str = 'user_profile_table.csv') -> pd.DataFrame:
    """Read the user profile table from *path*."""
    return pd.read_csv(path)


# Read the data sets
balance_data = load_data('Data/user_balance_table.csv')
balance_data = add_timestamp(balance_data, time_index='report_date')
total_balance = get_total_balance(balance_data)
total_balance = generate_test_data(total_balance)
total_balance = add_timestamp(total_balance, 'date')
user_information = load_user_information('Data/user_profile_table.csv')
balance_data  # notebook-style display of the loaded frame; a no-op in a script

# ---- Feature extraction ----
基于日期的靜態特征
# Build the set of public holidays
def get_holiday_set() -> Set[datetime.date]:
    """Return every public-holiday date (2013 and 2014) used by the features."""
    # (first day, number of days) for each holiday period.
    holiday_periods = [
        (datetime.date(2014, 4, 5), 3),    # Qingming Festival
        (datetime.date(2014, 5, 1), 3),    # Labour Day
        (datetime.date(2014, 5, 31), 3),   # Dragon Boat Festival
        (datetime.date(2014, 9, 6), 3),    # Mid-Autumn Festival
        (datetime.date(2014, 10, 1), 7),   # National Day
        (datetime.date(2013, 9, 19), 3),   # Mid-Autumn Festival (2013)
        (datetime.date(2013, 10, 1), 7),   # National Day (2013)
    ]
    return {
        first_day + datetime.timedelta(days=offset)
        for first_day, length in holiday_periods
        for offset in range(length)
    }


# Extract all "is_*" features
def extract_is_feature(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* with binary (0/1) calendar flags added.

    *data* must contain ``date``, ``weekday``, ``day`` and ``week`` columns and
    hold one row per consecutive day, because several flags look at the
    previous or next row.  The result has a fresh 0..n-1 index.

    Differences from the original loop-based version:

    * ``is_lastday_of_holiday`` no longer reads past the last row; a holiday
      on the final row is left unflagged because the next day is unknown.
    * ``is_worked_yestday`` is now also filled for the second row (the old
      code skipped it with ``index <= 1``).
    * ``is_work_on_sunday`` also covers the final row.
    """
    total_balance = data.copy().reset_index(drop=True)
    # Compare plain date objects so the match does not depend on pandas
    # casting date objects to datetime64 inside isin().
    calendar_day = pd.to_datetime(total_balance['date']).dt.date

    def as_flag(mask: pd.Series) -> pd.Series:
        return mask.astype('int64')

    # Weekend (Saturday or Sunday)?
    total_balance['is_weekend'] = as_flag(total_balance['weekday'].isin((5, 6)))

    # Public holiday?
    total_balance['is_holiday'] = as_flag(calendar_day.isin(get_holiday_set()))
    holiday = total_balance['is_holiday']

    # First day of a holiday period: holiday today, not a holiday yesterday.
    total_balance['is_firstday_of_holiday'] = as_flag(
        (holiday == 1) & (holiday.shift(1, fill_value=0) == 0)
    )

    # Last day of a holiday period: holiday today, not a holiday tomorrow.
    # fill_value=1 keeps a holiday on the final row unflagged.
    total_balance['is_lastday_of_holiday'] = as_flag(
        (holiday == 1) & (holiday.shift(-1, fill_value=1) == 0)
    )

    # First day after a holiday period.
    ended_yesterday = total_balance['is_lastday_of_holiday'].shift(1, fill_value=0)
    total_balance['is_firstday_of_work'] = as_flag((ended_yesterday == 1) & (holiday == 0))

    # Working day: neither holiday nor weekend, plus the make-up work days.
    total_balance['is_work'] = as_flag(
        ~((holiday == 1) | (total_balance['is_weekend'] == 1))
    )
    special_work_day_set = {datetime.date(2014, 5, 4), datetime.date(2014, 9, 28)}
    total_balance.loc[calendar_day.isin(special_work_day_set), 'is_work'] = 1
    work = total_balance['is_work']

    # Day off today and a working day tomorrow (last row is never flagged).
    total_balance['is_gonna_work_tomorrow'] = as_flag(
        (work == 0) & (work.shift(-1, fill_value=0) == 1)
    )

    # Was yesterday a working day?  (first row has no yesterday)
    total_balance['is_worked_yestday'] = as_flag(work.shift(1, fill_value=0) == 1)

    # Day right before a holiday period (last row is never flagged).
    total_balance['is_lastday_of_workday'] = as_flag(
        (holiday == 0) & (holiday.shift(-1, fill_value=0) == 1)
    )

    # Sunday that is a working day.
    total_balance['is_work_on_sunday'] = as_flag((total_balance['weekday'] == 6) & (work == 1))

    day_of_month = total_balance['day']
    # First / second day of the month.
    total_balance['is_firstday_of_month'] = as_flag(day_of_month == 1)
    total_balance['is_secday_of_month'] = as_flag(day_of_month == 2)
    # Early (1-10), middle (11-20) and late (21+) part of the month.
    total_balance['is_premonth'] = as_flag(day_of_month <= 10)
    total_balance['is_midmonth'] = as_flag((10 < day_of_month) & (day_of_month <= 20))
    total_balance['is_tailmonth'] = as_flag(20 < day_of_month)

    # "Week of the month" flags, computed as week-of-year modulo 4.
    week_mod = total_balance['week'] % 4
    total_balance['is_first_week'] = as_flag(week_mod == 1)
    total_balance['is_second_week'] = as_flag(week_mod == 2)
    total_balance['is_third_week'] = as_flag(week_mod == 3)
    total_balance['is_fourth_week'] = as_flag(week_mod == 0)

    return total_balance.reset_index(drop=True)


# Add the is_* features to the data set
total_balance = extract_is_feature(total_balance)


# One-hot encode a categorical column (the weekday by default)
def encode_data(data: pd.DataFrame, feature_name: str = 'weekday', encoder=None) -> pd.DataFrame:
    """Return *data* with one-hot columns for *feature_name* appended.

    The new columns are named ``<feature_name>_onehot_<k>`` where ``k`` is the
    position of the encoded category; the source column is kept.

    ``encoder`` defaults to a new ``OneHotEncoder()`` per call (the previous
    default was a single instance created at definition time and shared by
    every call).  Any object with a compatible ``fit_transform`` is accepted.
    """
    if encoder is None:
        encoder = OneHotEncoder()
    total_balance = data.copy()
    encoded = encoder.fit_transform(np.array(total_balance[feature_name]).reshape(-1, 1))
    if hasattr(encoded, 'toarray'):
        # Sparse matrix output -> dense array.
        encoded = encoded.toarray()
    week_feature = pd.DataFrame(
        encoded,
        columns=[feature_name + '_onehot_' + str(x) for x in range(encoded.shape[1])],
        # Share the index so the column-wise concat lines rows up even when
        # the input index is not 0..n-1.
        index=total_balance.index,
    )
    featureWeekday = pd.concat([total_balance, week_feature], axis=1)
    return featureWeekday


# Add the one-hot weekday features to the data set
total_balance = encode_data(total_balance)

# Build the feature frame (everything except the raw calendar columns)
feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]

# ---- Label distribution analysis of the is_* features ----
# Draw boxen plots
def draw_boxplot(data: pd.DataFrame) -> None:
    """Draw one boxen plot of ``total_purchase_amt`` per feature column.

    Every column of *data* that is not a label, a raw calendar column or
    ``date`` gets its own panel in a 4-column grid.  The number of rows grows
    with the number of features (the original fixed 7x4 grid raised an
    IndexError beyond 28 features); with 25 to 28 features the layout and
    figure size are identical to the original.
    """
    excluded = date_indexs + labels + ['date']
    plot_columns = [x for x in data.columns if x not in excluded]
    n_cols = 4
    n_rows = max(1, -(-len(plot_columns) // n_cols))  # ceiling division
    f, axes = plt.subplots(n_rows, n_cols, figsize=(18, 24 * n_rows / 7), squeeze=False)
    for count, column in enumerate(plot_columns):
        sns.boxenplot(x=column, y='total_purchase_amt', data=data,
                      ax=axes[count // n_cols][count % n_cols])


draw_boxplot(feature)

## Drop the features that look weak
purchase_feature_seems_useless = [
    # Too few samples to be useful for modelling; if this is known to be a
    # real rule it can still be used to post-correct the predictions.
    'is_work_on_sunday',
    # The medians barely differ.
    'is_first_week',
]

# ---- Correlation analysis of the features ----
# Draw the correlation heat map
def draw_correlation_heatmap(data: pd.DataFrame, way: str = 'pearson') -> None:
    """Plot a heat map of the pairwise *way* correlation between the columns.

    ``total_redeem_amt`` and ``date`` are left out of the matrix.  *way* is
    passed to ``DataFrame.corr`` as the correlation method.
    """
    frame = data.copy()
    kept_columns = [
        column for column in frame.columns
        if column not in ['total_redeem_amt', 'date']
    ]
    correlation = frame[kept_columns].corr(way)

    plt.figure(figsize=(20, 10))
    plt.title(f'The {way} coleration between total purchase and each feature')
    sns.heatmap(correlation, linecolor='white', linewidths=0.1, cmap="RdBu")


draw_correlation_heatmap(feature, 'spearman')

# Collect the features whose absolute Spearman correlation with the purchase
# amount is below 0.1.
temp = (
    feature[[x for x in feature.columns if x not in ['total_redeem_amt', 'date']]]
    .corr('spearman')['total_purchase_amt']
    .abs()
)
feature_low_correlation = list(set(temp[temp < 0.1].index))

# ---- Distance-based features ----
距離特征提取
def _days_until_next(flags: pd.Series, target_value, other_value) -> np.ndarray:
    """Count, per row, the rows remaining until the next ``target_value`` row.

    Only rows equal to ``other_value`` that sit in an unbroken run directly
    before a ``target_value`` row get a count (1 for the row just before it);
    every other row is 0.
    """
    values = flags.to_numpy()
    result = np.zeros(len(values), dtype='int64')
    distance = 0
    counting = False
    # Walk backwards so the nearest following target is always known.
    for pos in range(len(values) - 1, -1, -1):
        if values[pos] == target_value:
            counting = True
            distance = 0
        elif counting and values[pos] == other_value:
            distance += 1
            result[pos] = distance
        else:
            counting = False
    return result


def _days_since_reset(flags: pd.Series, counted_value) -> np.ndarray:
    """Running 1-based position inside each run of ``counted_value`` rows.

    The counter restarts after every row that differs from ``counted_value``;
    those rows are 0.  A run at the very start counts from the first row.
    """
    values = flags.to_numpy()
    result = np.zeros(len(values), dtype='int64')
    step = 0
    for pos in range(len(values)):
        step += 1
        if values[pos] == counted_value:
            result[pos] = step
        else:
            step = 0
    return result


# Extract the distance features
def extract_distance_feature(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* with "distance in days" features added.

    Requires the ``is_work``, ``is_holiday``, ``is_lastday_of_holiday``,
    ``day`` and ``weekday`` columns, with rows ordered one per consecutive day.
    The values match the original cell-by-cell loops; the sweeps now work on
    row positions, so the index no longer has to be 0..n-1.
    """
    total_balance = data.copy()
    work = total_balance['is_work']
    holiday = total_balance['is_holiday']
    holiday_end = total_balance['is_lastday_of_holiday']

    # Working days: days left until the next day off / days since the last one.
    total_balance['dis_to_nowork'] = _days_until_next(work, 0, 1)
    total_balance['dis_from_nowork'] = _days_since_reset(work, 1)

    # Days off: days left until the next working day / days since the last one.
    total_balance['dis_to_work'] = _days_until_next(work, 1, 0)
    total_balance['dis_from_work'] = _days_since_reset(work, 0)

    # Non-holidays: days left until the next holiday / days since the last one.
    total_balance['dis_to_holiday'] = _days_until_next(holiday, 1, 0)
    total_balance['dis_from_holiday'] = _days_since_reset(holiday, 0)

    # Same pair relative to the last day of a holiday period.
    total_balance['dis_to_holiendday'] = _days_until_next(holiday_end, 1, 0)
    total_balance['dis_from_holiendday'] = _days_since_reset(holiday_end, 0)

    # Day of the month (distance from the start of the month).
    total_balance['dis_from_startofmonth'] = np.abs(total_balance['day'])
    # Distance from the 15th of the month.
    total_balance['dis_from_middleofmonth'] = np.abs(total_balance['day'] - 15)
    # Distance from the middle of the week (weekday 3).
    total_balance['dis_from_middleofweek'] = np.abs(total_balance['weekday'] - 3)
    # Distance from Sunday (weekday 6).
    total_balance['dis_from_endofweek'] = np.abs(total_balance['weekday'] - 6)

    return total_balance


# Add the distance features to the data set
total_balance = extract_distance_feature(total_balance)

# ---- Distance feature analysis ----
# Collect the names of the distance features
feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]
dis_feature_indexs = [
    x for x in feature.columns
    if x not in date_indexs + labels + ['date'] and 'dis' in x
]


# Draw point plots
def draw_point_feature(data: pd.DataFrame) -> None:
    """Draw one point plot of ``total_purchase_amt`` per feature column.

    *data* must contain ``total_purchase_amt`` plus the features to plot.
    Panels are laid out three per row; the grid is always two-dimensional and
    sized from the number of plotted features, so any column count works.

    ``kind="point"`` was dropped because it is a ``catplot`` argument, and the
    marker / line style are passed as scalars because no ``hue`` is used.
    """
    feature = data.copy()
    plot_columns = [x for x in feature.columns if x not in date_indexs + labels + ['date']]
    n_rows = max(1, -(-len(plot_columns) // 3))  # ceiling division
    f, axes = plt.subplots(n_rows, 3, figsize=(30, n_rows * 4), squeeze=False)
    for count, column in enumerate(plot_columns):
        sns.pointplot(x=column, y="total_purchase_amt", markers="^", linestyles="-",
                      data=feature, ax=axes[count // 3][count % 3])


draw_point_feature(feature[['total_purchase_amt'] + dis_feature_indexs])


# Collapse distances that are too far away
def dis_change(x):
    """Map every distance greater than 5 to the single bucket value 10."""
    return 10 if x > 5 else x


# Apply the bucketing to the holiday- and month-related distances
dis_holiday_feature = [x for x in total_balance.columns if 'dis' in x and 'holi' in x]
dis_month_feature = [x for x in total_balance.columns if 'dis' in x and 'month' in x]
# Element-wise mapping per column; DataFrame.applymap is deprecated.
total_balance[dis_holiday_feature] = total_balance[dis_holiday_feature].apply(
    lambda column: column.map(dis_change)
)
total_balance[dis_month_feature] = total_balance[dis_month_feature].apply(
    lambda column: column.map(dis_change)
)
feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]

# Point plots after the bucketing
draw_point_feature(feature[['total_purchase_amt'] + dis_feature_indexs])

## Drop the features that look of little use
purchase_feature_seems_useless += [
    # Even after bucketing the variance is too large to trust; no clear pattern.
    'dis_to_holiday',
    # Variance too large to trust.
    'dis_from_startofmonth',
    # Variance too large to trust.
    'dis_from_middleofmonth',
]

# Correlation heat map
draw_correlation_heatmap(feature[['total_purchase_amt'] + dis_feature_indexs])

# Collect the distance features with a weak correlation (absolute value < 0.1)
temp = np.abs(
    feature[[x for x in feature.columns if 'dis' in x or x in ['total_purchase_amt']]]
    .corr()['total_purchase_amt']
)
feature_low_correlation += list(set(temp[temp < 0.1].index))

# ---- Peak / valley features ----
提取波峰特征
# Inspect the purchase peaks month by month
fig = plt.figure(figsize=(15, 15))
for i in range(6, 10):
    # Select month ``i`` of 2014.  The original window was hard-coded to
    # August, so every panel showed the same data under a different title.
    # NOTE(review): the grid has 5 rows and the notes below cover April to
    # August, so the intended range may have been 4..8 - confirm.
    month_start = datetime.date(2014, i, 1)
    month_end = datetime.date(2014, i + 1, 1)
    total_balance_2 = total_balance[
        (total_balance['date'].dt.date >= month_start)
        & (total_balance['date'].dt.date < month_end)
    ]
    if total_balance_2['total_purchase_amt'].notna().sum() == 0:
        # Nothing observed in this month (e.g. the NaN test rows).
        continue
    plt.subplot(5, 1, i - 5)
    sns.pointplot(x=total_balance_2['day'], y=total_balance_2['total_purchase_amt'])
    plt.legend().set_title('Month:' + str(i))

# Purchase - notes taken while reading the plots (two dates per row, as in
# the original notes)
# 0401 (Tue)    0406 (Sun, 2nd day of the Qingming Festival)
# 0410 (Thu, similar to Tue)    0412 (Sat, similar to Sun)
# 0415 (Tue)    0420 (Sun)
# 0424 (Thu, roughly the same level as Tue)    0427 (Sun)
# 0429 (Tue)    0502 (Fri, 2nd day of Labour Day)
# 0507 (Wed, differs a lot from Tue, possibly a Labour Day effect)    0511 (Sun)
# 0512 (Mon, some gap from Tue)    0518 (Sun)
# 0519 (Tue)    0525 (Sun)
# 0526 (Mon, some gap from Tue)    0531 (Sat, end of month)
# 0605 (Thu, differs a lot from Tue, possibly a Dragon Boat Festival effect)    0607 (Sat, possibly a Dragon Boat Festival effect)
# 0609 (Mon, similar to Tue)    0615 (Sun)
# 0616 (Mon, differs a lot from Tue)    0622 (Sun)
# 0626 (Thu, not much different from Tue)    0629 (Sun)
# 0701 (Tue)    0705 (Sat, not far from Sun)
# 0707 (Mon, a gap from Tue)    0713 (Sun)
# 0716 (Wed, some gap from Tue)    0720 (Sun)
# 0721 (Mon, a clear gap from Tue)    0726 (Sat, similar to Sun)
# 0728 (Mon, a clear gap from Tue)    0803 (Sun)
# 0805 (Tue)    0809 (Sat, a large gap from Sun)
# 0811 (Mon, a large gap from Tue)    0817 (Sun)
# 0818 (Mon, not far from Tue)    0824 (Sun)


# Encode the peak / valley days
def extract_peak_feature(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* with the distance to the weekly peak and valley.

    ``dis_from_purchase_peak`` is ``|weekday - 1|`` (Tuesday, taken as the
    purchase peak) and ``dis_from_purchase_valley`` is ``|weekday - 6|``
    (Sunday, taken as the valley; same values as ``dis_from_endofweek``).
    """
    total_balance = data.copy()
    total_balance['dis_from_purchase_peak'] = np.abs(total_balance['weekday'] - 1)
    total_balance['dis_from_purchase_valley'] = np.abs(total_balance['weekday'] - 6)
    return total_balance


# Add the peak features
total_balance = extract_peak_feature(total_balance)
feature = total_balance[[x for x in total_balance.columns if x not in date_indexs]]
draw_point_feature(feature[['total_purchase_amt'] + ['dis_from_purchase_peak', 'dis_from_purchase_valley']])

# ---- Peak feature analysis ----
# Point plots of the purchase amount against the two peak / valley distances
draw_point_feature(
    feature[['total_purchase_amt', 'dis_from_purchase_peak', 'dis_from_purchase_valley']]
)

# ---- Peak feature correlation analysis ----
# Absolute Pearson correlation of the peak / valley features with the
# purchase amount.
temp = (
    feature.loc[:, [
        x for x in feature.columns
        if ('peak' in x) or ('valley' in x) or (x in ['total_purchase_amt'])
    ]]
    .corr()['total_purchase_amt']
    .abs()
)

# ---- Adding the periodic factors as features ----
提取周期因子
def generate_rate(df, month_index):
    """Compute the weekday and day-of-month periodic factors for one month.

    Only rows dated from 2014-03-01 up to (not including) the first day of
    ``month_index`` in 2014 are used, so the factors for a month never see
    that month's own data.

    Args:
        df: frame with ``date`` (datetime64), ``total_purchase_amt`` and
            ``total_redeem_amt`` columns.
        month_index: month of 2014 (1-12) the factors are generated for.

    Returns:
        ``(weekday_rate, day_rate)``.  ``weekday_rate`` has the columns
        ``weekday``, ``purchase_weekdayrate``, ``redeem_weekdayrate`` and
        ``month``; ``day_rate`` has ``day``, ``purchase_dayrate``,
        ``redeem_dayrate`` and ``month`` and is sorted by ``day``.
    """
    import calendar  # local import: only needed for the month length below

    total_balance = df.copy()
    in_window = (
        (total_balance['date'] >= pd.Timestamp(2014, 3, 1))
        & (total_balance['date'] < pd.Timestamp(2014, month_index, 1))
    )
    # .copy() so the column assignments below do not write onto a slice.
    pure_balance = total_balance.loc[
        in_window, ['date', 'total_purchase_amt', 'total_redeem_amt']
    ].copy()
    pure_balance['weekday'] = pure_balance['date'].dt.weekday
    pure_balance['day'] = pure_balance['date'].dt.day
    # `Series.dt.week` was removed in pandas 2.0.
    pure_balance['week'] = pure_balance['date'].dt.isocalendar().week.astype('int64')
    pure_balance['month'] = pure_balance['date'].dt.month

    # Mean amount per weekday, relative to the overall mean.
    weekday_rate = pure_balance[['weekday'] + labels].groupby('weekday', as_index=False).mean()
    weekday_rate = weekday_rate.rename(columns={name: name + '_weekdaymean' for name in labels})
    weekday_rate['total_purchase_amt_weekdaymean'] /= np.mean(pure_balance['total_purchase_amt'])
    weekday_rate['total_redeem_amt_weekdaymean'] /= np.mean(pure_balance['total_redeem_amt'])
    pure_balance = pd.merge(pure_balance, weekday_rate, on='weekday', how='left')

    # How often each day of the month fell on each weekday, weighted by the
    # weekday factor.
    weekday_count = pure_balance[['day', 'weekday', 'date']].groupby(
        ['day', 'weekday'], as_index=False
    ).count()
    weekday_count = pd.merge(weekday_count, weekday_rate, on='weekday')
    # NOTE(review): with a single month in the window this divisor is 0 and
    # the factors become inf (and the day rates 0) - confirm the "- 1".
    month_divisor = len(set(pure_balance['month'])) - 1
    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / month_divisor
    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / month_divisor
    day_rate = weekday_count.drop(['weekday', 'date'], axis=1).groupby('day', as_index=False).sum()

    weekday_rate.columns = ['weekday', 'purchase_weekdayrate', 'redeem_weekdayrate']
    day_rate.columns = ['day', 'purchase_dayrate', 'redeem_dayrate']

    # Date of each day number inside the target month.  Day numbers the month
    # does not have keep the 1st as placeholder; the original hard-coded
    # month list (2, 4, 6, 9) failed for February and November.
    days_in_month = calendar.monthrange(2014, month_index)[1]
    day_rate['date'] = pd.to_datetime([
        datetime.datetime(2014, month_index, int(day) if day <= days_in_month else 1)
        for day in day_rate['day']
    ])
    day_rate['weekday'] = day_rate['date'].dt.weekday
    day_rate = pd.merge(day_rate, weekday_rate, on='weekday')
    day_rate['purchase_dayrate'] = day_rate['purchase_weekdayrate'] / day_rate['purchase_dayrate']
    day_rate['redeem_dayrate'] = day_rate['redeem_weekdayrate'] / day_rate['redeem_dayrate']

    weekday_rate['month'] = month_index
    day_rate['month'] = month_index
    return weekday_rate, day_rate[['day', 'purchase_dayrate', 'redeem_dayrate', 'month']].sort_values('day')


# Generate the periodic factors and merge them into the data set
weekday_rate_list = []
day_rate_list = []
for i in range(3, 10):
    weekday_rate, day_rate = generate_rate(total_balance, i)
    weekday_rate_list.append(weekday_rate.reset_index(drop=True))
    day_rate_list.append(day_rate.reset_index(drop=True))

weekday_rate_list = pd.concat(weekday_rate_list).reset_index(drop=True)
day_rate_list = pd.concat(day_rate_list).reset_index(drop=True)
total_balance = pd.merge(total_balance, weekday_rate_list, on=['weekday', 'month'], how='left')
total_balance = pd.merge(total_balance, day_rate_list, on=['day', 'month'], how='left')

# Fill the missing periodic factors with the column median
for i in [x for x in total_balance.columns if 'rate' in x and x not in labels + date_indexs]:
    total_balance[i] = total_balance[i].fillna(np.nanmedian(total_balance[i]))

# Correlation heat map
draw_correlation_heatmap(total_balance[['total_purchase_amt'] + [x for x in total_balance.columns if 'rate' in x and x not in labels + date_indexs]])

# ---- Correlation analysis of the periodic factors ----
# Correlation heat map of the purchase amount against every "rate" column
draw_correlation_heatmap(
    total_balance[
        ['total_purchase_amt']
        + [
            x for x in total_balance.columns
            if 'rate' in x and x not in labels + date_indexs
        ]
    ]
)

# Rebuild the feature frame without the raw calendar columns
feature = total_balance.drop(columns=date_indexs)

# ---- Adding dynamic time-series features ----
提取動態特征
## Extract the dynamic features
def get_amtfeature_with_time(data: pd.DataFrame) -> pd.DataFrame:
    """Add expanding statistics of past purchase amounts on the same weekday.

    For every row after the first four weeks (rows dated 2014-03-03 or later
    are considered), the median, mean, min, max, std and skew of
    ``total_purchase_amt`` over the same weekday in all *earlier* weeks are
    computed and merged back into *data* on ``(day, month)``.  Rows without a
    value keep NaN.

    Assumes the retained rows are one per consecutive day and start on a
    Monday, as in the original script, because weeks are counted by position.
    """
    df_tmp_ = data[labels + date_indexs + ['date']].copy()
    total_balance = data.copy()
    df_tmp_ = df_tmp_[df_tmp_['date'].dt.date >= datetime.date(2014, 3, 3)].copy()
    df_tmp_['weekday'] = df_tmp_['date'].dt.weekday + 1
    # `Series.dt.week` was removed in pandas 2.0; weeks are renumbered to
    # start at 1.
    iso_week = df_tmp_['date'].dt.isocalendar().week.astype('int64')
    df_tmp_['week'] = iso_week - iso_week.min() + 1
    df_tmp_['day'] = df_tmp_['date'].dt.day
    df_tmp_['month'] = df_tmp_['date'].dt.month
    df_tmp_ = df_tmp_.reset_index(drop=True)

    weekday_columns = ['weekday' + str(k) for k in range(1, 8)]
    weekdays = df_tmp_['weekday'].to_numpy()
    weeks = df_tmp_['week'].to_numpy()
    purchases = df_tmp_['total_purchase_amt'].to_numpy()

    # One row per week, one column per weekday.  A new row starts after each
    # Sunday (weekday 7).
    weekly_rows = {}
    count = 0
    for i in range(len(df_tmp_)):
        weekly_rows.setdefault(count, {})['weekday' + str(weekdays[i])] = purchases[i]
        if weekdays[i] == 7:
            count = count + 1
    df_purchase = pd.DataFrame.from_dict(weekly_rows, orient='index', columns=weekday_columns)
    df_purchase = df_purchase.astype(float)

    stat_names = ['median', 'mean', 'min', 'max', 'std', 'skew']
    colList = ['purchase_weekday_' + name for name in stat_names]
    for column in colList:
        df_tmp_[column] = np.nan

    # Skip the first four weeks so every statistic has some history.
    for i in range(4 * 7, len(df_tmp_)):
        # Label slice is inclusive: rows 0..week-2 are all the earlier weeks.
        history = df_purchase.loc[:weeks[i] - 2, 'weekday' + str(weekdays[i])]
        for name, column in zip(stat_names, colList):
            df_tmp_.loc[i, column] = getattr(history, name)()

    total_balance = pd.merge(total_balance, df_tmp_[colList + ['day', 'month']],
                             on=['day', 'month'], how='left')
    return total_balance


# Merge the dynamic features into the data set
total_balance = get_amtfeature_with_time(total_balance)

# Fill the missing dynamic features with the column median
for i in [x for x in total_balance.columns if '_weekday_' in x and x not in labels + date_indexs]:
    total_balance[i] = total_balance[i].fillna(np.nanmedian(total_balance[i]))

# Correlation heat map of the dynamic features
draw_correlation_heatmap(total_balance[['total_purchase_amt'] + ['purchase_weekday_median', 'purchase_weekday_mean', 'purchase_weekday_min', 'purchase_weekday_max', 'purchase_weekday_std', 'purchase_weekday_skew']])

# ---- Correlation analysis of the dynamic features ----
# Correlation heat map of the dynamic features
draw_correlation_heatmap(
    total_balance[
        [
            'total_purchase_amt',
            'purchase_weekday_median', 'purchase_weekday_mean',
            'purchase_weekday_min', 'purchase_weekday_max',
            'purchase_weekday_std', 'purchase_weekday_skew',
        ]
    ]
)

# Export the selected columns.
# NOTE(review): the two label columns are listed both through ``labels`` and
# explicitly near the end, so they are written twice - confirm whether the
# consumer of this file relies on that.
feature.loc[
    :,
    labels
    + [
        'dis_to_nowork', 'dis_to_work', 'dis_from_work',
        'purchase_weekdayrate', 'redeem_dayrate',
        'weekday_onehot_5', 'weekday_onehot_6',
        'dis_from_nowork', 'is_holiday',
        'weekday_onehot_1', 'weekday_onehot_2', 'weekday_onehot_0',
        'dis_from_middleofweek', 'dis_from_holiendday',
        'weekday_onehot_3', 'is_lastday_of_holiday', 'is_firstday_of_holiday',
        'weekday_onehot_4', 'is_worked_yestday',
        'is_second_week', 'is_third_week',
        'dis_from_startofmonth', 'dis_from_holiday',
        'total_purchase_amt', 'total_redeem_amt', 'date',
    ],
].to_csv('Data/0615_residual_purchase_origined.csv', index=False)

# ---- Feature elimination ----
剔除無法有效分割數據集的特征
def _plot_binary_feature_kde(feature: pd.DataFrame) -> None:
    """Plot the purchase-amount density for the 0 and 1 group of each flag.

    Columns that are labels, raw calendar columns, ``date`` or whose name
    contains ``amt``, ``dis`` or ``rate`` are skipped, as are features that
    have no row equal to 0.  One panel per remaining feature, four per row.
    """
    candidates = [
        x for x in feature.columns
        if x not in labels + date_indexs + ['date']
        and 'amt' not in x and 'dis' not in x and 'rate' not in x
    ]
    # plt.subplot needs an integer row count; the original passed a float.
    n_rows = max(1, -(-len(feature.columns) // 4))  # ceiling division
    plt.figure(figsize=(4 * 6, 6 * len(feature.columns) / 6))
    for count, column in enumerate(candidates, start=1):
        if feature[feature[column] == 0].empty:
            continue
        plt.subplot(n_rows, 4, count)
        sns.kdeplot(feature[feature[column] == 0]['total_purchase_amt'],
                    label=str(column) + ' == 0, purchase')
        sns.kdeplot(feature[feature[column] == 1]['total_purchase_amt'],
                    label=str(column) + ' == 1, purchase')


# Density plots showing how each feature splits the data set.  The source
# contains this cell twice, so the figure is drawn twice here as well.
_plot_binary_feature_kde(feature)
_plot_binary_feature_kde(feature)

# Drop the features that do not split the data set clearly
purchase_feature_seems_useless += ['is_gonna_work_tomorrow', 'is_fourth_week', 'weekday_onehot_4']

# ---- Removing collinear features ----
# Features rescued by the MV test step.  That step is not part of this script
# (its import is commented out), so fall back to an empty list when the name
# has not been defined.
feature_saved_from_mv_purchase = globals().get('feature_saved_from_mv_purchase', [])

# Keep every feature that was not rejected earlier, plus the rescued ones.
# .copy() so the product columns added below are not written onto a slice.
feature = feature[[
    x for x in feature.columns
    if (x not in feature_low_correlation + purchase_feature_seems_useless)
    or (x in feature_saved_from_mv_purchase)
]].copy()

# Correlate the numeric columns only; the frame still holds the ``date``
# column, which DataFrame.corr does not accept on current pandas.
purchase_cors = feature.select_dtypes(include=['number', 'bool']).corr()
purchase_cors['total_purchase_amt'] = np.abs(purchase_cors['total_purchase_amt'])
# Candidates ordered by decreasing absolute correlation with the purchase
# amount; the first two entries of that ordering are dropped.
# NOTE(review): presumably meant to skip the two label columns, but the second
# entry is simply the next-highest correlation - confirm.
feature_lists = list(purchase_cors.sort_values(by='total_purchase_amt', ascending=False).index)[2:]
feature_temp = feature.dropna().copy()

# Keep features in descending correlation order (outer loop) and remove
# partners in ascending correlation order (inner loop, walking backwards).
thershold = 0.8
for i in range(len(feature_lists)):
    for k in range(len(feature_lists) - 1, -1, -1):
        # The list changes while looping, so re-check the bounds each time.
        if i >= len(feature_lists) or k >= len(feature_lists) or i == k:
            break
        if np.abs(np.corrcoef(feature_temp[feature_lists[i]], feature_temp[feature_lists[k]])[0][1]) > thershold:
            # Try to keep the information as a product feature, provided
            # the product is not itself collinear with feature i.
            higher_feature_temp = feature_temp[feature_lists[i]] * feature_temp[feature_lists[k]]
            if np.abs(np.corrcoef(feature_temp[feature_lists[i]], higher_feature_temp)[0][1]) <= thershold:
                name = str(feature_lists[i]) + '%%%%' + str(feature_lists[k])
                feature_temp[name] = higher_feature_temp
                feature[name] = feature[feature_lists[i]] * feature[feature_lists[k]]
                feature_lists.append(name)
            # Either way the collinear partner k is removed.
            feature_temp = feature_temp.drop(feature_lists[k], axis=1)
            feature_lists.remove(feature_lists[k])

feature = feature[[x for x in feature_lists if x not in labels] + labels + ['date']]
feature_lists  # notebook-style display; a no-op in a script
feature.to_csv('Feature/purchase_feature_droped_0614.csv', index=False)

# ---- Selecting the winning features ----
# Split the data set
def split_data_underline(data):
    """Split *data* into an offline training set and an offline test set.

    Args:
        data: frame with a ``date`` column (datetime64, ``datetime.date``
            objects or parseable date strings).

    Returns:
        ``(trainset, testset)``: rows dated 2014-04-01 to 2014-07-31 and rows
        dated 2014-08-01 to 2014-08-31 respectively, with the original index
        and columns.
    """
    # Compare Timestamps with Timestamps: ordering a datetime64 column against
    # ``datetime.date`` objects raises TypeError on current pandas.
    dates = pd.to_datetime(data['date'])
    train_start = pd.Timestamp(2014, 4, 1)
    test_start = pd.Timestamp(2014, 8, 1)
    test_end = pd.Timestamp(2014, 9, 1)
    trainset = data[(train_start <= dates) & (dates < test_start)]
    testset = data[(test_start <= dates) & (dates < test_end)]
    return trainset, testset


# ---- Summary ----
以上是生活随笔為你收集整理的【算法竞赛学习】资金流入流出预测-挑战Baseline_特征工程的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 联想小新 Air 14 2023 烟霞紫
- 下一篇: 关于Dijkstra 和 Bellman