【算法竞赛学习】资金流入流出预测-挑战Baseline_建模预测
賽題簡介
螞蟻金服擁有上億會員,並且業務場景中每天都涉及大量的資金流入和流出,面對如此龐大的用戶群,資金管理壓力會非常大。在既保證資金流動性風險最小,又滿足日常業務運轉的情況下,精準地預測資金的流入流出情況變得尤為重要。此屆大賽以《資金流入流出預測》為題,期望參賽者能夠通過對例如余額寶用戶的申購贖回數據的把握,精準預測未來每日的資金流入流出情況。對貨幣基金而言,資金流入意味著申購行為,資金流出為贖回行為。
賽題與數據
競賽中使用的數據主要包含四個部分,分別為用戶基本信息數據、用戶申購贖回數據、收益率表和銀行間拆借利率表。詳見:https://tianchi.aliyun.com/competition/entrance/231573/information
建模預測
import datetime
import random
import warnings
from typing import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as skr
import xgboost as xgb
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings('ignore')
# NOTE(review): only NumPy's generator is seeded here; the feature-selection
# helpers shuffle with the stdlib `random` module, which this does not seed.
np.random.seed(1024)

# Names of the two target columns.
labels = ['total_purchase_amt', 'total_redeem_amt']

# Start dates of the sliding evaluation windows, shared by
# week_evalution_single() and draw_eva_table().
_EVAL_WINDOW_STARTS = [
    datetime.date(2014, 8, 1),
    datetime.date(2014, 7, 25),
    datetime.date(2014, 7, 18),
    datetime.date(2014, 7, 11),
    datetime.date(2014, 7, 4),
    datetime.date(2014, 6, 27),
    datetime.date(2014, 6, 20),
]


def _date_window(data: pd.DataFrame, start, end) -> pd.DataFrame:
    """Return the rows of *data* whose 'date' lies in the half-open range [start, end).

    Both bounds are converted to ``pd.Timestamp`` because recent pandas
    versions refuse to order a datetime64 column against ``datetime.date``.
    """
    dates = pd.to_datetime(data['date'])
    return data[(pd.Timestamp(start) <= dates) & (dates < pd.Timestamp(end))]


# Split the data set
def split_data_underline(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Offline split: train on 2014-04-01..2014-07-31, test on August 2014."""
    trainset = _date_window(data, datetime.date(2014, 4, 1), datetime.date(2014, 8, 1))
    testset = _date_window(data, datetime.date(2014, 8, 1), datetime.date(2014, 9, 1))
    return trainset, testset


def split_data_online(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Online split: train on 2014-04-01..2014-08-31, test on September 2014."""
    trainset = _date_window(data, datetime.date(2014, 4, 1), datetime.date(2014, 9, 1))
    testset = _date_window(data, datetime.date(2014, 9, 1), datetime.date(2014, 10, 1))
    return trainset, testset


# Evaluation functions
def AE(y: Iterable, yhat: Iterable) -> Iterable:
    """Return the element-wise relative absolute error |y - yhat| / |y|."""
    return np.abs(y - yhat) / np.abs(y)


def _exp_score(y: Iterable, yhat: Iterable, h: float):
    """Sum of 10 * exp(-AE / h) over all elements (NaN propagates)."""
    errors = np.asarray(AE(y, yhat), dtype=float)
    return np.sum(np.exp(-errors / h) * 10)


def total_AE(purchasehat: Iterable, redeemhat: Iterable, purchase: Iterable,
             redeem: Iterable, h: float = 0.3) -> float:
    """Return the combined score: 0.45 * purchase score + 0.55 * redeem score.

    Each score is the sum over days of ``10 * exp(-relative_error / h)``.
    """
    return (_exp_score(purchase, purchasehat, h) * 0.45
            + _exp_score(redeem, redeemhat, h) * 0.55)


# Validate a model over several time windows
def week_evalution_single(data: pd.DataFrame, model: object, types: str) -> pd.DataFrame:
    """Fit *model* on each sliding window and score it on the following month.

    For every start date in ``_EVAL_WINDOW_STARTS`` the model is trained on
    the preceding four months and scored on the month starting at that date.

    Args:
        data: frame with a 'date' column, both label columns and the features.
        model: estimator exposing ``fit(X=, y=)`` and ``predict``.
        types: 'purchase' or 'redeem'; selects the ``total_<types>_amt`` target.

    Returns:
        A single-column DataFrame (column ``0``) with one score per window.
    """
    target = 'total_' + types + '_amt'
    a_month = relativedelta(months=1)
    h = 0.3
    results = []
    for start in _EVAL_WINDOW_STARTS:
        trainset = _date_window(data, start - 4 * a_month, start)
        testset = _date_window(data, start, start + a_month)
        if len(testset) == 0 or len(trainset) == 0:
            # Fallback kept from the original code: re-anchor the window at
            # 2014-04-20 and test on everything up to 2014-09-01.
            start = datetime.date(2014, 4, 20)
            trainset = _date_window(data, start - 4 * a_month, start)
            testset = _date_window(data, start, datetime.date(2014, 9, 1))
        feature = [x for x in trainset.columns if x not in labels + ['date']]
        model.fit(X=trainset[feature], y=trainset[target])
        predicted = model.predict(testset[feature])
        results.append(_exp_score(testset[target], predicted, h))
    return pd.DataFrame(results)


# Build the evaluation table
def draw_eva_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with an 'interval' column of window start dates.

    *df* must have exactly one row per entry of ``_EVAL_WINDOW_STARTS``.
    """
    rest = df.copy()
    rest['interval'] = list(_EVAL_WINDOW_STARTS)
    return rest


def _plot_real_vs_predicted(testset: pd.DataFrame, predicted: Iterable, kind: str) -> None:
    """Plot predicted vs. real values of ``total_<kind>_amt`` plus a per-day error bar chart."""
    real = testset['total_' + kind + '_amt']
    plt.figure(figsize=(10, 4))
    plt.plot(testset['date'], predicted, label='predicted_' + kind)
    plt.plot(testset['date'], real, label='real_' + kind)
    plt.legend(loc='best')
    plt.title("The distribution of real and predict " + kind)
    plt.xlabel("Time")
    plt.ylabel("Amount")
    plt.show()
    plt.figure(figsize=(10, 4))
    # Keyword arguments: newer seaborn releases no longer accept x/y positionally.
    sns.barplot(x=testset['date'].dt.day, y=predicted - real)


# Visualise the generated results
def visual(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame) -> None:
    """Plot predictions against the real purchase and redeem amounts of *testset*."""
    _plot_real_vs_predicted(testset, result_purchase_lr, 'purchase')
    _plot_real_vs_predicted(testset, result_redeem_lr, 'redeem')


# Greedy selection of the features that score best offline
def feature_extract(data: pd.DataFrame, model: object, types: str) -> Tuple[List[str], float]:
    """Greedy forward selection over a random feature order.

    A feature is kept when adding it raises the mean sliding-window score.

    Returns:
        (selected feature names, best mean score).
    """
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    score = -1
    for candidate in features:
        columns = results + [candidate] + labels + ['date']
        # Column 0 holds the window scores; averaging it yields a scalar.
        score_update = np.mean(week_evalution_single(data[columns], model, types)[0])
        if score_update > score:
            score = score_update
            results.append(candidate)
    return results, score


def robust_feature_extract(data: pd.DataFrame, model: object, types: str) -> List[str]:
    """Run feature_extract() ten times and return the best-scoring feature set."""
    results = []
    score = -1
    for _ in range(10):
        results_update, score_update = feature_extract(data, model, types)
        if score_update > score:
            score = score_update
            results = results_update
        print(results_update, score_update)
    return results


# AIC / BIC criteria
def AIC(L: int, delta: float, n_features: int) -> float:
    """Return ``L * log10(delta) + 2 * (n_features + 1)``."""
    return L * np.log10(delta) + 2 * (n_features + 1)


def BIC(L: int, delta: float, n_features: int) -> float:
    """Return ``L * log10(delta) + (n_features + 1) * log10(L)``.

    The original file defined this formula under the name ``AIC`` as well,
    which silently replaced the AIC definition above.
    """
    return L * np.log10(delta) + (n_features + 1) * np.log10(L)


# Model averaging driven by the AIC criterion
def feature_extract_AIC(data: pd.DataFrame, model: object, types: str) -> Tuple[List[str], float]:
    """Select a feature subset and return it with its AIC on the offline train split.

    Returns:
        (selected feature names, AIC computed from the training-set MSE).
    """
    features = [x for x in data.columns if x not in labels + ['date']]
    random.shuffle(features)
    results = []
    test_score = 1e9
    for candidate in features:
        columns = results + [candidate] + labels + ['date']
        test_score_update = np.mean(week_evalution_single(data[columns], model, types)[0])
        # NOTE(review): a feature is kept when it LOWERS the window score,
        # whereas feature_extract() keeps features that raise it - confirm
        # that this direction is intended.
        if test_score_update < test_score:
            test_score = test_score_update
            results.append(candidate)

    trainset, testset = split_data_underline(data)
    target = 'total_' + types + '_amt'
    model.fit(X=trainset[results], y=trainset[target])
    train_result_lr = model.predict(trainset[results])
    delta = mean_squared_error(trainset[target], train_result_lr)
    return results, AIC(len(trainset), delta, len(results))


def multi_model(data: pd.DataFrame, model: object, types: str) -> Tuple[List[List[str]], List[float]]:
    """Draw 100 feature subsets via feature_extract_AIC() and weight them.

    Returns:
        (list of feature subsets, list of weights normalised to sum to 1).
    """
    features = []
    weights = []
    for _ in range(100):
        results_update, score_update = feature_extract_AIC(data, model, types)
        features.append(results_update)
        weights.append(score_update)
    avg = np.mean(weights)
    weights = [x - avg for x in weights]
    # NOTE(review): this raises (-x / 2) to the 10th power, which discards the
    # sign of the centred AIC; an Akaike-style weight would look more like
    # 10 ** (-x / 2). Left unchanged - confirm the intent.
    weights = [np.power((-1 * x / 2), 10) for x in weights]
    summ = np.sum(weights)
    weights = [x / summ for x in weights]
    return features, weights


def _fit_predict(trainset: pd.DataFrame, testset: pd.DataFrame, feature: Iterable,
                 model, target: str) -> Iterable:
    """Fit *model* on *trainset* and predict *target* for *testset*."""
    if model is None:
        # A fresh estimator per call instead of one instance shared as a default.
        model = LinearRegression()
    model.fit(X=trainset[feature], y=trainset[target])
    return model.predict(testset[feature])


# Generate online results
def generate_online_result(df: pd.DataFrame, feature: Iterable, model=None,
                           target: str = 'total_purchase_amt') -> Iterable:
    """Train on the online split (April-August) and predict September.

    *model* defaults to a new ``LinearRegression``.
    """
    trainset, testset = split_data_online(df)
    return _fit_predict(trainset, testset, feature, model, target)


def generate_under_result(df: pd.DataFrame, feature: Iterable, model=None,
                          target: str = 'total_purchase_amt') -> Iterable:
    """Train on the offline split (April-July) and predict August.

    *model* defaults to a new ``LinearRegression``.
    """
    trainset, testset = split_data_underline(df)
    return _fit_predict(trainset, testset, feature, model, target)


# Build the submission format
def normalize_upload_file(result_purchase_lr: Iterable, result_redeem_lr: Iterable,
                          testset: pd.DataFrame) -> pd.DataFrame:
    """Return a frame with columns date (``YYYYMMDD`` string), purchase and redeem.

    The caller's *testset* is left untouched; the original wrote the
    predictions into it in place.
    """
    online_upload = testset[['date']].copy()
    online_upload['total_purchase_amt'] = result_purchase_lr
    online_upload['total_redeem_amt'] = result_redeem_lr
    online_upload['date'] = online_upload['date'].astype(str).str.replace('-', '', regex=False)
    return online_upload[['date', 'total_purchase_amt', 'total_redeem_amt']]


# Visualise online results
def draw_result(result_purchase_lr: Iterable, result_redeem_lr: Iterable, testset: pd.DataFrame) -> None:
    """Plot predicted purchase and redeem amounts against the day of month."""
    days = testset['date'].dt.day
    plt.figure(figsize=(10, 4))
    plt.plot(days, result_purchase_lr, label='online_purchase')
    plt.plot(days, result_redeem_lr, label='online_redeem')
    plt.legend(loc='best')
    plt.title("The predict values")
    plt.xlabel("Time")
    plt.ylabel("Amount")


# Weighted addition of two DataFrames
def add_two_df(df1, df2, features=None, left_a=0.45, right_a=0.55):
    """Return ``df1 * left_a + df2 * right_a`` over *features*.

    *features* defaults to every column of *df1* except 'interval'.
    """
    data = df1.copy()
    if not features:
        features = [x for x in data.columns if x != 'interval']
    for column in features:
        data[column] = data[column] * left_a + df2[column] * right_a
    return data


# Scalar multiplication of a DataFrame
def scale_df(df1, features=None, eta=1):
    """Return a copy of *df1* with *features* multiplied by *eta*.

    *features* defaults to every column of *df1* except 'interval'.
    """
    data = df1.copy()
    if not features:
        features = [x for x in data.columns if x != 'interval']
    for column in features:
        data[column] *= eta
    return data


# ---- Section: modeling tests ----
僅使用IS特征
# Experiment: use only the IS feature table.
data = pd.read_csv('Dataset/feature0522.csv')
data['date'] = pd.to_datetime(data['date'])
trainset, testset = split_data_underline(data)

# Every column except the two targets and the date is a model input.
non_feature_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']
is_features = [col for col in data.columns if col not in non_feature_columns]
result_purchase_lr = generate_under_result(data, is_features, target='total_purchase_amt')
result_redeem_lr = generate_under_result(data, is_features, target='total_redeem_amt')

# ---- Section: prediction score for August ----
total_AE(result_purchase_lr, result_redeem_lr,
         testset['total_purchase_amt'], testset['total_redeem_amt'])

# ---- Section: sliding-window test results ----
draw_eva_table(week_evalution_single(data, model=LinearRegression(), types='purchase'))
draw_eva_table(week_evalution_single(data, LinearRegression(), 'redeem'))

# ---- Section: August predictions vs. real values ----
visual(result_purchase_lr, result_redeem_lr, testset)
九月份預測效果圖(線性)
# The predictions still in scope at this point were produced by
# generate_under_result(), i.e. 31 values for August. September has 30 rows,
# so they cannot be plotted or uploaded against the online test set.
# Re-fit on the online split and predict September instead.
trainset, testset = split_data_online(data)
online_features = [x for x in data.columns
                   if x not in ['total_purchase_amt', 'total_redeem_amt', 'date']]
result_purchase_lr = generate_online_result(data, online_features, target='total_purchase_amt')
result_redeem_lr = generate_online_result(data, online_features, target='total_redeem_amt')
draw_result(result_purchase_lr, result_redeem_lr, testset)
normalize_upload_file(result_purchase_lr, result_redeem_lr, testset).to_csv(
    '20190612_only_is.csv', index=False, header=False)

# ---- Section: comparison of several models ----
def multi_model_eva(data, types: str = 'purchase'):
    """Build one table of sliding-window scores with a column per model.

    Each candidate model is evaluated with week_evalution_single(); its score
    column is renamed to the model's class name and the per-model tables are
    joined on 'interval'. The 'interval' column is placed first.
    """
    candidates = (
        LinearRegression(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        MLPRegressor(solver='lbfgs'),
        xgb.XGBRegressor(objective='reg:squarederror'),
    )

    def scored_table(estimator):
        # Text before the first '(' of the repr is used as the column name.
        column_name = repr(estimator).split('(')[0]
        table = draw_eva_table(week_evalution_single(data, estimator, types))
        return table.rename(columns={0: column_name})

    results = pd.DataFrame()
    for estimator in candidates:
        table = scored_table(estimator)
        results = table if results.empty else pd.merge(results, table, on='interval')

    score_columns = [name for name in results.columns if name != 'interval']
    return results[['interval'] + score_columns]


add_two_df(multi_model_eva(data, 'purchase'), multi_model_eva(data, 'redeem'))

# ---- Section: comparison after pruning weak features ----
# Load the pruned feature tables, one per target.
data_purchase = pd.read_csv('Feature/purchase_feature_droped_0614.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
data_redeem = pd.read_csv('Feature/redeem_feature_droped_0614.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])

excluded_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']

trainset_purchase, testset_purchase = split_data_underline(data_purchase)
purchase_inputs = [col for col in data_purchase.columns if col not in excluded_columns]
result_purchase_lr = generate_under_result(
    data_purchase, purchase_inputs, target='total_purchase_amt')

trainset_redeem, testset_redeem = split_data_underline(data_redeem)
redeem_inputs = [col for col in data_redeem.columns if col not in excluded_columns]
result_redeem_lr = generate_under_result(
    data_redeem, redeem_inputs, target='total_redeem_amt')

total_AE(result_purchase_lr, result_redeem_lr,
         testset_purchase['total_purchase_amt'], testset_redeem['total_redeem_amt'])
add_two_df(multi_model_eva(data_purchase, 'purchase'), multi_model_eva(data_redeem, 'redeem'))

# ---- Section: August predictions (linear model) ----
trainset, testset = split_data_underline(data)
visual(result_purchase_lr, result_redeem_lr, testset)
生成線上效果(線性)
# The predictions in scope here are the August (offline) ones; plotting them
# against the September test set fails on the length mismatch. Predict
# September from the pruned feature tables with the default linear model.
excluded_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']
result_purchase_lr = generate_online_result(
    data_purchase,
    [x for x in data_purchase.columns if x not in excluded_columns],
    target='total_purchase_amt')
result_redeem_lr = generate_online_result(
    data_redeem,
    [x for x in data_redeem.columns if x not in excluded_columns],
    target='total_redeem_amt')
trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
purchase feature
‘dis_to_nowork’, ‘dis_to_work’, ‘dis_from_work’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘weekday_onehot_6’,
‘dis_from_nowork’, ‘is_holiday’, ‘weekday_onehot_1’, ‘weekday_onehot_2’,
‘weekday_onehot_0’, ‘dis_from_middleofweek’, ‘dis_from_holiendday’,
‘weekday_onehot_3’, ‘is_lastday_of_holiday’, ‘is_firstday_of_holiday’,
‘weekday_onehot_4’, ‘is_worked_yestday’, ‘is_second_week’,
‘is_third_week’, ‘dis_from_startofmonth’, ‘dis_from_holiday’,
‘dis_to_nowork%%%%dis_from_purchase_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’
Redeem feature
‘is_work’, ‘dis_from_redeem_valley’, ‘purchase_weekdayrate’,
‘redeem_dayrate’, ‘weekday_onehot_5’, ‘is_gonna_work_tomorrow’,
‘is_holiday’, ‘dis_from_nowork’, ‘weekday_onehot_0’, ‘weekday_onehot_1’,
‘is_firstday_of_holiday’, ‘weekday_onehot_2’, ‘is_lastday_of_holiday’,
‘dis_from_holiday’, ‘is_work_on_sunday’, ‘is_firstday_of_work’,
‘is_secday_of_month’, ‘dis_from_holiendday’,
‘dis_from_redeem_valley%%%%dis_from_redeem_peak’, ‘total_purchase_amt’,
‘total_redeem_amt’, ‘date’
生成線上效果(MLP)
# September predictions from an MLP regressor, one fresh model per target.
excluded_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']
purchase_inputs = [col for col in data_purchase.columns if col not in excluded_columns]
redeem_inputs = [col for col in data_redeem.columns if col not in excluded_columns]

result_purchase_lr = generate_online_result(
    data_purchase, purchase_inputs, MLPRegressor(solver='lbfgs'), 'total_purchase_amt')
result_redeem_lr = generate_online_result(
    data_redeem, redeem_inputs, MLPRegressor(solver='lbfgs'), 'total_redeem_amt')

trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
upload = normalize_upload_file(result_purchase_lr, result_redeem_lr, testset)
upload.to_csv('20190614_droped_MLP.csv', index=False, header=None)

# ---- Section: online results (XGBoost) ----
# September predictions from XGBoost, one fresh model per target.
excluded_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']
purchase_inputs = [col for col in data_purchase.columns if col not in excluded_columns]
redeem_inputs = [col for col in data_redeem.columns if col not in excluded_columns]

result_purchase_lr = generate_online_result(
    data_purchase, purchase_inputs,
    xgb.XGBRegressor(objective='reg:squarederror'), 'total_purchase_amt')
result_redeem_lr = generate_online_result(
    data_redeem, redeem_inputs,
    xgb.XGBRegressor(objective='reg:squarederror'), 'total_redeem_amt')

trainset, testset = split_data_online(data)
draw_result(result_purchase_lr, result_redeem_lr, testset)
upload = normalize_upload_file(result_purchase_lr, result_redeem_lr, testset)
upload.to_csv('20190615_droped_XGB.csv', index=False, header=None)

# ---- Section: AIC model averaging ----
purchase_features, purchase_weight = multi_model(
    data_purchase, model=LinearRegression(), types='purchase')
redeem_features, redeem_weight = multi_model(
    data_redeem, model=LinearRegression(), types='redeem')


def eva_for_aic(data_purchase, purchase_features, purchase_weight, types='purchase'):
    """Return the weighted sum of multi_model_eva() tables over feature subsets.

    Args:
        data_purchase: feature table holding 'date', both label columns and
            every feature named in *purchase_features*.
        purchase_features: list of feature-name lists, one per sub-model.
        purchase_weight: weight of each sub-model, same order as the features.
        types: target evaluated by multi_model_eva(), 'purchase' or 'redeem'.
            The original always evaluated 'purchase', including when it was
            handed the redeem feature sets.

    Returns:
        The weighted score table, or an empty DataFrame when no feature
        subsets are given.
    """
    results = pd.DataFrame()
    for feature, weight in zip(purchase_features, purchase_weight):
        table = multi_model_eva(data_purchase[['date'] + labels + feature], types)
        if results.empty:
            results = scale_df(table, eta=weight)
        else:
            results = add_two_df(results, table, left_a=1, right_a=weight)
    return results


add_two_df(eva_for_aic(data_purchase, purchase_features, purchase_weight, types='purchase'),
           eva_for_aic(data_redeem, redeem_features, redeem_weight, types='redeem'))

# ---- Section: modeling the residual ----
data_purchase = pd.read_csv('Feature/residual_feature_purchase_0621.csv')
data_purchase['date'] = pd.to_datetime(data_purchase['date'])
data_redeem = pd.read_csv('Feature/residual_feature_redeem_0621.csv')
data_redeem['date'] = pd.to_datetime(data_redeem['date'])
base = pd.read_csv('Dataset/base.csv')
# Parse the YYYYMMDD dates once at load time. The original did this as a
# hidden side effect of the first generate_residual_result() call.
base['date'] = pd.to_datetime(base['date'], format="%Y%m%d")

# Start dates of the sliding evaluation windows used in this section.
_RESIDUAL_WINDOW_STARTS = [
    datetime.date(2014, 8, 1),
    datetime.date(2014, 7, 25),
    datetime.date(2014, 7, 18),
    datetime.date(2014, 7, 11),
    datetime.date(2014, 7, 4),
    datetime.date(2014, 6, 27),
    datetime.date(2014, 6, 20),
]


def _rows_between(frame, dates, start, end):
    """Return the rows of *frame* whose *dates* lie in [start, end).

    Bounds are converted to ``pd.Timestamp`` because recent pandas versions
    refuse to order a datetime64 column against ``datetime.date``.
    """
    return frame[(dates >= pd.Timestamp(start)) & (dates < pd.Timestamp(end))]


def generate_residual_result(data, base, model=None, types='purchase',
                             split_time=datetime.date(2014, 8, 1)):
    """Predict one month as (model-predicted rate) * (cycle-based prediction).

    Args:
        data: feature table with 'date', both label columns and the features.
        base: table with a 'date' column (``YYYYMMDD`` or datetime) and the
            ``total_<types>_predicted_by_cycle`` columns. It is not modified.
        model: estimator with ``fit``/``predict``; defaults to a new
            ``LinearRegression``.
        types: 'purchase' or 'redeem'.
        split_time: first day of the predicted month; training uses
            2014-04-01 up to (excluding) this day.

    Returns:
        NumPy array with the combined prediction for the month.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    target = 'total_' + types + '_amt'
    data_dates = pd.to_datetime(data['date'])
    trainset = _rows_between(data, data_dates, datetime.date(2014, 4, 1), split_time)
    testset = _rows_between(data, data_dates, split_time, split_time + a_month)
    # Features come from the frame that was passed in. The original read the
    # global data_purchase here regardless of which table it was given.
    feature = [x for x in data.columns
               if x not in ['total_purchase_amt', 'total_redeem_amt', 'date']]
    model.fit(X=trainset[feature], y=trainset[target])
    result_rate = model.predict(testset[feature])

    base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
    month_of_base = _rows_between(base, base_dates, split_time, split_time + a_month)
    result_cycle = np.array(month_of_base['total_' + types + '_predicted_by_cycle'])
    return result_rate * result_cycle


def generate_evaluate_for_residual(model=None):
    """Score the residual approach on every sliding window.

    Returns:
        Single-column DataFrame (column ``0``) with one total_AE() score per
        window start.
    """
    if model is None:
        model = LinearRegression()
    a_month = relativedelta(months=1)
    # Real amounts are read from the module-level `data` frame.
    # NOTE(review): presumably the residual tables store rates rather than
    # amounts in their label columns - confirm.
    data_dates = pd.to_datetime(data['date'])
    result = []
    for start in _RESIDUAL_WINDOW_STARTS:
        result_purchase_residual = generate_residual_result(
            data_purchase, base, model=model, types='purchase', split_time=start)
        # The redeem prediction uses the redeem table; the original passed
        # data_purchase here as well.
        result_redeem_residual = generate_residual_result(
            data_redeem, base, model=model, types='redeem', split_time=start)
        testset = _rows_between(data, data_dates, start, start + a_month)
        result.append(total_AE(result_purchase_residual, result_redeem_residual,
                               testset['total_purchase_amt'], testset['total_redeem_amt']))
    return pd.DataFrame(result)


def multi_model_eva_for_residual():
    """Return a table of residual-approach scores with one column per model."""
    results = pd.DataFrame()
    for model in [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),
                  GradientBoostingRegressor(), MLPRegressor(solver='lbfgs'),
                  xgb.XGBRegressor(objective='reg:squarederror')]:
        table = draw_eva_table(generate_evaluate_for_residual(model)).rename(
            columns={0: repr(model).split('(')[0]})
        # Join on the window start explicitly instead of on "all shared columns".
        results = table if results.empty else pd.merge(results, table, on='interval')
    results = results[['interval'] + [x for x in results.columns if x != 'interval']]
    return results


def generate_evaluate_for_cycle():
    """Score the pure cycle-factor predictions on every sliding window.

    Returns:
        Single-column DataFrame named 'PureTimeSeries'.
    """
    a_month = relativedelta(months=1)
    base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
    data_dates = pd.to_datetime(data['date'])
    result = []
    for start in _RESIDUAL_WINDOW_STARTS:
        # Both frames get a fresh 0..n-1 index so the Series line up by position.
        predicted = _rows_between(base, base_dates, start, start + a_month).reset_index(drop=True)
        real = _rows_between(data, data_dates, start, start + a_month).reset_index(drop=True)
        result.append(total_AE(predicted['total_purchase_predicted_by_cycle'],
                               predicted['total_redeem_predicted_by_cycle'],
                               real['total_purchase_amt'], real['total_redeem_amt']))
    return pd.DataFrame(result).rename(columns={0: 'PureTimeSeries'})


pd.merge(multi_model_eva_for_residual(), draw_eva_table(generate_evaluate_for_cycle()),
         on='interval')

# ---- Section: August predictions using the cycle factor only ----
_, testset = split_data_underline(data)
real_purchase = testset['total_purchase_amt']
real_redeem = testset['total_redeem_amt']

# Compare against pd.Timestamp bounds: recent pandas versions raise when a
# datetime64 column is ordered against datetime.date. to_datetime() is a
# no-op when the column has already been parsed.
base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
august_base = base[(base_dates >= pd.Timestamp(2014, 8, 1))
                   & (base_dates < pd.Timestamp(2014, 9, 1))]
result_purchase_cycle = np.array(august_base['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(august_base['total_redeem_predicted_by_cycle'])

total_AE(result_purchase_cycle, result_redeem_cycle, real_purchase, real_redeem)
trainset, testset = split_data_underline(data)
visual(result_purchase_cycle, result_redeem_cycle, testset)
只使用周期因子+預測殘差在8月份的預測效果(比單純用因子好)
# August: cycle-factor prediction scaled by the model-predicted rate.
excluded_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']

trainset_purchase, testset_purchase = split_data_underline(data_purchase)
purchase_inputs = [col for col in data_purchase.columns if col not in excluded_columns]
result_purchase_rate = generate_under_result(
    data_purchase, purchase_inputs, target='total_purchase_amt')

trainset_redeem, testset_redeem = split_data_underline(data_redeem)
redeem_inputs = [col for col in data_redeem.columns if col not in excluded_columns]
result_redeem_rate = generate_under_result(
    data_redeem, redeem_inputs, target='total_redeem_amt')

total_AE(result_purchase_rate * result_purchase_cycle,
         result_redeem_rate * result_redeem_cycle,
         real_purchase, real_redeem)
trainset, testset = split_data_underline(data)
visual(result_purchase_rate * result_purchase_cycle,
       result_redeem_rate * result_redeem_cycle,
       testset)
生成線上結果
excluded_columns = ['total_purchase_amt', 'total_redeem_amt', 'date']

trainset_purchase, testset_purchase = split_data_online(data_purchase)
result_purchase_rate = generate_online_result(
    data_purchase,
    [x for x in data_purchase.columns if x not in excluded_columns],
    target='total_purchase_amt')
trainset_redeem, testset_redeem = split_data_online(data_redeem)
result_redeem_rate = generate_online_result(
    data_redeem,
    [x for x in data_redeem.columns if x not in excluded_columns],
    target='total_redeem_amt')

# Try a correction: rescale each predicted rate series to mean 1.
result_purchase_rate = result_purchase_rate / np.mean(result_purchase_rate)
result_redeem_rate = result_redeem_rate / np.mean(result_redeem_rate)

# Compare against pd.Timestamp bounds: recent pandas versions raise when a
# datetime64 column is ordered against datetime.date. to_datetime() is a
# no-op when the column has already been parsed.
base_dates = pd.to_datetime(base['date'], format="%Y%m%d")
september_base = base[(base_dates >= pd.Timestamp(2014, 9, 1))
                      & (base_dates < pd.Timestamp(2014, 10, 1))]
result_purchase_cycle = np.array(september_base['total_purchase_predicted_by_cycle'])
result_redeem_cycle = np.array(september_base['total_redeem_predicted_by_cycle'])

result_purchase_residual = result_purchase_rate * result_purchase_cycle
result_redeem_residual = result_redeem_rate * result_redeem_cycle
draw_result(result_purchase_cycle, result_redeem_cycle, testset_redeem)

# ---- Section: results after the residual correction ----
draw_result(result_purchase_residual, result_redeem_residual, testset_redeem)

# Build the upload frame once and reuse it for both the CSV and the patching.
residual_upload = normalize_upload_file(
    result_purchase_residual, result_redeem_residual, testset_redeem)
residual_upload.to_csv('20190622_residual_liner.csv', index=False, header=False)

result_score135 = pd.read_csv('Result/timeseries0606.csv', header=None)
result_residual = residual_upload.reset_index(drop=True)
result_residual['date'] = result_residual['date'].astype(int)

# Days whose values are replaced by the residual-corrected predictions.
days_need_to_change = [20140906, 20140907, 20140908, 20140928]

# Look the replacement values up by date rather than by row position, so the
# patch stays correct even if the two files are not in the same row order.
# A day missing from the residual frame raises KeyError instead of silently
# copying a value from the wrong row.
residual_by_date = result_residual.set_index('date')
rows_to_patch = result_score135[0].isin(days_need_to_change)
patch_dates = result_score135.loc[rows_to_patch, 0]
result_score135.loc[rows_to_patch, 1] = (
    residual_by_date.loc[patch_dates, 'total_purchase_amt'].to_numpy())
result_score135.loc[rows_to_patch, 2] = (
    residual_by_date.loc[patch_dates, 'total_redeem_amt'].to_numpy())

result_score135.to_csv('result135_fixed_by_residual_0621.csv', index=False, header=False)

# ---- Section: summary ----
以上是生活随笔為你收集整理的【算法竞赛学习】资金流入流出预测-挑战Baseline_建模预测的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 关于Dijkstra 和 Bellman
- 下一篇: 【算法竞赛学习】数据分析达人赛2:产品关