當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

构建五种机器学习模型作比较（某金融数据集）

發布時間：2023/12/10 编程问答 24 豆豆

生活随笔收集整理的這篇文章主要介紹了构建五种机器学习模型作比较（某金融数据集），小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

導入各種包

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

導入數據

# Load the dataset: the first CSV column becomes the row index and the file
# is decoded as GBK.
data = pd.read_csv(
    './data.csv',
    index_col=0,
    encoding='gbk',
)

數據理解

# Separate the label column `status` (y) from the remaining columns (X).
y = data.loc[:, 'status']
X = data.drop(columns='status')

# Report the feature-matrix dimensions and the class distribution of y.
print('X.shape:', X.shape)
print('y的分布:', y.value_counts())

# Output recorded in the original post:
#   X.shape: (4754, 88)
#   y的分布: 0    3561
#   1    1193
#   Name: status, dtype: int64

數據準備

# Drop identifier-like columns that carry no predictive signal.
X.drop(
    columns=['id_name', 'custid', 'trade_no', 'bank_card_no'],
    inplace=True,
)
print(X.shape)

# Numeric features: work on an independent copy.
X_num = X.select_dtypes(include='number').copy()
print(X_num.shape)
type(X_num.mean())  # notebook inspection only; no effect when run as a script

# Impute missing numeric values with each column's mean.
X_num = X_num.fillna(X_num.mean())

# Non-numeric features, inspected separately.
X_str = X.select_dtypes(exclude='number').copy()
X_str.describe()  # notebook inspection only

# Keep only `reg_preference_for_trad` from the non-numeric columns: fill its
# gaps with the most frequent value, then one-hot encode it. The other
# non-numeric columns are left out of the final feature matrix.
X_str['reg_preference_for_trad'] = X_str['reg_preference_for_trad'].fillna(
    X_str['reg_preference_for_trad'].mode()[0]
)
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])
X_str_dummy.head()  # notebook inspection only

# Combine the numeric features and the one-hot encoded nominal feature
# column-wise.
X_cl = pd.concat([X_num, X_str_dummy], axis=1, sort=False)
# X_cl.shape

# Output recorded in the original post:
#   (4754, 84)
#   (4754, 80)

數據建模和評估（建立函數和類整合五個模型）

# Split into training and test sets with a 70/30 ratio. The seed is kept in
# one variable so the split and the seeded estimators below stay in sync.
random_state = 1118
X_train, X_test, y_train, y_test = train_test_split(
    X_cl, y, test_size=0.3, random_state=random_state
)
print(X_train.shape)
print(X_test.shape)

# XGBoost model (library defaults).
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train, y_train)

# LightGBM model (library defaults).
lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train, y_train)

# SVM model. NOTE: despite the variable name, svm.SVC() is called without a
# `kernel` argument, so it uses the library default kernel rather than a
# linear one. SVC is sensitive to feature scale: in the run recorded at the
# bottom of this block (no scaling) it reached 1.0 training accuracy but
# 0.0 test precision/recall. Standardizing inside a pipeline means the
# scaler is fit on the training data only.
Lin_SVC = make_pipeline(
    StandardScaler(),
    svm.SVC(probability=True, random_state=random_state),
)
Lin_SVC.fit(X_train, y_train)

# Decision tree model; seeded so repeated runs give the same tree.
dt = DecisionTreeClassifier(random_state=random_state)
dt.fit(X_train, y_train)

# Logistic regression model. It is standardized for the same reason as the
# SVM: the unscaled run recorded below scored 0.0 precision/recall on both
# the training and the test set.
lr = make_pipeline(StandardScaler(), LogisticRegression())
lr.fit(X_train, y_train)


def model_metrics(clf, X_train, X_test, y_train, y_test):
    """Print train/test classification metrics for ``clf`` and plot its ROC curves.

    Args:
        clf: A fitted binary classifier exposing ``predict`` and
            ``predict_proba``; column 1 of ``predict_proba`` is used as the
            score for the positive class (label 1).
        X_train: Training features.
        X_test: Test features.
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        None. Accuracy, precision, recall, F1 and AUC are printed for both
        sets, and a figure with both ROC curves is shown via ``plt.show()``.
    """
    # Hard predictions and positive-class scores.
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    y_train_proba = clf.predict_proba(X_train)[:, 1]
    y_test_proba = clf.predict_proba(X_test)[:, 1]

    # (printed tag, scoring function, train-side input, test-side input).
    # AUC is computed from scores; all other metrics from hard predictions.
    reports = (
        ('[準確率]', accuracy_score, y_train_pred, y_test_pred),
        ('[精準率]', precision_score, y_train_pred, y_test_pred),
        ('[召回率]', recall_score, y_train_pred, y_test_pred),
        ('[f1-score]', f1_score, y_train_pred, y_test_pred),
        ('[auc值]', roc_auc_score, y_train_proba, y_test_proba),
    )
    for tag, scorer, train_out, test_out in reports:
        print(tag, end=' ')
        print('訓練集：', '%.4f' % scorer(y_train, train_out), end=' ')
        print('測試集：', '%.4f' % scorer(y_test, test_out))

    # ROC curves for both sets, with the AUC shown in the legend.
    fpr_train, tpr_train, thresholds_train = roc_curve(
        y_train, y_train_proba, pos_label=1
    )
    fpr_test, tpr_test, thresholds_test = roc_curve(
        y_test, y_test_proba, pos_label=1
    )
    label = [
        "Train - AUC:{:.4f}".format(auc(fpr_train, tpr_train)),
        "Test - AUC:{:.4f}".format(auc(fpr_test, tpr_test)),
    ]
    plt.plot(fpr_train, tpr_train)
    plt.plot(fpr_test, tpr_test)
    plt.plot([0, 1], [0, 1], 'd--')  # chance-level diagonal for reference
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(label, loc=4)
    plt.title("ROC curve")
    plt.show()


if __name__ == "__main__":
    model_metrics(lr, X_train, X_test, y_train, y_test)
    model_metrics(dt, X_train, X_test, y_train, y_test)
    model_metrics(Lin_SVC, X_train, X_test, y_train, y_test)
    model_metrics(xgboost_model, X_train, X_test, y_train, y_test)
    model_metrics(lgbm_model, X_train, X_test, y_train, y_test)

# Output recorded in the original post. It was produced BEFORE feature
# standardization and explicit seeding were added above, so the logistic
# regression and SVM figures are expected to differ on a re-run.
#   (3327, 85)
#   (1427, 85)
# Logistic regression:
#   [準確率] 訓練集： 0.7532 測試集： 0.7393
#   [精準率] 訓練集： 0.0000 測試集： 0.0000
#   [召回率] 訓練集： 0.0000 測試集： 0.0000
#   [f1-score] 訓練集： 0.0000 測試集： 0.0000
#   [auc值] 訓練集： 0.5880 測試集： 0.5720
# Decision tree:
#   [準確率] 訓練集： 1.0000 測試集： 0.6959
#   [精準率] 訓練集： 1.0000 測試集： 0.4139
#   [召回率] 訓練集： 1.0000 測試集： 0.4005
#   [f1-score] 訓練集： 1.0000 測試集： 0.4071
#   [auc值] 訓練集： 1.0000 測試集： 0.6003
# SVM:
#   [準確率] 訓練集： 1.0000 測試集： 0.7393
#   [精準率] 訓練集： 1.0000 測試集： 0.0000
#   [召回率] 訓練集： 1.0000 測試集： 0.0000
#   [f1-score] 訓練集： 1.0000 測試集： 0.0000
#   [auc值] 訓練集： 0.0000 測試集： 0.5000
# XGBoost:
#   [準確率] 訓練集： 0.8533 測試集： 0.7835
#   [精準率] 訓練集： 0.8447 測試集： 0.6684
#   [召回率] 訓練集： 0.4970 測試集： 0.3360
#   [f1-score] 訓練集： 0.6258 測試集： 0.4472
#   [auc值] 訓練集： 0.9145 測試集： 0.7676
# LightGBM:
#   [準確率] 訓練集： 0.9973 測試集： 0.7694
#   [精準率] 訓練集： 0.9988 測試集： 0.5931
#   [召回率] 訓練集： 0.9903 測試集： 0.3683
#   [f1-score] 訓練集： 0.9945 測試集： 0.4544
#   [auc值] 訓練集： 0.9999 測試集： 0.7621
#
# The ROC figures that follow are, in order: logistic regression, decision
# tree, SVM, xgboost, lightgbm.

PS:有兩個模型（邏輯回歸和SVM）結果很奇怪：測試集的精準率、召回率都是0，也就是全部預測為負類。可能的原因是特征沒有做標準化，這兩個模型對特征尺度比較敏感，有待研究。

總結

以上是生活随笔為你收集整理的构建五种机器学习模型作比较（某金融数据集）的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： mysql中char与varchar的区
下一篇： java方法使用