A Summary of the Common sklearn Machine Learning Workflow
I had touched on sklearn here and there before, but never in depth. After some recent study, the following is a walkthrough of the common machine learning workflow.
1. 加載數(shù)據(jù)集
scikit-learn ships with several built-in datasets, the most famous being the Iris dataset. Columns 3 and 4 of the data matrix hold the petal length and petal width, and the class labels are already encoded as integers: 0 = Iris-Setosa, 1 = Iris-Versicolor, 2 = Iris-Virginica.

from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
iris.data  # inspect the raw data
X = iris.data[:, [2, 3]]
y = iris.target

print('Class labels:', np.unique(y))

2. Splitting into training and test sets

We usually split the dataset into a training set and a test set; here 70% goes to training and 30% to testing.

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

3. Standardizing the features

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
sc.scale_  # per-feature scale learned from the training set
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

4. Classification with scikit-learn's perceptron

from sklearn.linear_model import Perceptron

# ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)  # n_iter was renamed in newer versions
ppn = Perceptron()
ppn.fit(X_train_std, y_train)
ppn.coef_       # learned weights
ppn.intercept_  # learned bias terms

y_pred = ppn.predict(X_test_std)
y_pred == y_test  # element-wise comparison with the ground truth
print('Misclassified samples: %d' % (y_test != y_pred).sum())

from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

5. Training logistic regression (LR) with scikit-learn

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)

# plot_decision_regions is a custom plotting helper (sketched below), not a sklearn function
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()

# older sklearn accepted a 1-D sample; newer versions expect a 2-D array
if Version(sklearn_version) < '0.17':
    lr.predict_proba(X_test_std[0, :])
else:
    lr.predict_proba(X_test_std[0, :].reshape(1, -1))
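The plotting calls above and below rely on plot_decision_regions, a helper widely used in the "Python Machine Learning" book examples that this walkthrough appears to follow; it is not part of sklearn. A minimal sketch of such a helper (the implementation details here are an assumption, not the book's exact code):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # marker and color maps for up to five classes
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # evaluate the classifier over a grid covering the 2-D feature space
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples of each class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    c=colors[idx], marker=markers[idx], label=cl)

    # circle the test samples, if an index range is given
    if test_idx is not None:
        X_test = X[test_idx, :]
        plt.scatter(X_test[:, 0], X_test[:, 1], facecolors='none',
                    edgecolors='black', alpha=1.0, linewidths=1,
                    marker='o', s=55, label='test set')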
6. Overfitting and regularization

L1 and L2 regularization: an L1 penalty truncates, driving many weights to exactly zero and producing a sparse weight matrix; an L2 penalty shrinks, scaling all weights toward small values without zeroing them out.
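To see the difference in sklearn, a quick sketch, assuming the standardized Iris split from steps 1 to 3 (the C value and solver choice are illustrative assumptions):

from sklearn.linear_model import LogisticRegression

# L1 penalty: coefficients tend to be driven exactly to zero (sparse solution)
lr_l1 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr_l1.fit(X_train_std, y_train)
print('L1 coefficients:\n', lr_l1.coef_)

# L2 penalty: coefficients are shrunk toward zero but rarely exactly zero
lr_l2 = LogisticRegression(penalty='l2', C=0.1, solver='liblinear')
lr_l2.fit(X_train_std, y_train)
print('L2 coefficients:\n', lr_l2.coef_)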
7. Maximum-margin classification and support vector machines
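The original gives no code under this heading; a minimal sketch of a linear, maximum-margin SVM on the standardized Iris features (the C value is an illustrative assumption; C trades margin width off against training errors):

from sklearn.svm import SVC

# a linear kernel finds the maximum-margin separating hyperplane
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)
print('Training accuracy: %.2f' % svm.score(X_train_std, y_train))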
8. Nonlinear separation with SVM kernel functions
9. Using the kernel trick to find a separating hyperplane in a high-dimensional space
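The idea: instead of explicitly mapping samples into a higher-dimensional space, a kernel computes the inner products in that space directly. The RBF (Gaussian) kernel used below is K(x, z) = exp(-gamma * ||x - z||^2); larger gamma makes the decision boundary hug individual samples more tightly, which is exactly what the gamma=0.10 versus gamma=100.0 examples below illustrate.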
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt

# X_xor / y_xor: a synthetic XOR dataset (standard-normal points labeled by the
# XOR of their coordinate signs); its construction is assumed here, since the
# original does not show it
np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)

svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)

plot_decision_regions(X_xor, y_xor, classifier=svm)
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_xor.png', dpi=300)
plt.show()
from sklearn.svm import SVC

# a moderate gamma gives a smooth, well-generalizing boundary
svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()

# a very large gamma overfits: the boundary wraps around individual training samples
svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_2.png', dpi=300)
plt.show()

10. Decision tree learning
import matplotlib.pyplot as plt
import numpy as np

# the three impurity criteria used to choose decision tree splits,
# for a binary problem with class-1 probability p
def gini(p):
    return p * (1 - p) + (1 - p) * (1 - (1 - p))

def entropy(p):
    return - p * np.log2(p) - (1 - p) * np.log2(1 - p)

def error(p):
    return 1 - np.max([p, 1 - p])

x = np.arange(0.0, 1.0, 0.01)

ent = [entropy(p) if p != 0 else None for p in x]
sc_ent = [e * 0.5 if e else None for e in ent]
err = [error(i) for i in x]

fig = plt.figure()
ax = plt.subplot(111)
for i, lab, ls, c in zip([ent, sc_ent, gini(x), err],
                         ['Entropy', 'Entropy (scaled)',
                          'Gini Impurity', 'Misclassification Error'],
                         ['-', '-', '--', '-.'],
                         ['black', 'lightgray', 'red', 'green']):
    line = ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15),
          ncol=3, fancybox=True, shadow=False)
ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--')
ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--')
plt.ylim([0, 1.1])
plt.xlabel('p(i=1)')
plt.ylabel('Impurity Index')
plt.tight_layout()
# plt.savefig('./figures/impurity.png', dpi=300, bbox_inches='tight')
plt.show()
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train, y_train)  # trees do not require standardized features

X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined,
                      classifier=tree, test_idx=range(105, 150))

plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()
from sklearn.tree import export_graphviz

export_graphviz(tree, out_file='tree.dot',
                feature_names=['petal length', 'petal width'])
# render the exported file with Graphviz, e.g.: dot -Tpng tree.dot -o tree.png

from IPython.display import Image
Image(filename='./images/03_18.png', width=600)  # pre-rendered tree figure from the book's repo
11. Random forests: stacking many trees into a stronger classifier
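The original lists no code for this step; a minimal sketch in the same style, reusing X_combined/y_combined from the decision-tree step (the hyperparameter values are illustrative assumptions):

from sklearn.ensemble import RandomForestClassifier

# train an ensemble of 10 trees; n_jobs=2 fits trees on two cores in parallel
forest = RandomForestClassifier(criterion='entropy', n_estimators=10,
                                random_state=1, n_jobs=2)
forest.fit(X_train, y_train)

plot_decision_regions(X_combined, y_combined,
                      classifier=forest, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()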
12. K-nearest neighbors: a simple, naive classifier
from sklearn.neighbors import KNeighborsClassifier

# p=2 with the Minkowski metric is ordinary Euclidean distance
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined,
                      classifier=knn, test_idx=range(105, 150))

plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
plt.show()
Summary
That concludes this summary of the common sklearn machine learning workflow; I hope it helps you solve the problems you run into.