當前位置：首頁 > 编程语言 > python >内容正文

python

python实现knn算法鸢尾花_Python学习之knn实现鸢尾花分类

發布時間：2025/3/15 python 13 豆豆

生活随笔收集整理的這篇文章主要介紹了「python实现knn算法鸢尾花_Python学习之knn实现鸢尾花分类」，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

# K近鄰算法

# 導入相關庫文件

import numpy as np

import matplotlib.pyplot as plt

#import pandas as pd

from sklearn import neighbors, datasets

# Load the iris dataset bundled with scikit-learn; each row of X corresponds
# to the label at the same index in y.
dataset = datasets.load_iris()

# Keep only the first two feature columns (sepal length and sepal width) as X.
X = dataset.data[:, :2]

# Class labels used as y:
# 0 = Iris-setosa, 1 = Iris-versicolor, 2 = Iris-virginica.
y = dataset.target

# Author's rationale: no feature scaling is applied because both columns
# already lie in a similarly small range (inspect the printed statistics
# below to confirm this before relying on it).
attributes_dict = {0: "sepal_length", 1: "sepal_width"}

# Print max / min / mean for each selected feature column.
for attribute, attribute_name in attributes_dict.items():
    column = X[:, attribute]  # slice the column once instead of three times
    print("{} 最大值：{}".format(attribute_name, np.max(column)))
    print("{} 最小值：{}".format(attribute_name, np.min(column)))
    # round() keeps one digit after the decimal point for the mean.
    print("{} 平均值：{}".format(attribute_name, round(np.average(column), 1)))
    # NOTE(review): the original indentation was lost; the separator is
    # assumed to be printed once per attribute (inside the loop) -- confirm.
    print("-------------------------------------")

# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split

# Summary of the train_test_split arguments relevant here
# (example call: train_test_split(train_data, train_target, test_size=0.4,
#                                 random_state=0, stratify=train_target)):
#   train_data   -- the feature samples to split.
#   train_target -- the labels to split, aligned with train_data.
#   test_size    -- a float is the fraction of samples put in the test set;
#                   an int is the absolute number of test samples.
#   random_state -- seed for the shuffling. Any fixed integer (0 included)
#                   reproduces the same split on every run; only leaving it
#                   as None gives a different split each time.

# With two input arrays, train_test_split returns four arrays:
# the train/test halves of X followed by the train/test halves of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Fit a k-nearest-neighbours classifier on the training set.
from sklearn.neighbors import KNeighborsClassifier

# KNeighborsClassifier parameters, as summarised by the original author:
#   n_neighbors   -- number of neighbours to consult.
#   weights       -- 'uniform': every neighbour counts equally;
#                    'distance': neighbours are weighted by the inverse of
#                    their distance, so closer points have more influence;
#                    callable: receives an array of distances and returns an
#                    array of weights with the same shape.
#   algorithm     -- 'ball_tree' (BallTree), 'kd_tree' (KDTree),
#                    'brute' (brute-force search), or 'auto' (choose based on
#                    the data passed to fit).
#   p             -- which distance of the Minkowski family to use.
#   metric        -- the distance metric.
#   metric_params -- extra arguments for the metric.
#   n_jobs        -- number of parallel jobs for the neighbour search.

# metric='minkowski' with p=2 is the Euclidean distance.
# Author's note: k-NN has no real training phase; fit essentially just keeps
# the training data in memory. fit returns the estimator itself, so the
# construction and the fit can be chained.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# Predict the labels of the held-out test samples.
y_pred = classifier.predict(X_test)

# Build the confusion matrix for the test-set predictions.
from sklearn.metrics import confusion_matrix

# confusion_matrix(y_true, y_pred, labels=None, sample_weight=None):
#   y_true        -- the true class of each sample.
#   y_pred        -- the predicted class of each sample.
#   labels        -- the classes to include.
#   sample_weight -- per-sample weights.
#
# Correct predictions are counted on the diagonal; every off-diagonal entry
# is a count of misclassified samples.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('cm', cm)

# Visualise the training-set results: decision regions plus training points.
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train

# np.meshgrid turns the two axis ranges into coordinate matrices X1 / X2 that
# cover the plane with a grid (0.01 spacing, padded by 1 beyond the data range
# on every side).
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)

# Draw filled 2-D contours: the "height" at every grid point is the class the
# classifier predicts for that (sepal length, sepal width) pair.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
plt.contourf(
    X1,
    X2,
    classifier.predict(grid_points).reshape(X1.shape),
    alpha=0.75,
    cmap=ListedColormap(('red', 'green', 'blue')),
)

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# Overlay the training samples, one scatter call (and legend entry) per class.
for i, j in enumerate(np.unique(y_set)):
    # Author's note: since matplotlib 3.0.3 the scatter `c` argument is given
    # as a 2-D array of colours. The three classes are drawn in red, green and
    # blue; each entry below is an RGB triple.
    color_list = [[[1, 0, 0], [0, 1, 0], [0, 0, 1]][i]]

    # Boolean mask selecting the samples whose class is j, and how many there
    # are (count_nonzero returns a plain int, so the list repeat below is an
    # ordinary sequence repetition).
    mask = y_set == j
    count = np.count_nonzero(mask)

    # plt.scatter(x, y, c, marker, cmap, alpha, linewidths, edgecolors):
    #   x, y       -- coordinates of the points.
    #   c          -- colour or sequence of colours.
    #   marker     -- marker shape (a dot by default).
    #   cmap       -- a matplotlib.colors.Colormap.
    #   alpha      -- opacity in [0, 1], from fully transparent to opaque.
    #   linewidths -- width of the marker edge.
    #   edgecolors -- colour of the marker edge.
    # The mask picks the x / y coordinates of class j out of X_set; the colour
    # row is repeated so there is one RGB entry per plotted point.
    plt.scatter(X_set[mask, 0], X_set[mask, 1],
                c=color_list * count, label=j)

plt.title('K-NN (Training set)')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.legend()
plt.show()

# 可視化測試集結果