# The first machine learning algorithm -- kNN
# Import the scientific computing package NumPy and the operator module.
from numpy import *  # noqa: F401,F403 -- the rest of the file uses bare NumPy names (array, tile, zeros, shape)
import operator


def creatDataSet():
    """Return a tiny hand-made training set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is the list of class labels ('A' or
        'B'), one per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: Input vector to classify.
        dataSet: Training sample set as a NumPy array, one sample per row.
        labels: Label vector; ``labels[i]`` is the class of row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` training samples
        closest to ``inX`` (Euclidean distance).

    Example (using the sample data from ``creatDataSet``)::

        group, labels = creatDataSet()
        print(classify0([0, 0], group, labels, 3))
    """
    # Euclidean distance from inX to every training row.  Broadcasting
    # subtracts inX from each row without building a tiled copy of it.
    diffMat = dataSet - inX
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistanceIndicies = distances.argsort()

    # Tally the labels of the k closest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistanceIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Most frequent label first.  dict.iteritems() exists only on Python 2;
    # sorting the keys by their count works on every version and, because
    # sorted() is stable, breaks ties the same way.
    sortedClassCount = sorted(classCount, key=classCount.get, reverse=True)
    return sortedClassCount[0]
def file2matrix(filename):
    """Prepare data: parse a tab-separated text file into NumPy form.

    Every line is split on tab characters; its first three fields become
    one row of the feature matrix and its last field, converted to ``int``,
    becomes the class label of that row.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` -- an ``N x 3`` NumPy array
        with the first three columns of every line, and a list holding the
        integer found in the last column of every line.
    """
    # The context manager closes the file even if parsing raises later on.
    with open(filename) as fr:
        arrayOLines = fr.readlines()

    # One zero-filled row per line of the file; filled in below.
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # strip() drops the trailing newline before splitting on tabs.
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))  # -1: last column
    return returnMat, classLabelVector


if __name__ == "__main__":
    # Load the dating data only when run as a script, so that importing this
    # module does not require 'datingTestSet.txt' to be present.
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
# Analyse the data: draw a scatter plot with matplotlib.
import matplotlib
import matplotlib.pyplot as plt

if __name__ == "__main__":
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Plot the second and third columns of datingDataMat.  The class labels,
    # scaled by 15, are passed as both the third and fourth positional
    # arguments of scatter (marker size and colour).
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               15.0 * array(datingLabels), 15.0 * array(datingLabels))
    # show must be *called*; a bare `plt.show` only names the function and
    # never displays the figure.
    plt.show()
def autoNorm(dataSet):
    """Prepare data: rescale every feature column to the range 0 to 1.

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``.  NumPy broadcasting applies the per-column vectors to
    every row, so no explicit ``tile()`` to the data's shape is needed.

    Args:
        dataSet: NumPy array of samples, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` -- the normalised data
        (same shape as ``dataSet``), the per-column ``max - min`` and the
        per-column minimum.
    """
    # Axis 0 takes the minimum / maximum down each column, not along rows.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # A constant column has range 0 and would produce 0/0 = NaN.  Adding the
    # boolean mask turns exactly those zero divisors into 1, so such a column
    # normalises to 0; `ranges` itself is returned unmodified.
    divisor = ranges + (ranges == 0)
    normDataSet = (dataSet - minVals) / divisor
    return normDataSet, ranges, minVals


if __name__ == "__main__":
    # Normalise the loaded dating data when run as a script.
    normMat, ranges, minVals = autoNorm(datingDataMat)
def datingClassTest(hoRatio=0.1, filename='datingTestSet.txt', k=3):
    """Test the algorithm: run the classifier end to end on hold-out data.

    The file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``.  The first ``int(m * hoRatio)`` rows are held out as the
    test set and each of them is classified with ``classify0`` against the
    remaining rows.  One line is printed per test sample, followed by the
    overall error rate.

    Args:
        hoRatio: Fraction of the samples, taken from the top of the file,
            to hold out for testing.
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ZeroDivisionError: If the hold-out set is empty, i.e.
            ``int(m * hoRatio) == 0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    # Only the normalised matrix is needed here; ranges and minimums are not.
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples; the rest are training data.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is %f" % (errorCount / float(numTestVecs)))