python 决策树 math库 c45算法
生活随笔
收集整理的這篇文章主要介紹了
python 决策树 math库 c45算法
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
2019獨角獸企業重金招聘Python工程師標準>>>
每周一搏,提升自我。
這段時間對python的應用,對python的理解越來越深。摸索中修改網上實例代碼,有了自己的理解。
c45是ID3算法的升級版,比ID3高級。個人建議,用CART算法,感覺比C45好。
下面是c45代碼,其中顯示決策樹結構的代碼,下篇博文發布。
#!/usr/bin/python
# coding:utf-8
"""C4.5 decision tree (gain-ratio variant of ID3).

Rewritten for Python 3 and fixed:
- the file-based loader was also named ``createDataSet`` and was silently
  shadowed by the toy-dataset function below; renamed to
  ``createDataSetFromFile``;
- ``majorityCnt`` returned ``max(classCount)`` (lexicographically largest
  label) instead of the most frequent label;
- ``classify`` could return an unbound local when no branch matched, and
  used the Python-2-only ``dict.keys()[0]``;
- pickle files are now opened in binary mode and closed via ``with``;
- ``createTree`` no longer mutates the caller's ``labels`` list.
"""

import sys
from math import log


def createDataSetFromFile(trainDataFile):
    """Load a dataset from a comma-separated file.

    Each line is expected to have 11 comma-separated columns: column 0 is
    the class label (moved to the end of each row), columns 1-10 are the
    features, matching the ``labels`` list below.

    Returns (dataSet, labels).
    """
    print(trainDataFile)
    dataSet = []
    try:
        # `with` guarantees the handle is closed (the original leaked it).
        with open(trainDataFile) as fin:
            for line in fin:
                line = line.strip('\n')   # drop the trailing newline
                cols = line.split(',')    # comma-separated fields
                # class label (col 0) goes last so rows line up with labels
                row = cols[1:11] + [cols[0]]
                dataSet.append(row)
    except Exception:
        # original used a bare except:; Exception still reports bad paths
        # or malformed rows without swallowing KeyboardInterrupt/SystemExit
        print('Usage xxx.py trainDataFilePath')
        sys.exit()
    labels = ['cip1', 'cip2', 'cip3', 'cip4',
              'sip1', 'sip2', 'sip3', 'sip4', 'sport', 'domain']
    print('dataSetlen', len(dataSet))
    return dataSet, labels


def calcShannonEntOfFeature(dataSet, feat):
    """Shannon entropy (base 2) of column ``feat``; feat=-1 is the class."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        value = featVec[feat]
        labelCounts[value] = labelCounts.get(value, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Rows where column ``axis`` equals ``value``, with that column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis] + featVec[axis + 1:]
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Index of the feature with the highest C4.5 gain ratio, or -1.

    -1 means no feature gave a strictly positive gain ratio (e.g. all
    features identical while classes differ).
    """
    numFeatures = len(dataSet[0]) - 1          # last column is the class
    baseEntropy = calcShannonEntOfFeature(dataSet, -1)
    bestInfoGainRate = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEntOfFeature(subDataSet, -1)
        infoGain = baseEntropy - newEntropy
        # C4.5: normalize the gain by the split information (IV)
        iv = calcShannonEntOfFeature(dataSet, i)
        if iv == 0:
            continue                           # single-valued feature
        infoGainRate = infoGain / iv
        if infoGainRate > bestInfoGainRate:
            bestInfoGainRate = infoGainRate
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Most frequent class label in ``classList``.

    Bug fix: the original returned ``max(classCount)``, i.e. the
    lexicographically largest *label*, ignoring the counts entirely.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    return max(classCount, key=classCount.get)


def createTree(dataSet, labels):
    """Recursively build the decision tree as nested dicts.

    Tree shape: {featureLabel: {featureValue: subtree-or-classLabel}}.
    """
    labels = labels[:]   # copy: the original del'd from the caller's list
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                    # pure node -> leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)          # features exhausted
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # features identical but classes differ: fall back to first class
        return classList[0]
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]                  # each branch gets its own copy
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def createDataSet():
    """Toy dataset: weapon (0 rifle / 1 machine gun), bullets (0 few / 1
    many), blood (0 low / 1 high); class is 'fight' or 'run'."""
    dataSet = [[1, 1, 0, 'fight'],
               [1, 0, 1, 'fight'],
               [1, 0, 1, 'fight'],
               [1, 0, 1, 'fight'],
               [0, 0, 1, 'run'],
               [0, 1, 0, 'fight'],
               [0, 1, 1, 'run']]
    labels = ['weapon', 'bullet', 'blood']
    return dataSet, labels


def printData(myData):
    """Print the dataset one row per line."""
    for item in myData:
        print('%s' % (item,))


def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` with a tree built by createTree.

    Returns the class label, or None when no branch matches the test
    vector (the original left ``classLabel`` unbound in that case).
    """
    classLabel = None
    firstStr = next(iter(inputTree))   # py3-safe replacement for keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def storeTree(inputTree, filename):
    """Pickle the tree to ``filename`` (binary mode is required on py3)."""
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a tree previously saved by storeTree."""
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)


def main():
    data, label = createDataSet()
    myTree = createTree(data, label)
    print(myTree)
    # Plotting lives in showTree.py (see the module below); it is optional,
    # so a missing module no longer crashes the run before the tree prints.
    try:
        import showTree as show
        show.createPlot(myTree)
    except ImportError:
        print('showTree.py not available; skipping plot')


if __name__ == '__main__':
    main()

# The showTree.py module referenced above follows:
#!/usr/bin/python
# coding:utf-8
"""showTree.py -- render a nested-dict decision tree with matplotlib.

Fixed for Python 3: ``dict.keys()`` is a non-indexable view, so every
``myTree.keys()[0]`` became ``next(iter(myTree))``. Also fixed the
``maxDepthh`` typo, removed an unused local, and dropped a stray ``?``
character left behind by copy/paste.
"""

import matplotlib.pyplot as plt

# node and arrow styles shared by all plotting helpers
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def createPlot(inTree):
    """Entry point: draw ``inTree`` (nested dicts as built by createTree)."""
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)  # no ticks
    # layout bookkeeping stored as function attributes, as in the original
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node with an arrow from its parent."""
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    """Write the branch label at the midpoint of the parent-child edge."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString,
                        va="center", ha="center", rotation=30)


def getNumLeafs(myTree):
    """Count leaf nodes (non-dict values) in the nested-dict tree."""
    numLeafs = 0
    firstStr = next(iter(myTree))    # py3-safe replacement for keys()[0]
    secondDict = myTree[firstStr]
    for key in secondDict:
        if isinstance(secondDict[key], dict):
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Number of decision levels from this node down to the deepest leaf."""
    maxDepth = 0                     # fixed typo: was ``maxDepthh``
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict:
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively lay out and draw ``myTree`` below ``parentPt``."""
    numLeafs = getNumLeafs(myTree)   # width of this subtree, in leaves
    firstStr = next(iter(myTree))
    cntrPt = (plotTree.xOff
              + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD   # go down a level
    for key in secondDict:
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD   # back up a level
轉載于:https://my.oschina.net/wangzonghui/blog/1617580
總結
以上是生活随笔為你收集整理的python 决策树 math库 c45算法的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【Altium Designer】Dat
- 下一篇: RecycleView的正确打开方式