统计学习方法第五章作业:ID3/C4.5算法分类决策树、平方误差二叉回归树代码实现
生活随笔
收集整理的這篇文章主要介紹了
统计学习方法第五章作业:ID3/C4.5算法分类决策树、平方误差二叉回归树代码实现
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
ID3/C4.5算法分類決策樹
# ID3/C4.5 算法分類決策樹 — ID3/C4.5 classification decision tree
# (statistical learning methods, chapter 5 exercise)
import numpy as np
import math


class Node:
    """One node of the decision tree.

    feature_index: index of the feature tested on the edge into this node
                   (None for the root).
    value:         the feature value carried by that edge.
    label:         class label when this node is a leaf, otherwise None.
    child:         list of child Nodes (empty for a leaf).
    """

    def __init__(self, feature_index=None, value=None, label=None):
        self.feature_index = feature_index
        self.value = value
        self.child = []
        self.label = label


class C4_5:
    """Decision-tree classifier over categorical features.

    X:   training samples, each a list of categorical feature values.
    Y:   training class labels.
    c:   pre-pruning threshold: stop splitting when the best score < c.
    way: 'ID3' uses information gain, 'C45' uses the information-gain ratio.
    """

    def __init__(self, X, Y, c=0.1, way='ID3'):
        self.c = c
        self.root = Node()
        self.X = X
        self.Y = Y
        self.feature_num = len(X[0])
        self.label_num = len(Y)
        self.feature_set = list(range(self.feature_num))
        self.getac()
        self.way = way

    def getac(self):
        # Record the observed value set of every feature and of the labels.
        self.dict_x = {}
        self.dict_y = set(self.Y)
        for i in range(self.feature_num):
            self.dict_x[i] = {x[i] for x in self.X}

    @staticmethod
    def get_label(list_):
        # Majority class in list_.
        return max(list_, key=list_.count)

    @staticmethod
    def count_Y(Y):
        # Frequency count of each label in Y.
        dict_y = {}
        for i in Y:
            dict_y[i] = dict_y.get(i, 0) + 1
        return dict_y

    def experience_entropy(self, Y):
        # Empirical entropy H(D) of the label distribution in Y (base 2).
        dict_y = self.count_Y(Y)
        D = len(Y)
        return -sum(n / D * math.log(n / D, 2) for n in dict_y.values())

    def get_feature(self, X, Y, rest_x):
        """Choose the best splitting feature among the candidates rest_x.

        Returns (feature_index, score) where score is the information gain
        (ID3) or the information-gain ratio (C45) of that feature.
        """
        HD = self.experience_entropy(Y)
        Y = np.array(Y)
        X = np.array(X)
        scores = []
        for i in rest_x:
            cond = 0.0  # conditional entropy H(D|A)
            HAD = 0.0   # entropy of feature A itself, H_A(D) (C4.5 only)
            col = np.array([x[i] for x in X])
            for j in self.dict_x[i]:
                Di = int(sum(col == j))
                if Di == 0:
                    continue
                p = Di / len(col)
                # BUG fix: the original reset HAD to 0 inside this loop, so
                # the C4.5 gain ratio only used the last value's share.
                HAD -= p * math.log(p, 2)
                sub = 0.0
                for m in self.dict_y:
                    Dik = int(sum(Y[col == j] == m))
                    if Dik != 0:
                        sub += Dik / Di * math.log(Dik / Di, 2)
                cond -= p * sub
            gain = HD - cond
            if self.way == 'C45':
                gain = gain / HAD if HAD > 0 else 0.0
            scores.append(gain)
        best = max(scores)
        # BUG fix: index into rest_x, the list the scores were computed over
        # (the original indexed self.feature_set, which only worked while the
        # two happened to be the same mutated object).
        return rest_x[scores.index(best)], best

    def build_tree(self):
        """Build the tree recursively from (self.X, self.Y)."""

        def build_tree_(node, X, Y, rest_x):
            X = np.array(X)
            Y = np.array(Y)
            if len(set(Y)) == 1:
                node.label = Y[0]
                return
            # BUG fix: the original tested len(X[0]) == 0, which never holds
            # because feature columns are not removed; the real stop condition
            # is "no candidate features left".
            if not rest_x:
                node.label = self.get_label(list(Y))
                return
            spilt_feature, max_add = self.get_feature(X, Y, rest_x)
            if max_add < self.c:
                # BUG fix: majority vote over the current subset Y, not over
                # the whole training set self.Y.
                node.label = self.get_label(list(Y))
                return
            # BUG fix: give every branch its own candidate list instead of
            # destructively mutating the shared rest_x across siblings.
            child_rest = [f for f in rest_x if f != spilt_feature]
            col = np.array([x[spilt_feature] for x in X])
            for v in self.dict_x[spilt_feature]:
                child = Node(feature_index=spilt_feature, value=v)
                mask = col == v
                if mask.any():
                    build_tree_(child, X[mask], Y[mask], child_rest)
                else:
                    # No training sample reaches this branch: fall back to
                    # the parent's majority label (the original crashed here).
                    child.label = self.get_label(list(Y))
                node.child.append(child)

        build_tree_(self.root, self.X, self.Y, self.feature_set)

    def print_tree(self):
        # Pre-order dump of (feature_index, value, label, #children) per node.
        def pre_order(node):
            if node:
                print(node.feature_index, node.value, node.label,
                      len(node.child))
                for ch in node.child:
                    pre_order(ch)

        pre_order(self.root)

    def predict_single(self, X):
        """Predict the class label of a single sample X."""
        if len(X) != self.feature_num:
            raise IndexError
        node = self.root
        while node.child:
            for ch in node.child:
                if X[ch.feature_index] == ch.value:
                    node = ch
                    # BUG fix: the original used `continue`, which never left
                    # the for loop and could spin forever in the while loop.
                    break
            else:
                # No branch matches this feature value: stop descending.
                break
        # BUG fix: return the class label, not the edge value.
        return node.label

    def predict(self, X):
        """Predict one sample (1-D input) or a batch (2-D input)."""
        X = np.array(X)
        if len(X.shape) == 1:
            return self.predict_single(X)
        return [self.predict_single(x) for x in X]


def main():
    datasets = [['青年', '否', '否', '一般', '否'],
                ['青年', '否', '否', '好', '否'],
                ['青年', '是', '否', '好', '是'],
                ['青年', '是', '是', '一般', '是'],
                ['青年', '否', '否', '一般', '否'],
                ['中年', '否', '否', '一般', '否'],
                ['中年', '否', '否', '好', '否'],
                ['中年', '是', '是', '好', '是'],
                ['中年', '否', '是', '非常好', '是'],
                ['中年', '否', '是', '非常好', '是'],
                ['老年', '否', '是', '非常好', '是'],
                ['老年', '否', '是', '好', '是'],
                ['老年', '是', '否', '好', '是'],
                ['老年', '是', '否', '非常好', '是'],
                ['老年', '否', '否', '一般', '否']]
    X = [x[0:-1] for x in datasets]
    Y = [x[-1] for x in datasets]
    for i, j in zip(X, Y):
        print(i, j)
    C4_5_trainer = C4_5(X, Y)
    C4_5_trainer.build_tree()
    C4_5_trainer.print_tree()
    predict_single_x = [['中年', '是', '否', '一般'],
                        ['老年', '否', '否', '一般']]
    print(C4_5_trainer.predict(predict_single_x))


if __name__ == '__main__':
    main()

# ############result##################
# main() prints the training set, a pre-order dump of the tree, and finally
# the predictions for the two query samples: ['是', '否']

# 平方誤差二叉回歸樹 (squared-error binary regression tree — next section)
# 平方誤差二叉回歸樹 — least-squares binary regression tree (CART)
import numpy as np


class Node:
    """One node of the regression tree.

    feature_index / cut_value: split test; samples with
        x[feature_index] <= cut_value go to right_s, the rest to left_l.
    y_value: predicted value (mean of the leaf's targets); None for
        internal nodes.
    """

    def __init__(self, feature_index=None, cut_value=None, y_value=None,
                 left_l=None, right_s=None):
        self.feature_index = feature_index
        self.y_value = y_value
        self.cut_value = cut_value
        self.left_l = left_l
        self.right_s = right_s


class Cart_reg:
    """Squared-error binary regression tree.

    X: training samples (numeric features).
    Y: numeric targets.
    min_leave_data: stop splitting once a node holds this many samples
        or fewer.
    """

    def __init__(self, X, Y, min_leave_data=3):
        self.min_leave_data = min_leave_data
        self.root = Node()
        self.X = X
        self.Y = Y
        self.feature_num = len(X[0])

    def cauclate_loss(self, Y1, Y2):
        """Total squared error of a candidate split (sum over both halves)."""

        def sse(Y):
            # BUG fix: when Y1 was empty the original measured Y2's deviation
            # around np.mean(Y1) — the mean of an empty array, i.e. nan.
            # Perf fix: the original also recomputed np.mean inside the
            # comprehension, making each call O(n^2).
            if len(Y) == 0:
                return 0.0
            m = np.mean(Y)
            return float(sum((y - m) ** 2 for y in Y))

        return sse(Y1) + sse(Y2)

    def find_spilt(self, X, Y):
        """Exhaustive search for the (feature, cut value) with minimal loss.

        Ties are broken in favour of the first candidate found, matching the
        original min()/list.index() behaviour.
        """
        X = np.array(X)
        Y = np.array(Y)
        best_loss = None
        best_feature = None
        best_value = None
        for i in range(self.feature_num):
            col = np.array([x[i] for x in X])
            for j in col:
                loss_ = self.cauclate_loss(Y[col <= j], Y[col > j])
                if best_loss is None or loss_ < best_loss:
                    best_loss, best_feature, best_value = loss_, i, j
        return best_feature, best_value

    def build_tree(self):
        """Grow the tree recursively from (self.X, self.Y)."""

        def build_tree_(node, X, Y):
            X = np.array(X)
            Y = np.array(Y)
            if len(X) <= self.min_leave_data:
                node.y_value = np.mean(Y)
                return
            min_feature, min_value = self.find_spilt(X, Y)
            col = np.array([x[min_feature] for x in X])
            large_index = col > min_value
            small_index = col <= min_value
            # BUG fix: a degenerate split (all samples on one side) would
            # recurse on an empty set and produce a nan leaf; turn this node
            # into a leaf instead.
            if not large_index.any() or not small_index.any():
                node.y_value = np.mean(Y)
                return
            node.feature_index = min_feature
            node.cut_value = min_value
            node.left_l = Node()
            node.right_s = Node()
            build_tree_(node.left_l, X[large_index], Y[large_index])
            build_tree_(node.right_s, X[small_index], Y[small_index])

        build_tree_(self.root, self.X, self.Y)

    def print_tree(self):
        # Pre-order dump of (feature_index, cut_value, y_value) per node.
        def pre_order(node):
            if node:
                print(node.feature_index, node.cut_value, node.y_value)
                pre_order(node.left_l)
                pre_order(node.right_s)

        pre_order(self.root)

    def predict_single(self, X):
        """Predict the target value for a single sample X."""
        if len(X) != self.feature_num:
            raise IndexError
        node = self.root
        while node.y_value is None:
            # <= cut goes to the "small" subtree, matching build_tree_.
            if X[node.feature_index] <= node.cut_value:
                node = node.right_s
            else:
                node = node.left_l
        return node.y_value

    def predict(self, X):
        """Predict one sample (1-D input) or a batch (2-D input)."""
        X = np.array(X)
        if len(X.shape) == 1:
            return self.predict_single(X)
        return [self.predict_single(x) for x in X]


def main():
    X = [[1, 5, 7, 4, 8, 1, 2],
         [2, 3, 5, 5, 2, 7, 8],
         [1, 2, 3, 4, 5, 6, 7],
         [1, 2, 1, 2, 2, 3, 9],
         [2, 8, 9, 7, 0, 1, 4],
         [4, 8, 3, 4, 5, 6, 7],
         [4, 1, 3, 1, 5, 8, 0]]
    Y = [2, 6, 2, 5, 8, 3, 2]
    reg_t = Cart_reg(X, Y, 2)
    reg_t.build_tree()
    reg_t.print_tree()
    print(reg_t.predict_single([4, 1, 3, 1, 5, 8, 0]))
    print(reg_t.predict([[1, 5, 7, 4, 8, 1, 2],
                         [2, 3, 5, 5, 2, 7, 8],
                         [1, 2, 3, 4, 5, 6, 7]]))


if __name__ == '__main__':
    main()

# ############result##################
# main() prints the tree in pre-order, then 2.0 and [2.0, 5.5, 2.0].

# 總結 (summary follows)
以上是生活随笔為你收集整理的统计学习方法第五章作业:ID3/C4.5算法分类决策树、平方误差二叉回归树代码实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 统计学习方法第三章作业:一般k邻近、平衡
- 下一篇: 统计学习方法第六章作业:逻辑斯谛梯度下降