Implementing Decision Tree Classification in Python
For an introduction to decision trees, see: http://blog.csdn.net/fengbingchun/article/details/78880934
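At the heart of CART is the Gini index: at every node the algorithm tries each attribute/value pair as a candidate split and keeps the one whose two child groups have the lowest weighted Gini impurity. As a quick standalone illustration of that computation (the toy label lists below are made up, and the groups are simplified to bare label lists rather than full data rows as in the full program):

def gini(groups, classes):
    # weighted Gini impurity of a candidate split; mirrors gini_index() in the full code below
    n = sum(len(g) for g in groups)  # total samples across both child groups
    total = 0.0
    for g in groups:
        if not g:
            continue  # skip empty groups to avoid division by zero
        score = sum((g.count(c) / float(len(g))) ** 2 for c in classes)  # sum of squared class proportions
        total += (1.0 - score) * (len(g) / float(n))  # weight by relative group size
    return total

print(gini([[0, 0], [1, 1]], [0, 1]))  # 0.0 -> a perfect split
print(gini([[0, 1], [0, 1]], [0, 1]))  # 0.5 -> the worst possible two-class split

A Gini of 0 means each child group contains only one class, so smaller is better.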
A from-scratch Python implementation of the CART (Classification and Regression Trees) algorithm is given at https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/, using the Banknote Dataset; an introduction to this dataset is available at http://blog.csdn.net/fengbingchun/article/details/78624358. The code below makes slight changes to the original author's version so that it can be executed directly:
# reference: https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/
# http://zhuanlan.51cto.com/art/201702/531945.htm
# using CART (Classification and Regression Trees) for classification
# CART on the Bank Note dataset
from random import seed
from random import randrange
from csv import reader

# Load a CSV file
def load_csv(filename):
    file = open(filename, "r")
    lines = reader(file)
    dataset = list(lines)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))  # total number of samples
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size  # row[-1] is the last column of each row, i.e. the class label
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

# Select the best split point for a dataset
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))  # here class_values is [0, 1]
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):  # index ranges over the attributes: [0, 1, 2, 3]
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}  # each node is a dict

# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    tree = build_tree(train, max_depth, min_size)
    predictions = list()
    for row in test:
        prediction = predict(tree, row)
        predictions.append(prediction)
    return(predictions)

# Test CART on Bank Note dataset
seed(1)
# load and prepare data
filename = '../../../data/database/BacknoteDataset/data_banknote_authentication.csv'
dataset = load_csv(filename)
# convert string attributes to floats
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)  # dataset is a list of lists; all values are floats
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
Running the script prints the accuracy of each of the five cross-validation folds followed by the mean accuracy.
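Once the script has run, the same functions can be reused to classify a single new measurement. A minimal sketch (the four feature values below are made up for illustration, and the tree is rebuilt from the full dataset, which takes a while given the exhaustive split search in get_split):

# assumes the script above has already executed, so build_tree/predict and dataset exist
tree = build_tree(dataset, 5, 10)  # same max_depth=5, min_size=10 as above
sample = [3.6, 8.7, -2.8, -0.4]  # hypothetical variance/skewness/curtosis/entropy features
print('Predicted class: %d' % predict(tree, sample))  # prints the class label, 0 or 1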
GitHub: https://github.com/fengbingchun/NN_Test