# 标签 (tags): —
# -*- coding: utf-8 -*-
"""ID3 decision tree — Machine Learning in Action, chapter 3 (study notes).

Fixes relative to the original blog code:
- smart quotes replaced with ASCII quotes (the original was a SyntaxError);
- ``from numpy import *`` removed: numpy was unused, and its wildcard
  shadowed ``math.log`` so ``log(prob, 2)`` raised TypeError at runtime;
- Python 3 compatibility (``dict.items``, non-indexable key views,
  binary-mode pickle files, ``with`` for file handles);
- ``splitDataSet`` now removes the column at ``axis`` positionally instead
  of deleting the first occurrence of ``value`` anywhere in the row.
"""
from math import log
import operator


def createDataSet():
    """Return the toy (fish?) dataset and its two feature labels."""
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Shannon entropy (base 2) of the class labels in the last column.

    Higher entropy means the class labels are more mixed/disordered.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = count / float(numEntries)
        shannonEnt -= prob * log(prob, 2)  # requires math.log (2-arg form)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows where featVec[axis] == value, with that column removed.

    axis:  index of the feature to split on
    value: feature value a row must have to be kept
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # BUG FIX: the original used list.remove(value), which deletes
            # the FIRST cell equal to `value` anywhere in the row (possibly a
            # different feature, or the class label). Slice out the column at
            # `axis` positionally instead.
            retDataSet.append(featVec[:axis] + featVec[axis + 1:])
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Index of the feature with the highest information gain.

    Returns -1 when no split improves on the base entropy.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Unique values this feature takes across the dataset.
        uniqueVals = {example[i] for example in dataSet}
        # Expected entropy after splitting on feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Most frequent class label in classList (ties broken arbitrarily)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Python 3 fix: dict.iteritems() no longer exists; use items().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    NOTE: mutates `labels` (the chosen feature name is deleted); pass a
    copy if the caller still needs the original list afterwards.
    """
    classList = [example[-1] for example in dataSet]
    # Stop condition 1: every sample has the same class.
    if len(set(classList)) == 1:
        return classList[0]
    # Stop condition 2: no features left — fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subLabels = labels[:]  # copy so sibling branches stay independent
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    """Walk inputTree to classify testVec.

    Returns None when testVec holds a feature value the tree never saw
    (the original left `classLabel` undefined and raised NameError).
    """
    # Python 3 fix: dict.keys() is a view and cannot be indexed.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def storeTree(inputTree, filename):
    """Serialize the tree to `filename` with pickle."""
    import pickle
    # Python 3 fix: pickle requires a binary-mode file.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled tree. SECURITY: only unpickle files you trust."""
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# 决策树 (chap3) Machine Learning In Action 学习笔记
# (Decision trees, chapter 3 — "Machine Learning in Action" study notes)
# 标签 (tags): —
# 原文地址 (original post): http://www.cnblogs.com/woaielf/p/5511164.html