Wikipedia: AdaBoost
Let us first introduce meta-algorithms: a meta-algorithm is a way of combining other algorithms, and AdaBoost is one such meta-algorithm. In classification we can combine different methods, be it kNN or naive Bayes; such combinations are called ensemble methods or meta-algorithms, and a combination may use different settings of a single algorithm or distribute the task across different algorithms. Boosting trains a sequence of S classifiers on the (re-weighted) original dataset: each new classifier is obtained by concentrating on the data that the previous classifiers misclassified. The classifiers are not weighted equally; each classifier's weight reflects how successful it was in its round of iteration. AdaBoost is just one version of boosting.
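Concretely, in each round AdaBoost assigns the new weak classifier a weight alpha computed from its weighted error rate epsilon, and then re-weights every training sample so that the next round focuses on the misclassified ones. These are the standard AdaBoost update rules, and the code below implements exactly these two formulas:

alpha = 0.5 * ln((1 - epsilon) / epsilon)

D_i <- D_i * exp(-alpha) / Z if sample i was classified correctly, and D_i <- D_i * exp(alpha) / Z if it was misclassified, where Z is a normalizing constant that makes the weights D sum to 1.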
The AdaBoost method relies on weak classifiers (a strong classifier is one that classifies quickly and accurately, whereas a weak classifier is error-prone, doing only slightly better than random guessing). Below we test with a single-level decision tree, also called a decision stump. We first run the training process on the data, improving classifier performance based on the errors, and then build the weak classifier from the decision stump.
The complete implementation is as follows:
# coding: utf-8
from numpy import *
def loadSimpData():
    # toy dataset: five 2-D points and their class labels
    datMat = matrix([[1. , 2.1],
                     [2. , 1.1],
                     [1.3, 1. ],
                     [1. , 1. ],
                     [2. , 1. ]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels
def loadDataSet(fileName):
    # the last tab-separated column is the label; the rest are features
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = []; labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# Decision stump construction
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):  # classify data against a threshold
    retArray = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray
def buildStump(dataArr, classLabels, D):
    dataMatrix = mat(dataArr); labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0; bestStump = {}; bestClasEst = mat(zeros((m, 1)))
    minError = inf  # initialize the minimum error to positive infinity
    for i in range(n):  # loop over every feature dimension
        rangeMin = dataMatrix[:, i].min(); rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # loop over all threshold candidates in this dimension
            for inequal in ['lt', 'gt']:  # try both 'less than' and 'greater than'
                threshVal = rangeMin + float(j) * stepSize
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr  # compute the weighted error rate
                # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError))
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
# AdaBoost training procedure based on decision stumps
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)  # initialize D so that every sample carries equal weight
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)  # build the best stump for the current D
        # print("D:", D.T)
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))  # classifier weight; max() guards against division by zero
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)  # store the stump in the ensemble list
        # print("classEst: ", classEst.T)
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)  # compute D for the next iteration
        D = multiply(D, exp(expon))
        D = D / D.sum()
        # accumulate the aggregate class estimate and compute the training error rate
        aggClassEst += alpha * classEst
        # print("aggClassEst: ", aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
# AdaBoost classification function: apply the ensemble of weak classifiers
def adaClassify(datToClass, classifierArr):
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])  # classify with the i-th stored stump
        aggClassEst += classifierArr[i]['alpha'] * classEst  # alpha-weighted vote
        print(aggClassEst)
    return sign(aggClassEst)
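As a quick sanity check, here is a minimal usage sketch on the toy dataset (assuming the listing above runs under Python 3 with NumPy available; the expected labels are inferred from where the two query points fall relative to the training data):

datMat, classLabels = loadSimpData()
# train an ensemble of at most 30 stumps on the toy data
classifierArr, aggClassEst = adaBoostTrainDS(datMat, classLabels, 30)
# classify two new points; (5, 5) should come out as +1 and (0, 0) as -1
print(adaClassify([[5, 5], [0, 0]], classifierArr))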