# 朴素贝叶斯代码有误

2018-03-30 20:04:32

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word-presence vector with a Bernoulli naive Bayes model.

    Each class is scored by its log posterior (up to a shared constant):
    the log prior, plus ``log(p)`` for every word whose entry equals 1,
    plus ``log(1 - p)`` for every word whose entry is not 1.

    Args:
        vec2Classify: Per-word feature vector; an entry equal to 1 marks
            the word as present, any other value marks it as absent.
        p0Vec: Per-word probabilities for class 0. Must support
            ``1 - p0Vec`` and element-wise ``log`` (presumably a numpy
            array, e.g. as produced by ``trainNB0`` -- confirm at caller).
        p1Vec: Per-word probabilities for class 1, same requirements.
        pClass1: Prior probability of class 1; class 0 gets
            ``1.0 - pClass1``.

    Returns:
        1 if the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # Mask of the words treated as absent (entry is not exactly 1).
    absent = [x != 1 for x in vec2Classify]

    def logScore(condProb, prior):
        # Present words contribute log(p); multiplication is element-wise.
        score = sum(vec2Classify * log(condProb)) + log(prior)
        # Absent words contribute log(1 - p).
        score += sum(absent * log(1 - condProb))
        return score

    p1 = logScore(p1Vec, pClass1)
    p0 = logScore(p0Vec, 1.0 - pClass1)
    return 1 if p1 > p0 else 0

def trainNB0(trainMatrix, trainCategory): # samples, sample labels; array types make element-wise arithmetic convenient

m = len(trainMatrix) # each document is one sample; the matrix is m x n: m documents with n features each (n = number of words in the vocabulary); a feature value of 0/1 marks whether that vocabulary word occurs

numWords = len(trainMatrix[0]) # vocabulary length

...

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word-presence vector with a Bernoulli naive Bayes model.

    Each class is scored by its log posterior (up to a shared constant):
    the log prior, plus ``log(p)`` for every word whose entry equals 1,
    plus ``log(1 - p)`` for every word whose entry is not 1.

    Args:
        vec2Classify: Per-word feature vector; an entry equal to 1 marks
            the word as present, any other value marks it as absent.
        p0Vec: Per-word probabilities for class 0. Must support
            ``1 - p0Vec`` and element-wise ``log`` (presumably a numpy
            array, e.g. as produced by ``trainNB0`` -- confirm at caller).
        p1Vec: Per-word probabilities for class 1, same requirements.
        pClass1: Prior probability of class 1; class 0 gets
            ``1.0 - pClass1``.

    Returns:
        1 if the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # Mask of the words treated as absent (entry is not exactly 1).
    absent = [x != 1 for x in vec2Classify]

    def logScore(condProb, prior):
        # Present words contribute log(p); multiplication is element-wise.
        score = sum(vec2Classify * log(condProb)) + log(prior)
        # Absent words contribute log(1 - p).
        score += sum(absent * log(1 - condProb))
        return score

    p1 = logScore(p1Vec, pClass1)
    p0 = logScore(p0Vec, 1.0 - pClass1)
    return 1 if p1 > p0 else 0

def trainNB0(trainMatrix, trainCategory):
    """Estimate Bernoulli naive Bayes parameters from labelled documents.

    For each class the per-word estimate is
    ``(1 + sum of that word's column over the class's documents) /
    (2 + number of documents in the class)``, i.e. a Laplace-smoothed
    fraction of documents containing the word when the features are 0/1.

    Args:
        trainMatrix: Sequence of m documents, each a length-n feature
            vector (n = vocabulary size). Entries are expected to be 0/1
            word-presence flags -- the smoothing above assumes this.
        trainCategory: Sequence of m labels; a label equal to 1 puts the
            document in class 1, any other value puts it in class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the smoothed per-word
        estimates for class 0 and class 1, and
        ``sum(trainCategory) / m`` as the prior of class 1.
    """
    m = len(trainMatrix)  # number of documents (samples)
    numWords = len(trainMatrix[0])  # vocabulary length
    pAbusive = sum(trainCategory) / float(m)

    # Laplace smoothing: numerators start at 1 and denominators at 2 so
    # that no estimate is exactly 0 or 1 for 0/1 features, keeping both
    # log(p) and log(1 - p) finite downstream.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2  # 2 + number of class-0 documents seen so far
    p1Denom = 2  # 2 + number of class-1 documents seen so far

    for i, doc in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += doc
            p1Denom += 1
        else:
            p0Num += doc
            # Count documents, not words: the denominator is the number
            # of documents in the class (the spot the original flagged
            # as modified).
            p0Denom += 1

    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive

1
0