原代码
def textParse(bigString):
import re
listOfTokens =re.split(r'\W*',bigString)
return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
docList=[]; classList=[]; fullText=[]
for i in range(1,25):
#❶ (以下 七行) 导入 并解 析 文本 文件
wordList = textParse(open('email/spam/%d.txt'%i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(open('email/spam/%d.txt'%i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList = createVocabList(docList)
trainingSet = list(range(26)); testSet=[]
#❷( 以下 四行) 随机 构建 训练 集
for i in range(10):
randIndex = int(random.uniform(0,len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
trainMat=[]; trainClasses =[]
for docIndex in trainingSet:
trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
errorCount = 0
#❸( 以下 四行) 对 测试 集 分类
for docIndex in testSet:
wordVector = setOfWords2Vec(vocabList,docList[docIndex])
if classifyNB(array(wordVector),p0V,p1V,pSpam)!= classList[docIndex]:
errorCount += 1
print('the error rate is:',float(errorCount)/len(testSet))
return wordList,docList,classList,fullText,vocabList,testSet,trainingSet
终端输出,解析结果是空的
>>> wordList,docList,classList,fullText,vocabList,testSet,trainingSet=bayes.spamTest()
the error rate is: 0.7
>>> wordList
[]
>>> docList
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
>>> classList
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
>>> fullText
[]