#-*- coding:UTF-8 -*-
from numpy import *
def loadDataSet():
    """Return the built-in toy corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenized posts (each a list of word strings) and ``classVec`` is the
        list of labels for the posts at the same positions: 1 marks abusive
        text, 0 marks normal speech.
    """
    # Each entry pairs a label (1 = abusive, 0 = normal) with the post tokens.
    labelledPosts = [
        (0, ["my","dog","has","flea","problems","help","please"]),
        (1, ["maybe","not","take","him","to","dog","park","stupid"]),
        (0, ["my","dalmation","is","so","cute","I","love","him"]),
        (1, ["stop","posting","stupid","worthless","garbage"]),
        (0, ["mr","licks","ate","my","steak","how","to","stop","him"]),
        (1, ["quit","buying","worthless","dog","food","stupid"]),
    ]
    postingList = [tokens for _, tokens in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def createVocabList(dataSet):
    """Collect every distinct word that occurs anywhere in *dataSet*.

    Args:
        dataSet: An iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word exactly once. The list is built
        from a set, so the order of the words is unspecified.
    """
    # Union of all documents; duplicates collapse inside the set.
    uniqueWords = set().union(*dataSet)
    return list(uniqueWords)


def setOfWords2Vec(vocabList,inputSet):
    """Convert a tokenized document into a word-count vector over *vocabList*.

    Despite the "set of words" name, every occurrence of a word increments
    its slot, so the result holds occurrence counts rather than 0/1 flags.

    Args:
        vocabList: List of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ints with ``len(vocabList)`` entries. Words that are not in
        the vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> position table once instead of scanning the list
    # twice for every token. setdefault keeps the FIRST position of a
    # repeated vocabulary word, matching what list.index() would return.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            # Single-argument call form: prints the same text on Python 2
            # and is valid syntax on Python 3.
            print("the word:%s is not in my Vocabulary!" % word)
        else:
            returnVec[position] += 1
    return returnVec


def trainNBO(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word-count vectors.

    Args:
        trainMatrix: Sequence of per-document word-count vectors, all of the
            same length (one slot per vocabulary word).
        trainCategory: Sequence of labels aligned with ``trainMatrix``; a
            label equal to 1 marks an abusive document, anything else is
            counted as class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the per-word log conditional
        probabilities for class 0 and class 1, and the fraction of training
        documents labelled abusive.
    """
    docCount = len(trainMatrix)          # number of training documents
    vocabSize = len(trainMatrix[0])      # number of vocabulary slots
    # Share of the training documents that are abusive.
    pAbusive = sum(trainCategory) / float(docCount)

    # Accumulators indexed by class (0 = normal, 1 = abusive). Word counts
    # start at 1 and totals at 2.0, so a word never seen in a class still
    # gets a non-zero ratio before the log is taken.
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for docIndex in range(docCount):
        label = 1 if trainCategory[docIndex] == 1 else 0
        docVector = trainMatrix[docIndex]
        wordCounts[label] += docVector
        wordTotals[label] += sum(docVector)

    # Work in log space so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for a word-count vector.

    Args:
        vec2Classify: Word-count vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0, as returned by
            ``trainNBO``.
        p1Vec: Per-word log-probabilities for class 1, as returned by
            ``trainNBO``.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Each score is the count-weighted sum of log-probabilities plus the
    # log of the class prior.
    logLikelihoodAbusive = sum(vec2Classify * p1Vec)
    logLikelihoodNormal = sum(vec2Classify * p0Vec)
    scoreAbusive = logLikelihoodAbusive + log(pClass1)
    scoreNormal = logLikelihoodNormal + log(1.0-pClass1)
    return 1 if scoreAbusive > scoreNormal else 0

def testNB():
    """Train on the toy corpus and print the predicted class of a sample post.

    Builds the vocabulary and word-count vectors from ``loadDataSet()``,
    trains with ``trainNBO``, classifies one hard-coded post with
    ``classifyNB`` and prints the post together with its predicted label.

    Returns:
        None. The result is only written to stdout.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # One word-count vector per training post.
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNBO(array(trainMat), array(listClasses))
    testEntry = ["quit","buying","worthless","dog","food","stupid"]
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    predicted = classifyNB(thisDoc, p0V, p1V, pAb)
    # Single-argument call form: prints the same text as the old
    # `print a, b, c` statement on Python 2 and is valid on Python 3.
    print("%s classified as: %s" % (testEntry, predicted))

# Run the demo unconditionally: this call is not guarded by
# `if __name__ == "__main__"`, so it also executes when the module is imported.
testNB()