(3) Naive Bayes and Spam Classification in Python


Study notes for the 2nd edition of Li Hang's Statistical Learning Methods (《统计学习方法》)

Key points:
  • Naive Bayes is a classification algorithm based on Bayes' theorem and the assumption of conditional independence between features
  • It is a generative model
  • Pros: simple logic, small time and space overhead
  • Cons: the conditional independence assumption may sacrifice some classification accuracy
  • The parameters of Naive Bayes can be estimated by maximum likelihood estimation or by Bayesian estimation
  • Bayes' formula: $P(Y=c_k \mid X=x) = \dfrac{P(X=x \mid Y=c_k)\,P(Y=c_k)}{\sum_k P(X=x \mid Y=c_k)\,P(Y=c_k)}$
  • Naive Bayes classifier: $y = \arg\max_{c_k} P(Y=c_k) \prod_j P(X^{(j)}=x^{(j)} \mid Y=c_k)$
  • Maximum likelihood estimate of the prior: $P(Y=c_k) = \dfrac{\sum_{i=1}^{N} I(y_i=c_k)}{N}$
  • Maximum likelihood estimate of the conditional probability: $P(X^{(j)}=a_{jl} \mid Y=c_k) = \dfrac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\, y_i=c_k)}{\sum_{i=1}^{N} I(y_i=c_k)}$
  • Bayesian estimate of the prior: $P_\lambda(Y=c_k) = \dfrac{\sum_{i=1}^{N} I(y_i=c_k) + \lambda}{N + K\lambda}$; when λ=1 this is called Laplace smoothing, and K is the number of classes of Y
  • Bayesian estimate of the conditional probability: $P_\lambda(X^{(j)}=a_{jl} \mid Y=c_k) = \dfrac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\, y_i=c_k) + \lambda}{\sum_{i=1}^{N} I(y_i=c_k) + S_j\lambda}$, where $S_j$ is the number of possible values of the j-th feature (a small numeric sketch of these smoothed estimates follows this list)
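To make the two Bayesian-estimate formulas concrete, here is a minimal numeric sketch with toy counts invented for illustration (λ=1, i.e. Laplace smoothing):

import numpy as np

# Invented toy data: N = 10 samples, K = 2 classes,
# one feature X^(1) taking S_1 = 3 possible values {0, 1, 2}.
y  = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])   # class labels
x1 = np.array([0, 1, 1, 2, 0, 0, 1, 2, 2, 2])   # feature values
lam, K, S1, N = 1.0, 2, 3, len(y)

# Prior: P_λ(Y=1) = (count(y=1) + λ) / (N + Kλ) = (4+1)/(10+2)
prior_1 = (np.sum(y == 1) + lam) / (N + K * lam)

# Conditional: P_λ(X=1 | Y=1) = (count(x=1, y=1) + λ) / (count(y=1) + S_1·λ) = (2+1)/(4+3)
cond_1 = (np.sum((x1 == 1) & (y == 1)) + lam) / (np.sum(y == 1) + S1 * lam)
print(prior_1, cond_1)  # 0.4166..., 0.4285...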
Spam classification code implementation:

This was implemented independently as a beginner, so some of the intermediate logic may be questionable...

Dataset: link: https://pan.baidu.com/s/1weD2iNagq9BokllVJoJNKg  extraction code: m4lf

STEP1: Data preparation
'''Data file paths'''
spam_path = "/Users/Downloads/ML-Test/实战/SVM垃圾邮件过滤器实战项目/spam_data3"
ham_path = "/Users/Downloads/ML-Test/实战/SVM垃圾邮件过滤器实战项目/ham_data3"
stop_path = '/Users/Downloads/ML-Test/实战/SVM垃圾邮件过滤器实战项目/chinese_stop_vocab.txt'



'''Split the dataset 80:20'''
import os
from sklearn.model_selection import train_test_split
all_spam = os.listdir(spam_path)
all_ham = os.listdir(ham_path)
spam_train, spam_test = train_test_split(all_spam, test_size=0.2)
ham_train, ham_test = train_test_split(all_ham, test_size=0.2)
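A small side note (my own addition): train_test_split shuffles randomly, so passing random_state makes the split reproducible across runs:

# Reproducible variant of the split above; random_state=42 is an arbitrary seed.
spam_train, spam_test = train_test_split(all_spam, test_size=0.2, random_state=42)
ham_train, ham_test = train_test_split(all_ham, test_size=0.2, random_state=42)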



'''Data processing'''
# Load the stop-word list (used later to filter tokens)
def get_stop_words(stop_path):
    stop_words = []
    with open(stop_path, 'rb') as f:
        for line in f.readlines():
            line = line.decode('utf-8','ignore').strip()
            stop_words.append(line)
    return stop_words
all_stop_words = get_stop_words(stop_path)


# Check whether a string contains at least one Chinese character
def check_chinese(words):
    for ch in words:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False
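A quick usage check with invented inputs:

print(check_chinese("Subject: hello"))  # False: no Chinese characters
print(check_chinese("会议通知"))          # True: contains CJK characters in \u4e00-\u9fff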


# Word segmentation with jieba
import jieba
def cut_words(words):
    vocabulary_list = jieba.cut(words,cut_all=False)
    return vocabulary_list
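A quick usage sketch (the sample sentence is invented): jieba.cut returns a generator, so we materialize it with list():

# Example segmentation of an invented sentence; the exact split depends on
# jieba's built-in dictionary.
print(list(cut_words("我们今天开会讨论发票问题")))
# e.g. ['我们', '今天', '开会', '讨论', '发票', '问题']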



'''Read the email contents'''
# The corpus contains tens of thousands of emails; here we cap both spam and ham at 600 each
def get_email_words(email_root, train_email_names, max_emails=600):
    all_email_word = []
    count = 0
    for train_email_name in train_email_names:
        if train_email_name == '.DS_Store':  # skip macOS metadata files
            continue
        if count >= max_emails:
            break
        email_full_path = email_root + "/" + train_email_name
        count += 1
        with open(email_full_path, 'rb') as f:
            each_email_word = []
            for line in f.readlines():
                line = line.strip().decode('gbk', 'ignore')
                if not check_chinese(line):  # skip lines without Chinese (headers, etc.)
                    continue
                all_vocabulary = cut_words(line)
                for vocabulary in all_vocabulary:
                    # drop stop words and non-Chinese tokens
                    if vocabulary in all_stop_words or not check_chinese(vocabulary):
                        continue
                    each_email_word.append(vocabulary)
            all_email_word.append(" ".join(each_email_word))
    return all_email_word


train_spam = get_email_words(spam_path,spam_train, max_emails=600)
train_ham = get_email_words(ham_path,ham_train, max_emails=600)
test_spam = get_email_words(spam_path,spam_test, max_emails=600)
test_ham = get_email_words(ham_path,ham_test, max_emails=600)
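A quick sanity check (my own addition) can confirm how many documents were actually read and what the processed text looks like:

# Optional: confirm document counts and inspect one processed email.
print(len(train_spam), len(train_ham), len(test_spam), len(test_ham))  # up to 600 each
print(train_spam[0][:100])  # first 100 characters of one segmented, space-joined email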
STEP2: Feature engineering - text vectorization
'''Build the feature matrix for the email data'''
# Treating each word's presence or absence as one feature is known as the set-of-words model; when a word can occur more than once in a document, the bag-of-words model is used instead, where each word may appear multiple times (see the comparison sketch after the feature_matrix function below)

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
def feature_matrix(vocab):
    select_feature = []
    select_feature_index = []
    cv = CountVectorizer()
    cv.fit(vocab)
    words_vect = cv.transform(vocab)
    words_vect = words_vect.toarray()
    words_matrix = pd.DataFrame(words_vect)
    words_table = cv.vocabulary_
    # Keep words whose total count across all training emails is at least 10 as features
    # (note: this sums term frequencies rather than counting the emails that contain the word)
    for word, index in words_table.items():  # items() yields the dict's (key, value) pairs
        if words_matrix[index].sum()>=10:
            select_feature.append(word)
            select_feature_index.append(index)
    # Rename the integer column indices to the corresponding words; inplace=True modifies the DataFrame in place
    words_matrix.rename(columns=dict(zip(select_feature_index,select_feature)),inplace=True)
    return words_matrix[select_feature]
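To illustrate the set-of-words vs. bag-of-words distinction mentioned above, here is a small comparison sketch (the two-document corpus is invented; CountVectorizer expects space-separated tokens, as produced in STEP1):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["发票 发票 转账", "转账 公司"]       # invented, pre-segmented documents
bag = CountVectorizer()                      # bag-of-words: per-document counts
print(bag.fit_transform(docs).toarray())     # e.g. [[0 2 1], [1 0 1]] for columns 公司/发票/转账
setw = CountVectorizer(binary=True)          # set-of-words: presence/absence only
print(setw.fit_transform(docs).toarray())    # e.g. [[0 1 1], [1 0 1]]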


# train_spam_matrix / train_ham_matrix hold the feature matrices of the spam / ham training emails; each integer is the number of times the corresponding dictionary word occurs in that document. Note that the two matrices are fit separately, so their vocabularies differ, which is why condition_func below checks column membership before looking a word up.


train_spam_matrix = feature_matrix(train_spam)
train_ham_matrix = feature_matrix(train_ham)

STEP3: Compute the conditional probabilities
'''Compute the conditional probabilities P(word | class)'''
# Each word feature has two values, present and absent, with P(present | spam) + P(absent | spam) = 1
# P(present | spam) is the probability that the word occurs in a spam document
# Under the set-of-words model this is easy: (number of spam emails containing the word) / (number of spam emails)
# But here we use the bag-of-words model, so I use (occurrences of the word) / (occurrences of the word + number of emails without it) as P(present | spam); I am not sure whether this reasoning is sound.




import numpy as np

def condition_func(feature_word, class_email):
    result = -100
    if class_email == "垃圾邮件":  # spam
        if feature_word in train_spam_matrix.columns:
            data_spam = train_spam_matrix[feature_word]
            # number of training emails in which the word does not appear
            num_0_spam = (data_spam == 0).astype(int).sum(axis=0)
            # Laplace smoothing
            result = (np.sum(data_spam)+1)/(np.sum(data_spam)+num_0_spam+2)
        else:
            # word not in the spam vocabulary; len(data_spam) == 600
            result = 1/600
    if class_email == "正常邮件":  # ham
        if feature_word in train_ham_matrix.columns:
            data_ham = train_ham_matrix[feature_word]
            num_0_ham = (data_ham == 0).astype(int).sum(axis=0)
            result = (np.sum(data_ham)+1)/(np.sum(data_ham)+num_0_ham+2)
        else:
            result = 1/600
    # np.log guards against underflow and turns the product in the formula into a sum,
    # which does not change the final decision
    return np.log(result)
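For comparison, the more conventional multinomial Naive Bayes estimate is P(word | class) = (count of the word in the class + 1) / (total word count in the class + vocabulary size). A minimal sketch of that alternative (my own addition, not the author's method), reusing the train_spam_matrix / train_ham_matrix DataFrames from above:

def condition_func_multinomial(feature_word, class_matrix):
    # class_matrix: one of train_spam_matrix / train_ham_matrix
    vocab_size = class_matrix.shape[1]        # size of this class's vocabulary
    total_words = class_matrix.values.sum()   # total word occurrences in the class
    word_count = class_matrix[feature_word].sum() if feature_word in class_matrix.columns else 0
    # Laplace-smoothed multinomial estimate; unseen words still get probability mass
    return np.log((word_count + 1) / (total_words + vocab_size))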

STEP4: Compute the prior probabilities
'''Compute the priors P(spam) and P(ham)'''

def prior_func(spam_train, ham_train):
    email_count_ham  = len(ham_train)
    email_count_spam = len(spam_train)
    email_total = email_count_ham + email_count_spam

    # Laplace-smoothed priors (λ=1, K=2 classes), matching the formulas above
    prior_ham = (email_count_ham+1) / (email_total+2)
    prior_spam = (email_count_spam+1) / (email_total+2)
    return np.log(prior_spam), np.log(prior_ham)


prior_spam, prior_ham = prior_func(train_spam, train_ham)
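Since STEP1 capped both classes at 600 training emails, the two smoothed priors are nearly identical and contribute little to the decision; a quick check:

# Both classes have up to 600 training documents, so each prior should be close to 0.5.
print(np.exp(prior_spam), np.exp(prior_ham))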
STEP5: Classify the emails
def classify_func(test_data):
    score_list = []
    for each_email in test_data:
        each_email = each_email.split(" ")
        word_conditions_spam = []
        word_conditions_ham = []
        for word in each_email:
            word_condition_spam = condition_func(word,"垃圾邮件")
            word_conditions_spam.append(word_condition_spam)
            word_condition_ham = condition_func(word,"正常邮件")
            word_conditions_ham.append(word_condition_ham)
        email_condition_spam = np.sum(word_conditions_spam)
        email_condition_ham = np.sum(word_conditions_ham)
        score_spam = email_condition_spam+prior_spam
        score_ham = email_condition_ham+prior_ham
        if score_spam > score_ham:  # classified as spam
            score_list.append(0)
        else:  # classified as ham
            score_list.append(1)
    return score_list
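As an optional cross-check (my own addition, not part of the original pipeline), scikit-learn's MultinomialNB can be trained on the same preprocessed text, fitting one shared vectorizer over both classes and assuming the train_spam / train_ham / test_spam / test_ham lists from STEP1:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

cv_all = CountVectorizer()
X_train = cv_all.fit_transform(train_spam + train_ham)   # one shared vocabulary
y_train = [0] * len(train_spam) + [1] * len(train_ham)   # 0 = spam, 1 = ham, as above

clf = MultinomialNB(alpha=1.0)   # alpha=1.0 corresponds to Laplace smoothing
clf.fit(X_train, y_train)

X_test = cv_all.transform(test_spam + test_ham)
y_test = [0] * len(test_spam) + [1] * len(test_ham)
print(clf.score(X_test, y_test))  # overall accuracy on the combined test set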
STEP6: Test the classification results
'''Test accuracy on spam emails'''
score_list = classify_func(test_spam)
ham_num = np.sum(score_list)
email_num = len(score_list)
right_num = email_num-ham_num
score = right_num/email_num
print(score)

Output:

0.9033333333333333
'''Test accuracy on ham emails'''
score_list = classify_func(test_ham)
right_num = np.sum(score_list)
email_num = len(score_list)
score = right_num/email_num
print(score)

Output:

0.89
