(3) Naive Bayes and Spam Classification in Python


Study notes for the 2nd edition of Li Hang's Statistical Learning Methods (《统计学习方法》)

Key points:
  • Naive Bayes is a classification algorithm based on Bayes' theorem and the assumption of conditional independence between features
  • It is a generative model
  • Pros: simple logic, small time and space overhead
  • Cons: the conditional independence assumption may sacrifice some classification accuracy
  • The parameters of Naive Bayes can be estimated by maximum likelihood estimation or by Bayesian estimation
  • Bayes' formula: $P(Y=c_k \mid X=x) = \dfrac{P(X=x \mid Y=c_k)\,P(Y=c_k)}{\sum_k P(X=x \mid Y=c_k)\,P(Y=c_k)}$
  • Naive Bayes classifier: $y = \arg\max_{c_k} P(Y=c_k) \prod_j P(X^{(j)}=x^{(j)} \mid Y=c_k)$
  • Maximum likelihood estimate of the prior: $P(Y=c_k) = \dfrac{\sum_{i=1}^{N} I(y_i=c_k)}{N}$
  • Maximum likelihood estimate of the conditional probability: $P(X^{(j)}=a_{jl} \mid Y=c_k) = \dfrac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\, y_i=c_k)}{\sum_{i=1}^{N} I(y_i=c_k)}$
  • Bayesian estimate of the prior: $P_\lambda(Y=c_k) = \dfrac{\sum_{i=1}^{N} I(y_i=c_k) + \lambda}{N + K\lambda}$; when λ=1 this is called Laplace smoothing, and K is the number of classes of Y
  • Bayesian estimate of the conditional probability: $P_\lambda(X^{(j)}=a_{jl} \mid Y=c_k) = \dfrac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\, y_i=c_k) + \lambda}{\sum_{i=1}^{N} I(y_i=c_k) + S_j\lambda}$, where $S_j$ is the number of possible values of the j-th feature (a small numeric sketch of these smoothed estimates follows this list)
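To make the two Bayesian-estimate formulas concrete, here is a minimal numeric sketch with toy counts invented for illustration (λ=1, i.e. Laplace smoothing):

import numpy as np

# Invented toy data: N = 10 samples, K = 2 classes,
# one feature X^(1) taking S_1 = 3 possible values {0, 1, 2}.
y  = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])   # class labels
x1 = np.array([0, 1, 1, 2, 0, 0, 1, 2, 2, 2])   # feature values
lam, K, S1, N = 1.0, 2, 3, len(y)

# Prior: P_λ(Y=1) = (count(y=1) + λ) / (N + Kλ) = (4+1)/(10+2)
prior_1 = (np.sum(y == 1) + lam) / (N + K * lam)

# Conditional: P_λ(X=1 | Y=1) = (count(x=1, y=1) + λ) / (count(y=1) + S_1·λ) = (2+1)/(4+3)
cond_1 = (np.sum((x1 == 1) & (y == 1)) + lam) / (np.sum(y == 1) + S1 * lam)
print(prior_1, cond_1)  # 0.4166..., 0.4285...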
Spam classification code implementation:

This was implemented independently as a beginner, so some of the intermediate logic may be questionable...

Dataset: link: https://pan.baidu.com/s/1weD2iNagq9BokllVJoJNKg  extraction code: m4lf

STEP1: Data preparation
'''Data file paths'''
spam_path = "/Users/Downloads/ML-Test/实战/SVM垃圾邮件过滤器实战项目/spam_data3"
ham_path = "/Users/Downloads/ML-Test/实战/SVM垃圾邮件过滤器实战项目/ham_data3"
stop_path = '/Users/Downloads/ML-Test/实战/SVM垃圾邮件过滤器实战项目/chinese_stop_vocab.txt'



'''Split the dataset 80:20'''
import os
from sklearn.model_selection import train_test_split
all_spam = os.listdir(spam_path)
all_ham = os.listdir(ham_path)
spam_train, spam_test = train_test_split(all_spam, test_size=0.2)
ham_train, ham_test = train_test_split(all_ham, test_size=0.2)
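A small side note (my own addition): train_test_split shuffles randomly, so passing random_state makes the split reproducible across runs:

# Reproducible variant of the split above; random_state=42 is an arbitrary seed.
spam_train, spam_test = train_test_split(all_spam, test_size=0.2, random_state=42)
ham_train, ham_test = train_test_split(all_ham, test_size=0.2, random_state=42)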



'''Data processing'''
# Load the stop-word list (used later to filter tokens)
def get_stop_words(stop_path):
    stop_words = []
    with open(stop_path, 'rb') as f:
        for line in f.readlines():
            line = line.decode('utf-8','ignore').strip()
            stop_words.append(line)
    return stop_words
all_stop_words = get_stop_words(stop_path)


# Check whether a string contains at least one Chinese character
def check_chinese(words):
    for ch in words:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False
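A quick usage check with invented inputs:

print(check_chinese("Subject: hello"))  # False: no Chinese characters
print(check_chinese("会议通知"))          # True: contains CJK characters in \u4e00-\u9fff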


# Word segmentation with jieba
import jieba
def cut_words(words):
    vocabulary_list = jieba.cut(words,cut_all=False)
    return vocabulary_list
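A quick usage sketch (the sample sentence is invented): jieba.cut returns a generator, so we materialize it with list():

# Example segmentation of an invented sentence; the exact split depends on
# jieba's built-in dictionary.
print(list(cut_words("我们今天开会讨论发票问题")))
# e.g. ['我们', '今天', '开会', '讨论', '发票', '问题']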



'''Read the email contents'''
# The corpus contains tens of thousands of emails; here we cap both spam and ham at 600 each
def get_email_words(email_root, train_email_names, max_emails=600):
    all_email_word = []
    count = 0
    for train_email_name in train_email_names:
        if train_email_name == '.DS_Store':  # skip macOS metadata files
            continue
        if count >= max_emails:
            break
        email_full_path = email_root + "/" + train_email_name
        count += 1
        with open(email_full_path, 'rb') as f:
            each_email_word = []
            for line in f.readlines():
                line = line.strip().decode('gbk', 'ignore')
                if not check_chinese(line):  # skip lines without Chinese (headers, etc.)
                    continue
                all_vocabulary = cut_words(line)
                for vocabulary in all_vocabulary:
                    # drop stop words and non-Chinese tokens
                    if vocabulary in all_stop_words or not check_chinese(vocabulary):
                        continue
                    each_email_word.append(vocabulary)
            all_email_word.append(" ".join(each_email_word))
    return all_email_word


train_spam = get_email_words(spam_path,spam_train, max_emails=600)
train_ham = get_email_words(ham_path,ham_train, max_emails=600)
test_spam = get_email_words(spam_path,spam_test, max_emails=600)
test_ham = get_email_words(ham_path,ham_test, max_emails=600)
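A quick sanity check (my own addition) can confirm how many documents were actually read and what the processed text looks like:

# Optional: confirm document counts and inspect one processed email.
print(len(train_spam), len(train_ham), len(test_spam), len(test_ham))  # up to 600 each
print(train_spam[0][:100])  # first 100 characters of one segmented, space-joined email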
STEP2: Feature engineering - text vectorization
'''Build the feature matrix for the email data'''
# Treating each word's presence or absence as one feature is known as the set-of-words model; when a word can occur more than once in a document, the bag-of-words model is used instead, where each word may appear multiple times (see the comparison sketch after the feature_matrix function below)

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
def feature_matrix(vocab):
    select_feature = []
    select_feature_index = []
    cv = CountVectorizer()
    cv.fit(vocab)
    words_vect = cv.transform(vocab)
    words_vect = words_vect.toarray()
    words_matrix = pd.DataFrame(words_vect)
    words_table = cv.vocabulary_
    # Keep words whose total count across all training emails is at least 10 as features
    # (note: this sums term frequencies rather than counting the emails that contain the word)
    for word, index in words_table.items():  # items() yields the dict's (key, value) pairs
        if words_matrix[index].sum()>=10:
            select_feature.append(word)
            select_feature_index.append(index)
    # Rename the integer column indices to the corresponding words; inplace=True modifies the DataFrame in place
    words_matrix.rename(columns=dict(zip(select_feature_index,select_feature)),inplace=True)
    return words_matrix[select_feature]
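To illustrate the set-of-words vs. bag-of-words distinction mentioned above, here is a small comparison sketch (the two-document corpus is invented; CountVectorizer expects space-separated tokens, as produced in STEP1):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["发票 发票 转账", "转账 公司"]       # invented, pre-segmented documents
bag = CountVectorizer()                      # bag-of-words: per-document counts
print(bag.fit_transform(docs).toarray())     # e.g. [[0 2 1], [1 0 1]] for columns 公司/发票/转账
setw = CountVectorizer(binary=True)          # set-of-words: presence/absence only
print(setw.fit_transform(docs).toarray())    # e.g. [[0 1 1], [1 0 1]]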


# train_spam_matrix / train_ham_matrix hold the feature matrices of the spam / ham training emails; each integer is the number of times the corresponding dictionary word occurs in that document. Note that the two matrices are fit separately, so their vocabularies differ, which is why condition_func below checks column membership before looking a word up.


train_spam_matrix = feature_matrix(train_spam)
train_ham_matrix = feature_matrix(train_ham)

STEP3: Compute the conditional probabilities
'''Compute the conditional probabilities P(word | class)'''
# Each word feature has two values, present and absent, with P(present | spam) + P(absent | spam) = 1
# P(present | spam) is the probability that the word occurs in a spam document
# Under the set-of-words model this is easy: (number of spam emails containing the word) / (number of spam emails)
# But here we use the bag-of-words model, so I use (occurrences of the word) / (occurrences of the word + number of emails without it) as P(present | spam); I am not sure whether this reasoning is sound.




import numpy as np

def condition_func(feature_word, class_email):
    result = -100
    if class_email == "垃圾邮件":  # spam
        if feature_word in train_spam_matrix.columns:
            data_spam = train_spam_matrix[feature_word]
            # number of training emails in which the word does not appear
            num_0_spam = (data_spam == 0).astype(int).sum(axis=0)
            # Laplace smoothing
            result = (np.sum(data_spam)+1)/(np.sum(data_spam)+num_0_spam+2)
        else:
            # word not in the spam vocabulary; len(data_spam) == 600
            result = 1/600
    if class_email == "正常邮件":  # ham
        if feature_word in train_ham_matrix.columns:
            data_ham = train_ham_matrix[feature_word]
            num_0_ham = (data_ham == 0).astype(int).sum(axis=0)
            result = (np.sum(data_ham)+1)/(np.sum(data_ham)+num_0_ham+2)
        else:
            result = 1/600
    # np.log guards against underflow and turns the product in the formula into a sum,
    # which does not change the final decision
    return np.log(result)
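For comparison, the more conventional multinomial Naive Bayes estimate is P(word | class) = (count of the word in the class + 1) / (total word count in the class + vocabulary size). A minimal sketch of that alternative (my own addition, not the author's method), reusing the train_spam_matrix / train_ham_matrix DataFrames from above:

def condition_func_multinomial(feature_word, class_matrix):
    # class_matrix: one of train_spam_matrix / train_ham_matrix
    vocab_size = class_matrix.shape[1]        # size of this class's vocabulary
    total_words = class_matrix.values.sum()   # total word occurrences in the class
    word_count = class_matrix[feature_word].sum() if feature_word in class_matrix.columns else 0
    # Laplace-smoothed multinomial estimate; unseen words still get probability mass
    return np.log((word_count + 1) / (total_words + vocab_size))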

STEP4: Compute the prior probabilities
'''Compute the priors P(spam) and P(ham)'''

def prior_func(spam_train, ham_train):
    email_count_ham  = len(ham_train)
    email_count_spam = len(spam_train)
    email_total = email_count_ham + email_count_spam

    # Laplace-smoothed priors (λ=1, K=2 classes), matching the formulas above
    prior_ham = (email_count_ham+1) / (email_total+2)
    prior_spam = (email_count_spam+1) / (email_total+2)
    return np.log(prior_spam), np.log(prior_ham)


prior_spam, prior_ham = prior_func(train_spam, train_ham)
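Since STEP1 capped both classes at 600 training emails, the two smoothed priors are nearly identical and contribute little to the decision; a quick check:

# Both classes have up to 600 training documents, so each prior should be close to 0.5.
print(np.exp(prior_spam), np.exp(prior_ham))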
STEP5: Classify the emails
def classify_func(test_data):
    score_list = []
    for each_email in test_data:
        each_email = each_email.split(" ")
        word_conditions_spam = []
        word_conditions_ham = []
        for word in each_email:
            word_condition_spam = condition_func(word,"垃圾邮件")
            word_conditions_spam.append(word_condition_spam)
            word_condition_ham = condition_func(word,"正常邮件")
            word_conditions_ham.append(word_condition_ham)
        email_condition_spam = np.sum(word_conditions_spam)
        email_condition_ham = np.sum(word_conditions_ham)
        score_spam = email_condition_spam+prior_spam
        score_ham = email_condition_ham+prior_ham
        if score_spam > score_ham:  # classified as spam
            score_list.append(0)
        else:  # classified as ham
            score_list.append(1)
    return score_list
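As an optional cross-check (my own addition, not part of the original pipeline), scikit-learn's MultinomialNB can be trained on the same preprocessed text, fitting one shared vectorizer over both classes and assuming the train_spam / train_ham / test_spam / test_ham lists from STEP1:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

cv_all = CountVectorizer()
X_train = cv_all.fit_transform(train_spam + train_ham)   # one shared vocabulary
y_train = [0] * len(train_spam) + [1] * len(train_ham)   # 0 = spam, 1 = ham, as above

clf = MultinomialNB(alpha=1.0)   # alpha=1.0 corresponds to Laplace smoothing
clf.fit(X_train, y_train)

X_test = cv_all.transform(test_spam + test_ham)
y_test = [0] * len(test_spam) + [1] * len(test_ham)
print(clf.score(X_test, y_test))  # overall accuracy on the combined test set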
STEP6: Test the classification results
'''Test accuracy on spam emails'''
score_list = classify_func(test_spam)
ham_num = np.sum(score_list)
email_num = len(score_list)
right_num = email_num-ham_num
score = right_num/email_num
print(score)

Output:

0.9033333333333333
'''Test accuracy on ham emails'''
score_list = classify_func(test_ham)
right_num = np.sum(score_list)
email_num = len(score_list)
score = right_num/email_num
print(score)

Output:

0.89
