【NLP】HMM原理---维特比算法实现

【NLP】HMM原理---维特比算法实现,第1张

HMM原理

参考博客:
https://www.cnblogs.com/pinard/p/6945257.html
https://www.cnblogs.com/pinard/p/6991852.html

代码实现

下面用代码实现上述博客中的例子:

def hmm_viterbi(A, B, pi, O):
    """Return the most likely hidden-state path for an observation sequence.

    Viterbi decoding for a discrete hidden Markov model.

    Args:
        A: Square transition matrix; ``A[i, j]`` is used as the probability
            of moving from state ``i`` to state ``j``.
        B: Emission matrix; ``B[i, k]`` is used as the probability of state
            ``i`` emitting observation symbol ``k``.
        pi: Initial state distribution, one entry per state.
        O: Sequence of observation indices (column indices into ``B``).

    Returns:
        A list of plain ``int`` state indices, one per observation. An empty
        observation sequence yields an empty list.
    """
    n_steps = len(O)
    if n_steps == 0:
        return []

    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    pi = np.asarray(pi, dtype=float)
    n_states = len(A)

    # delta[t, j]: (rescaled) score of the best path ending in state j at t.
    delta = np.zeros((n_steps, n_states))
    # psi[t, j]: predecessor state on that best path; kept as integers so the
    # backtrace needs no float-to-int conversion.
    psi = np.zeros((n_steps, n_states), dtype=int)

    # 1. Initialisation at t = 0.
    delta[0] = pi * B[:, O[0]]

    # 2. Recursion over the remaining time steps.
    for t in range(1, n_steps):
        # scores[j, i] = delta[t-1, i] * A[i, j]
        scores = delta[t - 1] * A.T
        psi[t] = scores.argmax(axis=1)
        delta[t] = scores.max(axis=1) * B[:, O[t]]
        # Rescale by a positive constant: this leaves every argmax unchanged
        # but prevents the products from underflowing to 0 on long inputs.
        peak = delta[t].max()
        if peak > 0:
            delta[t] /= peak

    # 3. Most likely final state.
    path = [int(delta[-1].argmax())]

    # 4. Follow the back-pointers from the last step to the first.
    for t in range(n_steps - 1, 0, -1):
        path.append(int(psi[t, path[-1]]))
    path.reverse()
    return path

# The function above uses NumPy; import it here so this snippet runs on its own.
import numpy as np

# Example model: 3 hidden states and 2 observation symbols.
A = np.array([[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]])  # transitions
B = np.array([[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]])                 # emissions
pi = np.array([0.2, 0.4, 0.4])                                     # initial distribution
O = np.array([0, 1, 0])                                            # observation indices
# Show the decoded state path instead of discarding the return value.
print(hmm_viterbi(A, B, pi, O))
维特比算法分词代码:
import re
from collections import Counter

import numpy as np

class Hmm(object):
    """HMM sequence tagger trained by counting and decoded with Viterbi.

    The training file is read as UTF-8 text with one ``<token> <label>`` pair
    per line (fields separated by a single space) and sentences separated by
    blank lines.

    Attributes set by ``clean_data``: ``sents``, ``Q`` (sorted labels) and
    ``V`` (sorted tokens). Attributes set by ``train``: ``pi``, ``A``, ``B``.
    """

    def __init__(self, train_path):
        """Store the training file path and parse the file immediately."""
        self.train_path = train_path
        self.clean_data()

    def clean_data(self):
        """Parse the training file into sentences, labels and vocabulary.

        Blank lines only separate sentences, so a trailing newline or several
        consecutive blank lines do not produce empty tokens.

        Raises:
            ValueError: If a non-blank line has fewer than two fields, or the
                file contains no training pairs at all.
        """
        sents = []
        current = []
        with open(self.train_path, encoding='utf-8') as f:
            for line_no, raw in enumerate(f, 1):
                line = raw.rstrip("\n")
                if not line.strip():
                    # Sentence boundary: flush the sentence collected so far.
                    if current:
                        sents.append(current)
                        current = []
                    continue
                fields = line.split(" ")
                if len(fields) < 2:
                    raise ValueError(
                        f"{self.train_path}, line {line_no}: "
                        f"expected '<token> <label>', got {line!r}"
                    )
                current.append(fields)
        if current:
            sents.append(current)
        if not sents:
            raise ValueError(f"{self.train_path}: no training data found")

        self.sents = sents
        # Hidden-state set.
        self.Q = sorted({word[1] for sent in sents for word in sent})
        # Observation set.
        self.V = sorted({word[0] for sent in sents for word in sent})

    def train(self):
        """Estimate ``pi``, ``A`` and ``B`` from relative frequencies.

        ``pi`` and ``A`` are rounded to 4 decimal places; ``B`` is not
        rounded. Rows and columns follow the order of ``Q`` (states) and
        ``V`` (observations).
        """
        states = self.Q

        # 1. Initial state probabilities: label of the first token of each
        #    sentence.
        first_counts = Counter(sent[0][1] for sent in self.sents)
        n_sents = len(self.sents)
        self.pi = np.array([round(first_counts[q] / n_sents, 4) for q in states])

        # 2. Transition probabilities from adjacent label pairs. Pairs are
        #    counted as tuples so labels of any length stay unambiguous.
        transitions = Counter()
        for sent in self.sents:
            tags = [word[1] for word in sent]
            transitions.update(zip(tags, tags[1:]))
        outgoing = Counter()
        for (src, _dst), count in transitions.items():
            outgoing[src] += count
        # A state that is never followed by another label gets an all-zero
        # row instead of triggering a division by zero.
        self.A = np.array([
            [round(transitions[(q1, q2)] / outgoing[q1], 4) if outgoing[q1] else 0.0
             for q2 in states]
            for q1 in states
        ])

        # 3. Emission probabilities from (token, label) pair counts.
        emissions = Counter(
            (word[0], word[1]) for sent in self.sents for word in sent
        )
        state_counts = Counter(word[1] for sent in self.sents for word in sent)
        self.B = np.array([
            [emissions[(v, q)] / state_counts[q] for v in self.V]
            for q in states
        ])

    def predict(self, sent):
        """Return the most likely label string for ``sent``.

        Args:
            sent: Sequence of tokens (for example a string of characters);
                every token must occur in the training vocabulary ``V``.

        Returns:
            The labels of the best Viterbi path concatenated into one string,
            one label per token. Empty input yields an empty string.

        Raises:
            ValueError: If a token is not in the training vocabulary.
        """
        if len(sent) == 0:
            return ''

        # One dictionary lookup per token instead of a linear list search.
        vocab_index = {token: i for i, token in enumerate(self.V)}
        try:
            O = [vocab_index[token] for token in sent]
        except KeyError as err:
            raise ValueError(
                f"{err.args[0]!r} is not in the training vocabulary"
            ) from None

        n_steps = len(O)
        n_states = len(self.A)
        # delta[t, j]: (rescaled) score of the best path ending in state j.
        delta = np.zeros((n_steps, n_states))
        # psi[t, j]: predecessor state on that best path.
        psi = np.zeros((n_steps, n_states), dtype=int)

        # 1. Initialisation at t = 0.
        delta[0] = self.pi * self.B[:, O[0]]

        # 2. Recursion over the remaining time steps.
        for t in range(1, n_steps):
            # scores[j, i] = delta[t-1, i] * A[i, j]
            scores = delta[t - 1] * self.A.T
            psi[t] = scores.argmax(axis=1)
            delta[t] = scores.max(axis=1) * self.B[:, O[t]]
            # Rescaling by a positive constant keeps every argmax unchanged
            # while preventing underflow to 0 on long sentences.
            peak = delta[t].max()
            if peak > 0:
                delta[t] /= peak

        # 3. Most likely final state.
        path = [int(delta[-1].argmax())]

        # 4. Follow the back-pointers from the last step to the first.
        for t in range(n_steps - 1, 0, -1):
            path.append(int(psi[t, path[-1]]))
        path.reverse()

        return ''.join(self.Q[state] for state in path)

if __name__ == '__main__':
    # Demo: train on the corpus file, tag one sentence and print its words.
    text = '维特比算法是一个分词方法'
    train_path = 'test.txt'
    hmm = Hmm(train_path)
    hmm.train()
    label = hmm.predict(text)
    # A word is a run matching 'b' followed by one or more 'i', or a single
    # 'o'; label positions that match neither alternative are left out of the
    # printed list.
    # NOTE(review): assumes the corpus uses single-character labels where 'b'
    # begins a word, 'i' continues it and 'o' is a one-character word -
    # confirm against the training data.
    word_pattern = re.compile(r'bi+|o')
    print([text[word.start():word.end()] for word in word_pattern.finditer(label)])

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/571023.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-09
下一篇 2022-04-09

发表评论

登录后才能评论

评论列表(0条)

保存