《Python自然语言处理》第五章练习题答案_python

概述：这章主要内容涉及分词、词性标注和标注器训练、字典使用。因为中英文差别，所以在后面练习里尝试用中文数据来训练 ngram 标注器。首先导包：import nltk；from nltk.corpus import brown；from nltk.book import *；import jieba；import matplotlib.pyplot as plt。1. # nltk 词性标注……

这章主要内容涉及分词、词性标注和标注器训练、字典使用。
因为中英文差别，所以在后面练习里尝试用中文数据来训练ngram标注器。

首先导包

from collections import Counter

import jieba
import matplotlib.pyplot as plt
import nltk
from nltk.book import *
from nltk.corpus import brown

# NLTK's part-of-speech tagger cannot resolve the ambiguity in this headline.
text = nltk.word_tokenize('British left Waffles on Falkland Islands')
nltk.pos_tag(text)

# Print the tag of the first occurrence of 'contest' in the tagged Brown corpus.
tag_words = brown.tagged_words()
for (word, tag) in tag_words:
    if word == 'contest':
        print(tag)
        break  # only the first match is needed

# Tag a sentence in which 'wind' appears twice, to see what tag each occurrence gets.
wind_sentence = 'They wind back the clock,while we chase after the wind.'
wind_tokens = nltk.word_tokenize(wind_sentence)
nltk.pos_tag(wind_tokens)

# dict.update() copies every key/value pair of d2 into d1 in place;
# d2 itself is left unchanged.
d1 = {'a': 1, 'b': 2, 'c': 3}
d2 = {'d': 4, 'f': 5, 'g': 6}
d1.update(d2)
print(d1, d2)

# Show the contexts of 'go' and of 'went' in text1
# (presumably provided by the `from nltk.book import *` star import).
text1.concordance('go')
text1.concordance('went')

import re

# Train a unigram tagger on the whole Brown corpus, tag a passage that is not
# part of the corpus, and collect the words the tagger could not label.
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

passage = (
    "What needs to be clarified is that the fundamental purpose of China's development is to ensure that the Chinese people can live a better life and to benefit all humankind. "
    "Win-win cooperation is an important principle of China's development and a golden rule in China's external relations. "
    "China has no intention to interfere in the political system of the United States, nor challenge or replace its status and influence."
    "In the past few years, due to Washington's irrational suppression of China's legitimate rights and interests, China-US relations have encountered unprecedented difficulties. "
    "This situation should not continue any longer. "
    "The only right way is to follow the principles of non-conflict, non-confrontation, mutual respect and win-win cooperation."
)

# Replace commas and full stops with spaces so punctuation does not stay glued
# to the neighbouring word (raw string: no invalid escape sequences).
new_text = re.sub(r'[,.]', ' ', passage)
word_tags = unigram_tagger.tag(new_text.split())

# This tagger has no backoff, so a word it cannot tag comes back with tag None.
none_tag = [word for (word, tag) in word_tags if tag is None]
none_tag

没被标记的有拼写不规范的词、有连字符、新词

# Print the built-in documentation of nltk.AffixTagger.
help(nltk.AffixTagger)

用法：AffixTagger(train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False)

# Train an affix tagger on the Brown 'news' category and tag one of its sentences.
# The keyword is `categories`; any other spelling raises a TypeError.
brown_sents = brown.sents(categories='news')
brown_tagged_sents = brown.tagged_sents(categories='news')
# A negative affix_length selects a suffix, so the last three characters are used here.
affixtagger = nltk.AffixTagger(train=brown_tagged_sents, affix_length=-3, min_stem_length=2)
affixtagger.tag(brown_sents[2007])

# Train a bigram tagger on the complete Brown corpus and score it on the very
# same sentences, so the number measures fit to the training data only.
# NOTE(review): newer NLTK releases deprecate evaluate() in favour of accuracy();
# confirm against the installed version.
sents = brown.sents()
tag_sents = brown.tagged_sents()
baseline_tagger = nltk.BigramTagger(tag_sents)
baseline_tagger.evaluate(tag_sents)

# Tag a passage that is not part of the Brown corpus with the bigram tagger.
sent = (
    "They expressed their willingness to enhance cooperation or coordination in some specific areas. "
    "For instance, the two sides are committed to strengthening dialogue and cooperation in the field of climate change and will establish a joint working group on that subject. "
    "In the spirit of reciprocity and mutual benefit, the two sides will hold talks on facilitating activities of each other's diplomatic and consular missions and personnel, as well as on issues related to media reporters."
).split()
baseline_tagger.tag(sent)

# Score baseline_tagger against whatever brown_tagged_sents currently holds.
# NOTE(review): an earlier cell rebinds brown_tagged_sents to the 'news'
# category only; confirm which data set is intended here.
baseline_tagger.evaluate(brown_tagged_sents)

bigram用于新数据后得分会提高

# %-style string formatting: %i takes integers, %s takes any object (strings here).
dashed_date = "date:%i-%i-%i" % (2021, 3, 21)
slashed_date = "date:%s/%s/%s" % ('2021', '3', '21')
print(dashed_date)
print(slashed_date)

# Case-insensitive frequency of every word in the Brown corpus.
# Counter is a dict subclass, so fd_dic still supports .items(), indexing, etc.
words = brown.words()
fd_dic = Counter(w.lower() for w in words)
fd_dic

# List the (word, count) pairs from the most to the least frequent word.
def _count_of(pair):
    """Return the count stored in a (word, count) pair."""
    return pair[1]


sorted(fd_dic.items(), key=_count_of, reverse=True)

# Load Brown with the universal tagset and list the distinct tags it uses.
# The keyword is lower-case `tagset`.
words = brown.tagged_words(tagset='universal')
{tag for (_, tag) in words}

# Count regular (-s) plurals whose singular form is a known noun.
# Expects `words` to hold universal-tagset (word, tag) pairs.
n_words = {word for (word, tag) in words if tag == 'NOUN'}
n_dic = Counter(
    w[:-1]
    for w in brown.words()
    # Only strip a trailing 's'; stripping the last character of every token
    # would also count unrelated words such as 'cart' -> 'car'.
    if len(w) > 2 and w.endswith('s') and w[:-1] in n_words
)
n_dic.most_common()

# For every lower-cased word, count how many distinct tags it occurs with,
# then list the words from most to fewest tags.
cfd = nltk.ConditionalFreqDist((w.lower(), tag) for (w, tag) in words)
count_dic = {word: len(cfd[word]) for word in cfd.conditions()}
sorted(count_dic.items(), key=lambda item: item[1], reverse=True)

# Frequency of every tag in the Brown corpus (default tagset), most frequent first.
words = brown.tagged_words()
count_tag = Counter(tag for (_, tag) in words)
count_tag.most_common()

# Count which tags directly follow a NOUN (universal tagset).
# Walking over bigrams never reads past the end of the corpus, which
# indexing words[i + 1] would do if the last token were a noun.
# The name count_Tags is kept as-is because a later cell reads it.
words = brown.tagged_words(tagset='universal')
count_Tags = Counter(
    following_tag
    for (_, tag), (_, following_tag) in nltk.bigrams(words)
    if tag == 'NOUN'
)
count_Tags.most_common()

# Lookup tagger: map every Brown word to its most frequent tag, with no backoff.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate over the distinct words (the conditions) instead of every token;
# the resulting mapping is the same but each entry is computed once.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)

# Same lookup tagger as before, but unknown words fall back to the tag 'NN'.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate over the distinct words (the conditions) instead of every token.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
baseline_tagger.evaluate(brown_tagged_sents)

# Print each tag's share (in percent) of all counts stored in count_Tags.
total = sum(count_Tags.values())  # computed once instead of once per tag
for (k, v) in count_Tags.items():
    print(k, ":", (v / total) * 100)

# Percentage of word types that occur with more than one tag.
cfd = nltk.ConditionalFreqDist((w.lower(), tag) for (w, tag) in words)
count_dic = {
    word: len(cfd[word])
    for word in cfd.conditions()
    if len(cfd[word]) > 1
}
# Divide by the number of distinct words (conditions), not by the number of
# tokens, so that both sides of the ratio count word types.
print((len(count_dic) / len(cfd.conditions())) * 100, "%")

# Alphabetically sorted list of the distinct words tagged MD (modal) in Brown.
# De-duplicate first and sort last: converting a sorted list to a set would
# throw the ordering away again.
words = brown.tagged_words()
w_li = sorted({w.lower() for (w, t) in words if t == 'MD'})
print(w_li)

# Print three-word phrases of the form preposition + article + singular noun.
# Assumes `words` holds Brown-tagset pairs as returned by brown.tagged_words(),
# where those tags are IN, AT and NN. TODO confirm the intended tagset.
phrase_tags = ('IN', 'AT', 'NN')
for (first_word, first_tag), (second_word, second_tag), (third_word, third_tag) in nltk.trigrams(words):
    if (first_tag, second_tag, third_tag) == phrase_tags:
        # Print the whole phrase, not just its first (word, tag) pair.
        print(first_word, second_word, third_word)

# Collect the words that directly precede 'adore', 'love', 'like' or 'prefer'.
# Bigrams pair each word with its real predecessor, so the first token of the
# corpus is never matched against the last one (as words[i - 1] would at i == 0).
liking_verbs = ('adore', 'love', 'like', 'prefer')
ws = [
    previous_word.lower()
    for (previous_word, _), (word, _) in nltk.bigrams(words)
    if word.lower() in liking_verbs
]
set(ws)

# Split the tagged Brown sentences 70% / 30% into training and test sets.
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()
train_full_size = int(len(brown_tagged_sents) * 0.7)
train_sents = brown_tagged_sents[:train_full_size]
test_sents = brown_tagged_sents[train_full_size:]

# Unigram tagger without backoff: train on train_sents, score on test_sents.
tagger = nltk.UnigramTagger(train_sents)
tagger.evaluate(test_sents)

# Bigram tagger without backoff: train on train_sents, score on test_sents.
tagger = nltk.BigramTagger(train_sents)
tagger.evaluate(test_sents)

# Trigram tagger without backoff: train on train_sents, score on test_sents.
tagger = nltk.TrigramTagger(train_sents)
tagger.evaluate(test_sents)

多元标注器性能逐渐下降

25
加载人民日报2014语料

# Read the corpus file into a list of lines.
# The keyword is lower-case `encoding`; any other spelling raises a TypeError.
# NOTE(review): the path is machine-specific and must be adjusted before running.
with open(r'E:\laptop\研一14_corpus.txt', encoding='utf8') as f:
    corpus = f.readlines()

# Turn the corpus lines into sentences of (word, tag) pairs.
# Only the first 19,999 lines are used, matching the original `i < 20000` cap.
tagged_sents = []
for sent in corpus[:19999]:
    tagged_sent = []
    # split() without arguments also drops the trailing newline, so it can
    # never end up inside the last tag of a line.
    for w in sent.split():
        parts = w.split('/')
        # Keep only tokens of the exact form word/tag.
        if len(parts) == 2:
            tagged_sent.append(tuple(parts))
    tagged_sents.append(tagged_sent)

# Train a backoff chain on the first 70% of the sentences and score it on the
# rest: bigram tagger -> unigram tagger -> default tag 'n'.
size = int(len(tagged_sents) * 0.7)
train = tagged_sents[:size]
test = tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train, backoff=t0)
t2 = nltk.BigramTagger(train, backoff=t1)
t2.evaluate(test)

# Segment a Chinese passage with jieba and tag the resulting tokens with t2.
t2.tag(jieba.lcut('PFR语料库是对人民日报1998年上半年的纯文本语料进行了词语切分和词性标注制作而成的，严格按照人民日报的日期、版序、文章顺序编排的。文章中的每个词语都带有词性标记。'))

# Rebind t1 to a unigram tagger trained on `train` with no backoff.
# A t2 built earlier with backoff=t1 keeps the tagger object it was given.
t1 = nltk.UnigramTagger(train)

# Notebook users: run the `%matplotlib inline` magic before this cell if
# figures do not render inline.
def perform(data, test):
    """Train a unigram tagger on `data` and return its score on `test`.

    Words the unigram model cannot tag fall back to the default tag 'n'.
    """
    baseline_tagger = nltk.UnigramTagger(train=data, backoff=nltk.DefaultTagger('n'))
    return baseline_tagger.evaluate(test)


def display():
    """Plot tagger score against training-set size (in thousands of sentences)."""
    sizes = range(1, 16)
    test = tagged_sents[-5000:]
    # Keep the test sentences out of the training pool so that no training
    # slice can overlap the data it is scored on.
    train_pool = tagged_sents[:-5000]
    perfs = [perform(train_pool[:size * 1000], test) for size in sizes]
    plt.plot(sizes, perfs, '-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()


display()

# Strip the tags from the test sentences, keeping only the words, so the
# sentences can be tagged again from scratch.
orl_sent = [
    [word for (word, tag) in sent if word is not None and tag is not None]
    for sent in test
]

# Compare the tags predicted by t2 with the gold tags of the test sentences.
test_tags = [
    tag
    for sent in orl_sent
    for (word, tag) in t2.tag(sent)
    if word is not None and tag is not None
]
# The outer loop (sentences) must come first in the comprehension; with the
# clauses swapped, `sent` would refer to a stale variable from an earlier cell.
gold_tags = [
    tag
    for sent in test
    for (word, tag) in sent
    if word is not None and tag is not None
]
nltk.ConfusionMatrix(gold_tags, test_tags)

# Notebook users: run the `%matplotlib inline` magic before this cell if
# figures do not render inline.
def perform(data, test):
    """Train a unigram tagger on `data` and return its score on `test`.

    Words the unigram model cannot tag fall back to the default tag 'n'.
    """
    baseline_tagger = nltk.UnigramTagger(train=data, backoff=nltk.DefaultTagger('n'))
    return baseline_tagger.evaluate(test)


def display():
    """Plot tagger score against training-set size on a logarithmic x axis."""
    sizes = range(1, 16)
    test = tagged_sents[-5000:]
    # Keep the test sentences out of the training pool so that no training
    # slice can overlap the data it is scored on.
    train_pool = tagged_sents[:-5000]
    perfs = [perform(train_pool[:size * 1000], test) for size in sizes]
    plt.semilogx(sizes, perfs, '-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()


display()

# Rebuild the 70% / 30% split and the backoff chain
# (bigram -> unigram -> default tag 'n'), then score it on the test part.
size = int(len(tagged_sents) * 0.7)
train = tagged_sents[:size]
test = tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train, backoff=t0)
t2 = nltk.BigramTagger(train, backoff=t1)
t2.evaluate(test)

# Train a Brill (transformation-based) tagger and score it on the test part.
# train() is an instance method: the trainer must first be built from an
# initial tagger and a list of rule templates.
# NOTE(review): t2 as the initial tagger and the fntbl37 template set are
# choices made here; swap in others if a different setup is intended.
brill_trainer = nltk.BrillTaggerTrainer(
    initial_tagger=t2,
    templates=nltk.tag.brill.fntbl37(),
    trace=0,
)
t3 = brill_trainer.train(train, max_rules=200, min_score=2, min_acc=None)
t3.evaluate(test)

总结

以上是内存溢出为你收集整理的《Python自然语言处理》第五章练习题答案全部内容，希望文章能够帮你解决《Python自然语言处理》第五章练习题答案所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错，欢迎将内存溢出网站推荐给程序员好友。

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/langs/1188377.html

《Python自然语言处理》第五章练习题答案

发表评论

评论列表（0条）