Import the required libraries:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
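Note that word_tokenize and pos_tag rely on NLTK data packages that are not bundled with the library itself. If they have not been installed yet, a one-time download is needed; the resource names below are the standard ones for current NLTK releases (newer versions may also ask for variants such as punkt_tab):

import nltk
nltk.download('punkt')                        # tokenizer models used by word_tokenize
nltk.download('averaged_perceptron_tagger')   # default tagger used by pos_tag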
Code:
def getText():
    # read the comment file and normalize it to lowercase
    txt = open('comment.text', 'r', encoding='utf-8').read()
    txt = txt.lower()
    # replace punctuation with spaces so tokenization only sees words
    for ch in '`~!\'@#$%^&*()_+-={}[];":,./<>?\\|':
        txt = txt.replace(ch, " ")
    return txt

txt = getText()
words = pos_tag(word_tokenize(txt))   # list of (token, tag) tuples
# print(words)
counts = {}
for word in words:
    counts[word[1]] = counts.get(word[1], 0) + 1   # count occurrences of each POS tag
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
print("POS tags found in the text and their counts:")
for pos, count in items:
    print("{0:<10}{1:>5}".format(pos, count))
A lookup feature can be added on top of this:
b = ""   # initialized so the final check does not fail if the loop never runs
a = input("Do you want to look up the words for a specific tag? (Y/N) ")
while a in ['Y', 'y']:
    b = input("Enter the POS tag to look up (type all to list the available tags): ")
    while b in ['all', 'All', 'ALL']:
        print(list(counts.keys()))
        b = input("Enter the POS tag to look up (type all to list the available tags): ")
    if b in counts.keys():
        # print every (token, tag) pair whose tag matches the query
        for word in words:
            if word[1] == b:
                print(word, end=" ")
        print()
        b = input("Continue looking up tags? (Y/N) ")
        if b in ['N', 'n']:
            break
    else:
        print("Invalid input, please try again!")
        a = input("Do you want to look up the words for a specific tag? (Y/N) ")
if a in ['N', 'n'] or b in ['N', 'n']:
    print("Thanks for using the program, exiting.")
else:
    print("Unrecognized input, exiting!")
The result is a two-column listing of each POS tag found in the text together with its count, ordered from most to least frequent.