英文文本:Hamlet,统计出现次数最多的 10 个英文单词
代码实现:
# Hamlet word-frequency count
from collections import Counter

# Punctuation characters that are turned into spaces before the text is split.
_HAMLET_PUNCTUATION = '~!@#$%^&*()_+-={}[],./:";<>?'


def getText(path="hamlet", encoding="utf-8"):
    """Return the contents of *path*, lower-cased, with punctuation blanked out.

    Args:
        path: File to read. Defaults to ``"hamlet"`` in the working directory.
        encoding: Text encoding used to decode the file.

    Returns:
        The file text in lower case, with every character listed in
        ``_HAMLET_PUNCTUATION`` replaced by a single space.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if read() fails.
    with open(path, "r", encoding=encoding) as source:
        txt = source.read()
    txt = txt.lower()  # fold case so "The" and "the" count as one word
    # One pass over the text instead of one .replace() call per symbol.
    blank_out = str.maketrans(dict.fromkeys(_HAMLET_PUNCTUATION, " "))
    return txt.translate(blank_out)


def print_hamlet_top_words(top_n=10):
    """Print the *top_n* most frequent words of the Hamlet text, most common first."""
    counts = Counter(getText().split())  # split() breaks on any whitespace run
    # most_common() returns at most top_n entries, so a text with fewer
    # distinct words no longer raises IndexError.
    for word, count in counts.most_common(top_n):
        print("{0:<10}{1:5}".format(word, count))


if __name__ == "__main__":
    print_hamlet_top_words()
中文文本:《三国演义》,统计人物出场次数
# Character-name frequency count for "Romance of the Three Kingdoms"
from collections import Counter

# Frequent two-character tokens that are not character names.
_THREE_KINGDOMS_EXCLUDES = {'将军', '却说', '荆州', '二人', '不可', '不能', '如此'}

# Alternative names and segmentation artefacts mapped to one canonical name.
_THREE_KINGDOMS_ALIASES = {
    '诸葛亮': '孔明',
    '孔明曰': '孔明',
    '关公': '关羽',
    '云长': '关羽',
    '玄德': '刘备',
    '玄德曰': '刘备',
    '孟德': '曹操',
    '丞相': '曹操',
}


def count_characters(words, excludes=_THREE_KINGDOMS_EXCLUDES):
    """Count occurrences of each token, merging aliases of the same person.

    Args:
        words: Iterable of string tokens (e.g. the output of ``jieba.lcut``).
        excludes: Tokens to drop from the result after counting.

    Returns:
        A ``Counter`` mapping each canonical token to its number of
        occurrences. Single-character tokens are ignored.
    """
    counts = Counter()
    for word in words:
        if len(word) == 1:
            continue  # single characters are never treated as names
        # Count under the canonical name; incrementing the raw token here
        # would leave the aliases unmerged.
        counts[_THREE_KINGDOMS_ALIASES.get(word, word)] += 1
    for word in excludes:
        # pop() with a default tolerates excluded words absent from the text.
        counts.pop(word, None)
    return counts


def print_three_kingdoms_top_names(path="Threekingdoms", top_n=10):
    """Segment the novel at *path* and print the *top_n* most frequent names."""
    # Imported here so count_characters stays usable without jieba installed.
    import jieba

    with open(path, "r", encoding="utf-8") as source:
        txt = source.read()
    counts = count_characters(jieba.lcut(txt))
    # most_common() returns at most top_n entries, so short inputs are safe.
    for word, count in counts.most_common(top_n):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    print_three_kingdoms_top_names()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)