# -*- coding: utf-8 -*-
import os, codecs
import jieba
from collections import Counter
def get_words(txt):
    """Print a frequency chart of the 100 most common words in *txt*.

    The text is segmented with ``jieba.cut``. Tokens of length 1 and the
    bare CRLF line terminator are ignored. Each remaining word is printed
    on its own line together with a bar of asterisks and its raw count.

    Args:
        txt: The text to analyse.

    Returns:
        None. The result is written to standard output.
    """
    c = Counter()
    for x in jieba.cut(txt):
        # A CRLF pair has length 2, so the length filter alone would let it
        # through; exclude it explicitly.
        if len(x) > 1 and x != '\r\n':
            c[x] += 1

    print('常用词频度统计结果')
    for k, v in c.most_common(100):
        # Left-pad words shorter than 5 characters (a negative repeat count
        # yields an empty string) and draw one '*' per three occurrences.
        print('%s%s %s %d' % (' ' * (5 - len(k)), k, '*' * int(v / 3), v))
if __name__ == '__main__':
    # Read the whole input file as UTF-8 and print its word-frequency chart.
    # codecs.open is kept deliberately: get_words() filters out '\r\n'
    # tokens, which presumably relies on line endings not being translated
    # on read -- TODO confirm before switching to the built-in open().
    with codecs.open('19d.txt', 'r', 'utf8') as f:
        txt = f.read()
    get_words(txt)
def statistics(astr):
    """Return the distinct tab-separated fields of one line, in order.

    Newline characters are removed from the last field *before* duplicates
    are dropped, so a final field such as ``"a\\n"`` is recognised as a
    repeat of an earlier ``"a"`` instead of being kept as a second entry.

    Args:
        astr: One line of text whose fields are separated by tab characters;
            it may or may not end with a newline.

    Returns:
        A list of the unique fields in order of first appearance. An empty
        or newline-only line yields ``['']``.
    """
    fields = astr.split("\t")
    # str.split always returns at least one element, so fields[-1] is safe.
    fields[-1] = fields[-1].replace("\n", "")
    # dict.fromkeys keeps first-seen order and drops repeats in O(n).
    return list(dict.fromkeys(fields))
if __name__ == "__main__":
    # Count, for every distinct field, the number of lines of test_data.txt
    # in which it appears (statistics() de-duplicates within each line).
    code_doc = {}
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        # Iterate the file lazily instead of loading every line up front.
        for ln in fs:
            for t in statistics(ln):
                code_doc[t] = code_doc.get(t, 0) + 1

    # Report in first-seen order: "<field> <count>".
    for key, count in code_doc.items():
        print(key + ' ' + str(count))
# Sharing is welcome; when reposting, please credit the source: 内存溢出
# Comments (0)