G++测试通过
#include<iostream>
#include<fstream>
#include <string>
#include <map>
#include <iterator>
using namespace std;

/// Counts double-byte characters in a raw byte string.
///
/// Any byte with its high bit set is treated as the lead byte of a two-byte
/// character (the layout used by GBK/GB2312); that byte and the one after it
/// form the key whose count is incremented in `counter`. Bytes without the
/// high bit are skipped.
///
/// @param text     Raw file contents.
/// @param counter  Receives one entry per distinct two-byte character.
/// @return Total number of two-byte characters found.
static int countDoubleByteChars(const string& text, map<string, int>& counter)
{
    int total = 0;
    const string::size_type size = text.size();
    for (string::size_type i = 0; i < size; ++i)
    {
        // Go through unsigned char so the high-bit test does not depend on
        // whether plain char is signed on this platform.
        if ((static_cast<unsigned char>(text[i]) & 0x80) == 0)
            continue;
        // A lead byte with nothing after it is a truncated character; stop
        // instead of reading past the end of the data.
        if (i + 1 >= size)
            break;
        ++counter[text.substr(i, 2)];
        ++total;
        ++i; // skip the trail byte that was just consumed
    }
    return total;
}

/// Reads "input.txt", counts the two-byte characters in it and prints the
/// total followed by the per-character frequency table.
int main()
{
    ifstream is("input.txt");
    if (!is)
    {
        cerr << "cannot open input.txt" << '\n';
        return 1;
    }

    // Read the whole stream into a string: no fixed-size buffer to overflow,
    // and no char-vs-EOF comparison that misfires on byte 0xFF.
    const string text((istreambuf_iterator<char>(is)), istreambuf_iterator<char>());
    is.close();

    map<string, int> counter;
    const int iChi = countDoubleByteChars(text, counter);

    cout << "汉字总数:" << iChi << '\n';
    cout << "字频:" << '\n';
    for (map<string, int>::const_iterator iter = counter.begin(); iter != counter.end(); ++iter)
    {
        cout << iter->first << ":" << iter->second << '\n';
    }
    return 0;
}
假设你的系统为 Windows,即中文环境编码为 gbk。看代码:
# -*- encoding: gbk -*-
def is_chinese(uchar):
    """Return True if ``uchar`` lies in the range U+4E00..U+9FA5 (inclusive).

    :param uchar: a unicode string, normally a single character; the check is
        a plain string comparison against the two bounds.
    :return: ``True`` when ``u'\\u4E00' <= uchar <= u'\\u9FA5'``, else ``False``.
    """
    return u'\u4E00' <= uchar <= u'\u9FA5'
def count_chinese_word(filepath, encoding):
    """Count how often each Chinese character occurs in a text file.

    :param filepath: path of the file to read.
    :param encoding: name of the codec used to decode the file (e.g. 'gbk').
    :return: dict mapping each character accepted by ``is_chinese`` to its
        number of occurrences. If the file cannot be opened or read
        (``IOError``), a message is printed and whatever was counted so far
        (an empty dict when the open itself failed) is returned.
    """
    # Local import: io.open decodes while reading and behaves the same on
    # Python 2 and Python 3, so no separate bytes.decode() step is needed.
    import io

    _dict = {}
    try:
        with io.open(filepath, 'r', encoding=encoding) as txt_file:
            for line in txt_file:
                for uchar in line:
                    if is_chinese(uchar):
                        _dict[uchar] = _dict.get(uchar, 0) + 1
    except IOError:
        # Single pre-formatted argument so the output is identical under the
        # Python 2 print statement and the Python 3 print function.
        print("文件 %s 不存在" % filepath)
    return _dict
if __name__ == '__main__':
    import json

    # Count the characters of the GBK-encoded sample file and dump the table.
    _dict = count_chinese_word('内容.txt', 'gbk')
    # ensure_ascii=False keeps the characters readable instead of \uXXXX
    # escapes. The former encoding="utf-8" argument is dropped: it only
    # affected byte-string input (the keys here are unicode) and is rejected
    # by Python 3's json.dumps.
    print(json.dumps(_dict, indent=4, ensure_ascii=False))
内容.txt:
运行:
统计字词的频率文本分析程序如下//编程语言为网络编程的php脚本语言
<?php
// Character n-gram frequency counter for a UTF-8 text file.
// Reads $filePath line by line, strips ASCII digits, letters, spaces and
// punctuation, collects every run of $chars consecutive characters, and
// prints the runs with their counts, most frequent first.

header("content-type:text/html;charset=utf-8");

$filePath = "txt.txt";
$chars = 2; // number of characters per counted unit
$wordArray = array();

$file = fopen($filePath, "r");
if ($file === false) {
    // Without this guard the read loop below would never see end-of-file.
    exit("Cannot open $filePath");
}

// fgets() returns false at end-of-file, which ends the loop without a
// separate feof() probe.
while (($rawLine = fgets($file)) !== false) {
    // Filter ASCII digits, letters, spaces and punctuation in one pass.
    $singleLine = preg_replace(
        '/[0-9a-zA-Z \'.,:;*?~`!@#$%^&+=\-)(<>{}\]\[\/\\\\"|]/',
        "",
        trim($rawLine)
    );

    // Split into whole UTF-8 characters so the window below never cuts a
    // multi-byte character in half, whatever its byte length.
    $letters = preg_split('//u', $singleLine, -1, PREG_SPLIT_NO_EMPTY);
    if ($letters === false) {
        continue; // line is not valid UTF-8; skip it
    }

    // Slide a window of $chars characters across the line; the inclusive
    // bound keeps the final window, and lines shorter than $chars yield none.
    $lastStart = count($letters) - $chars;
    for ($i = 0; $i <= $lastStart; $i++) {
        $wordArray[] = implode("", array_slice($letters, $i, $chars));
    }
}

// Close the file.
fclose($file);

// Count how often each unit occurs.
$wordArrayOut = array_count_values($wordArray);

// Sort by count, descending, keeping the unit => count association.
arsort($wordArrayOut);

// Output the result, one unit per line.
foreach ($wordArrayOut as $key => $value) {
    echo "$key $value<br />";
}
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)