本例使用golang实现词频统计。步骤:
(1)从文件中读取一篇文章。
(2)统计词频,按单词出现的频率从大到小进行排序。
(3)写入到文件中。
注:任何非英文字母的字符均视为单词分隔符(即等同于空格);统计时不区分大小写,单词统一按小写计数。
效率:使用本程序统计一篇约150万单词的文章,大约需要70ms。
1.核心代码:
// Package wordtest counts how often each English word occurs in a text file
// and writes a frequency report.
package wordtest

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"os"
	"runtime"
	"sort"
	"strings"
	"time"
)

// CountTestBase reads the text file at inputfilePath, counts how often each
// word occurs (ignoring case) and writes a report, ordered from the most to
// the least frequent word, to outputfilePath.
//
// The text is cut into chunks that are counted concurrently by one goroutine
// each. Read and write failures are passed to Checkerror, which panics.
func CountTestBase(inputfilePath string, outputfilePath string) {
	start := time.Now()

	fileData, err := ioutil.ReadFile(inputfilePath)
	Checkerror(err, "read file")

	// The number of worker goroutines is derived from the CPU count.
	workerCount := runtime.NumCPU()*2 - 1
	runtime.GOMAXPROCS(workerCount + 1)

	// Cut the text into chunks and count each chunk in its own goroutine.
	// The channel is buffered so no worker blocks on sending its result.
	parts := splitfileText(string(fileData), workerCount)
	ch := make(chan map[string]int, len(parts))
	for _, part := range parts {
		go countTest(part, ch)
	}

	// Merge exactly one result per chunk. Case is folded here so that
	// "The" and "the" are counted as the same word.
	totalWordsMap := make(map[string]int)
	for range parts {
		for k, v := range <-ch {
			totalWordsMap[strings.ToLower(k)] += v
		}
	}

	// Move the totals into a sortable list and order it.
	list := make(WordCountBeanList, 0, len(totalWordsMap))
	for k, v := range totalWordsMap {
		list = append(list, NewWordCountBean(k, v))
	}
	sort.Sort(list)

	elapsedMs := int64(time.Since(start) / time.Millisecond)
	fmt.Printf("time consume:%dms\n", elapsedMs)

	// Build the report in memory. Writes to a bytes.Buffer never return an
	// error, so the Fprintf results are not checked.
	wordsCount := list.totalCount()
	var data bytes.Buffer
	fmt.Fprintf(&data, "程序执行:%dms\n", elapsedMs)
	fmt.Fprintf(&data, "文章总单词数:%d\n\n", wordsCount)
	for _, v := range list {
		percent := 100.0 * float64(v.count) / float64(wordsCount)
		fmt.Fprintf(&data, "%s: %d,%3.2f%%\n", v.word, v.count, percent)
	}

	// The permission bits are only used when the file has to be created.
	err = ioutil.WriteFile(outputfilePath, data.Bytes(), os.ModePerm)
	Checkerror(err, "ioutil.WriteFile")
}

// isLetter reports whether b is an ASCII letter (A-Z or a-z).
func isLetter(b byte) bool {
	return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z')
}

// countTest counts the words in text and sends the resulting word-to-count
// map on ch. A word is a maximal run of ASCII letters; every other byte is a
// separator. Words are counted exactly as written, without case folding.
func countTest(text string, ch chan map[string]int) {
	wordMap := make(map[string]int)

	// wordStart is the byte index at which the current word began, or -1
	// while scanning separators. Scanning bytes is sufficient because every
	// byte of a multi-byte UTF-8 sequence is >= 0x80 and so never a letter.
	wordStart := -1
	for i := 0; i < len(text); i++ {
		if isLetter(text[i]) {
			if wordStart < 0 {
				wordStart = i
			}
			continue
		}
		if wordStart >= 0 {
			wordMap[text[wordStart:i]]++
			wordStart = -1
		}
	}
	// The text may end in the middle of a word.
	if wordStart >= 0 {
		wordMap[text[wordStart:]]++
	}

	ch <- wordMap
}

// splitfileText cuts fileText into n consecutive parts of roughly equal size
// and returns them in order; concatenating the parts yields fileText again.
//
// A cut is never placed inside a word: each nominal cut point is moved
// forward to the next non-letter byte, or to the end of the text. Parts may
// therefore be empty. n values below 1 are treated as 1.
func splitfileText(fileText string, n int) []string {
	if n < 1 {
		n = 1
	}
	length := len(fileText)
	parts := make([]string, n)
	lastPosition := 0
	for i := 0; i < n-1; i++ {
		position := length / n * (i + 1)
		// Skip to the end of the word the nominal cut point falls in,
		// without running past the end of the text.
		for position < length && isLetter(fileText[position]) {
			position++
		}
		parts[i] = fileText[lastPosition:position]
		lastPosition = position
	}
	// The last part takes whatever is left.
	parts[n-1] = fileText[lastPosition:]
	return parts
}

// Checkerror panics with msg followed by the error text when err is non-nil
// and does nothing otherwise.
func Checkerror(err error, msg string) {
	if err != nil {
		panic(msg + "," + err.Error())
	}
}

// Article section 2: the struct.
package wordtesttype WordCountBean struct { word string count int}func NewWordCountBean(word string,count int) *WordCountBean { return &WordCountBean{word,count}}type WordCountBeanList []*WordCountBeanfunc (List WordCountBeanList) Len() int { return len(List)}func (List WordCountBeanList) Less(i,j int) bool { if List[i].count > List[j].count { return true } else if List[i].count < List[j].count { return false } else { return List[i].word < List[j].word }}func (List WordCountBeanList) Swap(i,j int) { var temp *WordCountBean = List[i] List[i] = List[j] List[j] = temp}func (List WordCountBeanList) totalCount() int { totalCount := 0 for _,v := range List { totalCount += v.count } return totalCount}3.主函数:
package mainimport ( "WordsTest/wordtest")func main() { inputfilePath := "files/article.txt" outputfilePath := "files/result.txt" wordtest.CountTestBase(inputfilePath,outputfilePath)}总结
以上是内存溢出为你收集整理的Golang实现词频统计全部内容,希望文章能够帮你解决Golang实现词频统计所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)