Golang实现词频统计

Golang实现词频统计,第1张

概述:本例使用 Golang 实现词频统计。步骤:(1)从文件中读取一篇文章。(2)统计词频,按单词出现的频率从大到小进行排序。(3)写入到文件中。注:任何非英文字母的符号均认为是单词分隔符(即等同于空格)。效率:使用本程序统计一篇 150 万单词的文章,大约需要 70ms。核心代码(package wordtest)见下文。

本例使用golang实现词频统计。步骤:

(1)从文件中读取一篇文章。

(2)统计词频,按单词出现的频率从大到小进行排序。

(3)写入到文件中。

注:任何非英文字母的符号均认为是单词分隔符(即等同于空格)。

效率:使用本程序统计一篇约 150 万(1.5×10^6)单词的文章,大约需要 70ms。

1.核心代码:

package wordtest

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"os"
	"runtime"
	"sort"
	"strings"
	"time"
)

// CountTestBase reads the article stored at inputfilePath, counts how often
// each word occurs (case-insensitively), and writes a report sorted by
// descending frequency to outputfilePath.
//
// Every byte that is not an ASCII letter is treated as a word separator.
// The text is cut into chunks that are counted concurrently and merged on the
// calling goroutine. The function panics (through Checkerror) when the input
// cannot be read or the output cannot be written.
func CountTestBase(inputfilePath string, outputfilePath string) {
	start := time.Now()

	fileData, err := ioutil.ReadFile(inputfilePath)
	Checkerror(err, "read file")
	fileText := string(fileData)

	// Worker count is derived from the CPU count; GOMAXPROCS is raised to
	// leave one slot for the merging goroutine, as the original code did.
	newRountineCount := runtime.NumCPU()*2 - 1
	runtime.GOMAXPROCS(newRountineCount + 1)

	// The channel is buffered with one slot per worker so that no worker
	// ever blocks on its single send.
	parts := splitfileText(fileText, newRountineCount)
	ch := make(chan map[string]int, len(parts))
	for _, part := range parts {
		go countTest(part, ch)
	}

	// Receive exactly one partial result per worker and fold it into the
	// total, normalising case at merge time.
	totalWordsMap := make(map[string]int)
	for range parts {
		for k, v := range <-ch {
			totalWordsMap[strings.ToLower(k)] += v
		}
	}

	// Move the totals into a sortable list and order by frequency.
	list := make(WordCountBeanList, 0, len(totalWordsMap))
	for k, v := range totalWordsMap {
		list = append(list, NewWordCountBean(k, v))
	}
	sort.Sort(list)

	elapsed := int64(time.Since(start) / time.Millisecond)
	fmt.Printf("time consume:%dms\n", elapsed)

	// Build the report. Writes to a bytes.Buffer never return an error, so
	// no error checks are needed here.
	wordsCount := list.totalCount()
	var data bytes.Buffer
	fmt.Fprintf(&data, "程序执行:%dms\n", elapsed)
	fmt.Fprintf(&data, "文章总单词数:%d\n\n", wordsCount)
	for _, v := range list {
		percent := 100.0 * float64(v.count) / float64(wordsCount)
		fmt.Fprintf(&data, "%s: %d,%3.2f%%\n", v.word, v.count, percent)
	}

	err = ioutil.WriteFile(outputfilePath, data.Bytes(), os.ModePerm)
	Checkerror(err, "ioutil.WriteFile")
}

// isASCIILetter reports whether b is one of the 52 ASCII letters.
func isASCIILetter(b byte) bool {
	return (b >= 'A' && b <= 'Z') || (b >= 'a' && b <= 'z')
}

// countTest counts the words in text and sends the resulting map on ch.
//
// A word is a maximal run of ASCII letters; counting is case-sensitive here
// and the caller is expected to normalise case when merging. Scanning by byte
// is safe for UTF-8 input because every byte of a multi-byte sequence is
// >= 0x80 and therefore never classified as a letter.
func countTest(text string, ch chan map[string]int) {
	wordMap := make(map[string]int)

	wordStart := -1 // index of the first letter of the current word, or -1
	for i := 0; i < len(text); i++ {
		if isASCIILetter(text[i]) {
			if wordStart < 0 {
				wordStart = i
			}
			continue
		}
		if wordStart >= 0 {
			wordMap[text[wordStart:i]]++
			wordStart = -1
		}
	}
	// A word that runs up to the end of the text has no trailing separator.
	if wordStart >= 0 {
		wordMap[text[wordStart:]]++
	}

	ch <- wordMap
}

// splitfileText cuts fileText into n consecutive parts of roughly equal size
// without ever cutting through a word.
//
// Each nominal cut point is moved forward to the next non-letter byte (or to
// the end of the text), so concatenating the parts always reproduces
// fileText. Parts may be empty when the text is short. It returns nil when
// n <= 0.
func splitfileText(fileText string, n int) []string {
	if n <= 0 {
		return nil
	}

	length := len(fileText)
	parts := make([]string, n)
	lastPosition := 0
	for i := 0; i < n-1; i++ {
		position := length / n * (i + 1)
		// Bounded scan: the original looked for a literal space with no
		// bound and panicked when none followed the cut point.
		for position < length && isASCIILetter(fileText[position]) {
			position++
		}
		parts[i] = fileText[lastPosition:position]
		lastPosition = position
	}
	// The last part takes whatever remains.
	parts[n-1] = fileText[lastPosition:]
	return parts
}

// Checkerror panics with msg followed by the error text when err is non-nil,
// and does nothing otherwise.
func Checkerror(err error, msg string) {
	if err != nil {
		panic(msg + "," + err.Error())
	}
}
2.一个struct
package wordtesttype WordCountBean struct {	word  string	count int}func NewWordCountBean(word string,count int) *WordCountBean {	return &WordCountBean{word,count}}type WordCountBeanList []*WordCountBeanfunc (List WordCountBeanList) Len() int {	return len(List)}func (List WordCountBeanList) Less(i,j int) bool {	if List[i].count > List[j].count {		return true	} else if List[i].count < List[j].count {		return false	} else {		return List[i].word < List[j].word	}}func (List WordCountBeanList) Swap(i,j int) {	var temp *WordCountBean = List[i]	List[i] = List[j]	List[j] = temp}func (List WordCountBeanList) totalCount() int {	totalCount := 0	for _,v := range List {		totalCount += v.count	}	return totalCount}
3.主函数:
package mainimport (	"WordsTest/wordtest")func main() {	inputfilePath := "files/article.txt"	outputfilePath := "files/result.txt"	wordtest.CountTestBase(inputfilePath,outputfilePath)}
总结

以上是内存溢出为你收集整理的Golang实现词频统计全部内容,希望文章能够帮你解决Golang实现词频统计所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

欢迎分享,转载请注明来源:内存溢出

原文地址: https://outofmemory.cn/langs/1287981.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-09
下一篇 2022-06-09

发表评论

登录后才能评论

评论列表(0条)

保存