java分词器

java分词器,第1张

java分词

java分词器
  • 一、代码示例
    • 1.word分词器
    • 2.mmseg4j分词器(推荐)


本文通过示例代码演示两种 Java 中文分词器的用法:word 分词器与 mmseg4j 分词器(推荐)。

一、代码示例 1.word分词器

代码如下(完整类,同时包含 word 分词器与 mmseg4j 分词器的调用方法):

import com.alibaba.fastjson.JSON;
import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;

import com.google.common.collect.Lists;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.Word;

import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;


/**
 * Chinese word-segmentation helpers built on two libraries: the "word"
 * segmenter ({@code org.apdplat.word}) and mmseg4j ({@code com.chenlb.mmseg4j}).
 */
public class WordFilter {

    /**
     * Segments {@code title} with the word library's default algorithm,
     * removing stop words from the result.
     *
     * <p>Use {@code WordSegmenter.segWithStopWords(title)} instead when stop
     * words must be kept.
     *
     * @param title text to segment
     * @return the segmented words, as returned by {@code WordSegmenter.seg}
     */
    public static List<Word> automaticSelection(String title) {
        return WordSegmenter.seg(title);
    }

    /**
     * Segments {@code text} once per available {@link SegmentationAlgorithm}.
     *
     * @param text text to segment
     * @return map from each algorithm's description ({@code getDes()}) to the
     *         segmentation result rendered as a space-separated string
     */
    public static Map<String, String> segMore(String text) {
        Map<String, String> resultByAlgorithm = new HashMap<>();
        for (SegmentationAlgorithm algorithm : SegmentationAlgorithm.values()) {
            resultByAlgorithm.put(algorithm.getDes(), seg(text, algorithm));
        }
        return resultByAlgorithm;
    }

    /**
     * Segments {@code text} with the given algorithm, keeping stop words.
     *
     * @return every word's text followed by a single space (so a non-empty
     *         result ends with a trailing space)
     */
    private static String seg(String text, SegmentationAlgorithm segmentationAlgorithm) {
        StringBuilder result = new StringBuilder();
        for (Word word : WordSegmenter.segWithStopWords(text, segmentationAlgorithm)) {
            result.append(word.getText()).append(' ');
        }
        return result.toString();
    }

    /** Demo entry point: prints the comma-separated mmseg4j segmentation of a sample sentence. */
    public static void main(String[] args) {
        System.out.println(WordFilter.MMSegDemoToString("在唐朝有一名李太白诗人"));
    }

    /**
     * Segments {@code txt} with mmseg4j's Complex algorithm.
     *
     * @param txt text to segment
     * @return the tokens in the order mmseg4j produced them
     * @throws UncheckedIOException if mmseg4j fails while reading the input
     */
    public static List<String> MMSegDemo(String txt) {
        List<String> wordList = Lists.newArrayList();
        // try-with-resources closes the reader on every exit path.
        try (StringReader input = new StringReader(txt)) {
            Dictionary dic = Dictionary.getInstance();
            // Complex segmentation; SimpleSeg(dic) is the Simple alternative.
            Seg seg = new ComplexSeg(dic);
            MMSeg mmSeg = new MMSeg(input, seg);
            com.chenlb.mmseg4j.Word word;
            while ((word = mmSeg.next()) != null) {
                wordList.add(word.getString());
            }
        } catch (IOException e) {
            // Surface the failure instead of silently returning a partial result.
            throw new UncheckedIOException("mmseg4j segmentation failed", e);
        }
        return wordList;
    }

    /**
     * Segments {@code txt} with mmseg4j's Complex algorithm and joins the
     * tokens with commas.
     *
     * @param txt text to segment
     * @return the tokens joined by {@code ","}; empty string when there are no tokens
     * @throws UncheckedIOException if mmseg4j fails while reading the input
     */
    public static String MMSegDemoToString(String txt) {
        // Reuse MMSegDemo so the segmentation logic lives in one place.
        return String.join(",", MMSegDemo(txt));
    }

}

2.mmseg4j分词器(推荐)

代码如下(示例):

    
    /**
     * Segments {@code txt} with mmseg4j's Complex algorithm.
     *
     * @param txt text to segment
     * @return the tokens in the order mmseg4j produced them
     * @throws UncheckedIOException if mmseg4j fails while reading the input
     */
    public static List<String> MMSegDemo(String txt) {
        List<String> wordList = Lists.newArrayList();
        // try-with-resources closes the reader on every exit path.
        try (StringReader input = new StringReader(txt)) {
            Dictionary dic = Dictionary.getInstance();
            // Complex segmentation; SimpleSeg(dic) is the Simple alternative.
            Seg seg = new ComplexSeg(dic);
            MMSeg mmSeg = new MMSeg(input, seg);
            com.chenlb.mmseg4j.Word word;
            while ((word = mmSeg.next()) != null) {
                wordList.add(word.getString());
            }
        } catch (IOException e) {
            // Surface the failure instead of silently returning a partial result.
            throw new UncheckedIOException("mmseg4j segmentation failed", e);
        }
        return wordList;
    }


    
    /**
     * Segments {@code txt} via {@code MMSegDemo} and joins the resulting
     * tokens with commas.
     *
     * @param txt text to segment
     * @return the tokens joined by {@code ","}; empty string when there are no tokens
     */
    public static String MMSegDemoToString(String txt) {
        // Reuse MMSegDemo so the segmentation logic lives in one place.
        return String.join(",", MMSegDemo(txt));
    }

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/zaji/4023789.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-10-22
下一篇 2022-10-22

发表评论

登录后才能评论

评论列表(0条)

保存