- 一、代码示例
- 1.word分词器
- 2.mmseg4j分词器(推荐)
示例:本文演示两种 Java 中文分词器的用法:word 分词器和 mmseg4j 分词器(推荐)。
一、代码示例 1. word 分词器,代码如下:
import com.alibaba.fastjson.JSON; import com.chenlb.mmseg4j.ComplexSeg; import com.chenlb.mmseg4j.Dictionary; import com.chenlb.mmseg4j.MMSeg; import com.chenlb.mmseg4j.Seg; import com.google.common.collect.Lists; import org.apdplat.word.WordSegmenter; import org.apdplat.word.segmentation.SegmentationAlgorithm; import org.apdplat.word.segmentation.Word; import java.io.IOException; import java.io.StringReader; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; public class WordFilter { //分词器方法一 public static List2.mmseg4j分词器(推荐)automaticSelection(String title) { //移除停用词进行分词 List list = WordSegmenter.seg(title); return list; // System.out.println(JSON.toJSonString(list)); // //保留停用词 // List lists = WordSegmenter.segWithStopWords(title); // System.out.println(JSON.toJSonString(lists)); } //word方法二 public static Map segMore(String text) { Map map = new HashMap<>(); for(SegmentationAlgorithm segmentationAlgorithm : SegmentationAlgorithm.values()){ map.put(segmentationAlgorithm.getDes(), seg(text, segmentationAlgorithm)); } return map; } private static String seg(String text, SegmentationAlgorithm segmentationAlgorithm) { StringBuilder result = new StringBuilder(); for(Word word : WordSegmenter.segWithStopWords(text, segmentationAlgorithm)){ result.append(word.getText()).append(" "); } return result.toString(); } public static void main(String[] args) { // WordFilter.automaticSelection("我叫李太白,我是一个诗人,我生活在唐朝"); // WordFilter.automaticSelection("在唐朝有一名李太白诗人"); System.out.println( WordFilter.MMSegDemoToString("在唐朝有一名李太白诗人")); // Map map = new WordFilter().segMore("我叫李太白,我是一个诗人,我生活在唐朝"); // System.out.println(map); } public static List MMSegDemo(String txt){ StringReader input = new StringReader(txt); Dictionary dic = Dictionary.getInstance(); Seg seg = new ComplexSeg(dic);//Complex分词 //seg = new SimpleSeg(dic);//Simple分词 MMSeg mmSeg = new MMSeg(input, seg); com.chenlb.mmseg4j.Word word; List wordList = 
Lists.newArrayList(); try { while ((word = mmSeg.next()) != null) { //word是单个分出的词,先放到List里下面统一按竖线拼接词打印出来 wordList.add(word.getString()); } } catch (IOException e) { e.printStackTrace(); } finally { input.close(); } return wordList; } public static String MMSegDemoToString(String txt){ StringReader input = new StringReader(txt); Dictionary dic = Dictionary.getInstance(); Seg seg = new ComplexSeg(dic);//Complex分词 //seg = new SimpleSeg(dic);//Simple分词 MMSeg mmSeg = new MMSeg(input, seg); com.chenlb.mmseg4j.Word word; List wordList = Lists.newArrayList(); String citiesCommaSeparated =""; try { while ((word = mmSeg.next()) != null) { //word是单个分出的词,先放到List里下面统一按竖线拼接词打印出来 wordList.add(word.getString()); } citiesCommaSeparated = wordList.stream() .collect(Collectors.joining(",")); } catch (IOException e) { e.printStackTrace(); } finally { input.close(); } return citiesCommaSeparated; } }
2. mmseg4j 分词器(推荐),代码如下(示例):
public static ListMMSegDemo(String txt){ StringReader input = new StringReader(txt); Dictionary dic = Dictionary.getInstance(); Seg seg = new ComplexSeg(dic);//Complex分词 //seg = new SimpleSeg(dic);//Simple分词 MMSeg mmSeg = new MMSeg(input, seg); com.chenlb.mmseg4j.Word word; List wordList = Lists.newArrayList(); try { while ((word = mmSeg.next()) != null) { //word是单个分出的词,先放到List里下面统一按竖线拼接词打印出来 wordList.add(word.getString()); } } catch (IOException e) { e.printStackTrace(); } finally { input.close(); } return wordList; } public static String MMSegDemoToString(String txt){ StringReader input = new StringReader(txt); Dictionary dic = Dictionary.getInstance(); Seg seg = new ComplexSeg(dic);//Complex分词 //seg = new SimpleSeg(dic);//Simple分词 MMSeg mmSeg = new MMSeg(input, seg); com.chenlb.mmseg4j.Word word; List wordList = Lists.newArrayList(); String citiesCommaSeparated =""; try { while ((word = mmSeg.next()) != null) { //word是单个分出的词,先放到List里下面统一按竖线拼接词打印出来 wordList.add(word.getString()); } citiesCommaSeparated = wordList.stream() .collect(Collectors.joining(",")); } catch (IOException e) { e.printStackTrace(); } finally { input.close(); } return citiesCommaSeparated; }
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)