本体开发日记07-我与java分词组件的爱恨情仇-DictSegment类_随笔

本体开发日记07-我与java分词组件的爱恨情仇-DictSegment类

1.词典树分段，表示词典树的一个分枝？啥是词典树？（词典树即Trie/前缀树：每个节点存一个字符，从根节点到某个节点的路径拼起来就是一个词或者一个词的前缀。）

package com.huaban.analysis.jieba;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

上面是包声明和导入（import）语句！

/**
 * One node (branch) of the dictionary tree used for word matching.
 *
 * <p>The interface is parameterized with {@code DictSegment} so that the
 * {@code compareTo(DictSegment)} method declared in this class actually
 * implements it; the child array relies on that ordering for
 * {@code Arrays.sort} and {@code Arrays.binarySearch}.
 */
class DictSegment implements Comparable<DictSegment>

这个类实现了一个接口：Comparable

讲解：https://www.cnblogs.com/walter371/p/5197511.html
讲解：https://blog.csdn.net/weixin_43505709/article/details/87908756

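不知道这个接口是干啥的！（补充：Comparable接口只有一个compareTo方法，用来定义对象之间的大小顺序；本类后面的Arrays.sort和Arrays.binarySearch就是靠它对子节点数组排序和二分查找的。）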

 // Shared character table: maps each character seen while filling the
 // dictionary to one canonical Character instance. Typed so that
 // charMap.get(...) yields a Character without a cast.
 private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16, 0.95f);

// Upper bound on the size of the child array.
private static final int ARRAY_LENGTH_LIMIT = 3;

// Map-based child storage, keyed by the child's character.
 private Map<Character, DictSegment> childrenMap;

// Array-based child storage.
private DictSegment[] childrenArray;

// Character stored on this node.
private Character nodeChar;
// Number of child segments stored on this node. While
// storeSize <= ARRAY_LENGTH_LIMIT the array is used; once
// storeSize > ARRAY_LENGTH_LIMIT the Map is used.
private int storeSize = 0;
// State of this node: 0 by default; 1 means the path from the root to this
// node spells a complete word.
 private int nodeState = 0;

上面都是提前定义好的成员变量（字段）！这个java基础要很好才行！我不行！！

/**
 * Creates a node holding the given character.
 *
 * @param nodeChar the character stored on this node; must not be null
 * @throws IllegalArgumentException if {@code nodeChar} is null
 */
DictSegment(Character nodeChar) {
    if (nodeChar != null) {
        this.nodeChar = nodeChar;
        return;
    }
    throw new IllegalArgumentException("参数为空异常，字符不能为空");
}

/**
 * Returns the character stored on this node.
 *
 * @return this node's character
 */
Character getNodeChar() {
    return this.nodeChar;
}

这个方法用来取出当前节点上存储的字符nodeChar！

/**
 * Tells whether this node has at least one child segment stored.
 *
 * @return {@code true} when the stored child count is greater than zero
 */
boolean hasNextNode() {
    final int childCount = this.storeSize;
    return childCount > 0;
}

判断是否有下一个节点！

/**
 * Matches the whole character array against the subtree below this node.
 *
 * @param charArray the characters to look up
 * @return the hit produced by the four-argument overload, started at index 0
 *         and covering the full array length with no pre-existing hit
 */
Hit match(char[] charArray) {
    final int wholeLength = charArray.length;
    final Hit noPriorHit = null;
    return this.match(charArray, 0, wholeLength, noPriorHit);
}
/**
 * Matches a slice of the character array against the subtree below this node.
 *
 * @param charArray the characters to look up
 * @param begin     index of the first character to match
 * @param length    number of characters to match
 * @return the hit produced by the four-argument overload with no pre-existing hit
 */
Hit match(char[] charArray, int begin, int length) {
    final Hit noPriorHit = null;
    return this.match(charArray, begin, length, noPriorHit);
}
 /**
  * Matches a slice of the character array against the subtree below this
  * node, descending one child per character.
  *
  * <p>If {@code searchHit} is null a new {@code Hit} is created and its begin
  * position is set to {@code begin}; otherwise the given hit is reset with
  * {@code setUnmatch()}. In both cases its end position is set to
  * {@code begin}. When the last character of the slice is found, the hit is
  * marked as a match if that child's state is 1, and marked as a prefix (with
  * the child recorded on the hit) if that child has children of its own.
  *
  * <p>A slice with no characters in range ({@code length <= 0}, or
  * {@code begin} outside the array) returns the hit without any match or
  * prefix being set, instead of failing with an
  * {@code ArrayIndexOutOfBoundsException}.
  *
  * @param charArray the characters to look up
  * @param begin     index of the character handled by this node
  * @param length    number of characters still to match, starting at {@code begin}
  * @param searchHit hit to reuse, or null to create a new one
  * @return the (new or reused) hit describing the outcome
  */
 Hit match(char[] charArray, int begin, int length, Hit searchHit) {

     if (searchHit == null) {
         // No hit supplied: create one and record where the text starts.
         searchHit = new Hit();
         searchHit.setBegin(begin);
     }
     else {
         // Reused hit: reset its state before matching again.
         searchHit.setUnmatch();
     }
     // Record the position currently being processed.
     searchHit.setEnd(begin);

     // Nothing to look up: an empty slice, or a length that runs past the end
     // of the array, yields a hit with no match instead of an exception.
     if (length <= 0 || begin < 0 || begin >= charArray.length) {
         return searchHit;
     }

     Character keyChar = Character.valueOf(charArray[begin]);
     DictSegment ds = null;

     // Copy the instance fields into locals so a concurrent update cannot
     // swap the containers in the middle of the lookup.
     DictSegment[] segmentArray = this.childrenArray;
     Map<?, ?> segmentMap = this.childrenMap;

     // STEP 1: find the child segment for keyChar.
     if (segmentArray != null) {
         // storeSize is re-read from the field, and lookforSegment increments
         // it before dropping the array reference, so it can exceed the length
         // of the array captured above; clamp it to keep the range valid.
         int searchEnd = Math.min(this.storeSize, segmentArray.length);
         DictSegment keySegment = new DictSegment(keyChar);
         int position = Arrays.binarySearch(segmentArray, 0, searchEnd, keySegment);
         if (position >= 0) {
             ds = segmentArray[position];
         }
     }
     else if (segmentMap != null) {
         ds = (DictSegment) segmentMap.get(keyChar);
     }

     // STEP 3 (not found): the hit stays in its unmatched state.
     if (ds == null) {
         return searchHit;
     }

     // STEP 2 (found): either keep descending or report the result.
     if (length > 1) {
         // More characters remain; continue in the child.
         return ds.match(charArray, begin + 1, length - 1, searchHit);
     }

     // length == 1 here: this was the last character of the slice.
     if (ds.nodeState == 1) {
         // The path from the root to this child is a complete word.
         searchHit.setMatch();
     }
     if (ds.hasNextNode()) {
         // Longer words continue below this child: mark as prefix and
         // remember the child on the hit.
         searchHit.setPrefix();
         searchHit.setMatchedDictSegment(ds);
     }
     return searchHit;
 }

匹配词段！（OS：眼睛突然不行了！估计以后找工作都费劲儿！就这样吧！！我打算粗略看一下！！！）

/**
 * Adds the given word to the dictionary subtree below this node.
 *
 * @param charArray the characters of the word; the whole array is used and
 *                  the word is filled with enabled state 1
 */
void fillSegment(char[] charArray) {
    final int wordLength = charArray.length;
    this.fillSegment(charArray, 0, wordLength, 1);
}

加载填充词典片段！

    
    /**
     * Masks the given word in the dictionary subtree below this node.
     *
     * @param charArray the characters of the word; the whole array is used and
     *                  the word is filled with enabled state 0
     */
    void disableSegment(char[] charArray) {
        final int wordLength = charArray.length;
        this.fillSegment(charArray, 0, wordLength, 0);
    }


    
    /**
     * Walks (and, when enabling, builds) the path for a word one character at
     * a time, then stores {@code enabled} as the state of the node reached by
     * the last character.
     *
     * <p>A slice with no characters in range is ignored, so an empty word is a
     * no-op rather than an {@code ArrayIndexOutOfBoundsException}.
     *
     * @param charArray the characters of the word
     * @param begin     index of the character handled by this node
     * @param length    number of characters still to process, starting at {@code begin}
     * @param enabled   1 to mark the word as a complete word, 0 to mask it;
     *                  also passed to {@code lookforSegment} as its create flag
     */
    private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
        // Nothing to add for an empty slice.
        if (length <= 0 || begin < 0 || begin >= charArray.length) {
            return;
        }

        // Fetch the canonical Character for this char from the shared table,
        // registering it on first sight.
        Character beginChar = Character.valueOf(charArray[begin]);
        Character keyChar;
        // charMap is static and shared by every node, so this instance's
        // monitor does not protect it; guard the check-then-put on the map.
        synchronized (charMap) {
            // The explicit cast keeps this line valid whether or not charMap
            // is declared with type arguments.
            keyChar = (Character) charMap.get(beginChar);
            if (keyChar == null) {
                charMap.put(beginChar, beginChar);
                keyChar = beginChar;
            }
        }

        // Look up the child for keyChar; lookforSegment creates it only when
        // the flag passed to it is 1.
        DictSegment ds = lookforSegment(keyChar, enabled);
        if (ds == null) {
            return;
        }

        if (length > 1) {
            // The word is not fully placed in the tree yet; continue in the child.
            ds.fillSegment(charArray, begin + 1, length - 1, enabled);
        }
        else {
            // length == 1: last character of the word. enabled=1 marks a
            // complete word, enabled=0 masks the word.
            ds.nodeState = enabled;
        }
    }


    
    /**
     * Finds the child segment for {@code keyChar} among this node's children,
     * optionally creating it.
     *
     * <p>While the child count is at most {@code ARRAY_LENGTH_LIMIT} the sorted
     * child array is searched; inserting into a full array moves all children
     * into the map. Above the limit only the map is used.
     *
     * @param keyChar the character identifying the child
     * @param create  1 to create the child when it is missing; any other value
     *                performs a lookup only
     * @return the existing or newly created child, or null when it is missing
     *         and was not created
     */
    private DictSegment lookforSegment(Character keyChar, int create) {
        final boolean mayCreate = (create == 1);

        // Map mode: more children than the array limit allows.
        if (this.storeSize > ARRAY_LENGTH_LIMIT) {
            Map mapChildren = getChildrenMap();
            DictSegment existing = (DictSegment) mapChildren.get(keyChar);
            if (existing != null || !mayCreate) {
                return existing;
            }
            DictSegment created = new DictSegment(keyChar);
            mapChildren.put(keyChar, created);
            this.storeSize++;
            return created;
        }

        // Array mode: fetch (or lazily create) the array and binary-search it.
        DictSegment[] slots = getChildrenArray();
        DictSegment probe = new DictSegment(keyChar);
        int index = Arrays.binarySearch(slots, 0, this.storeSize, probe);
        if (index >= 0) {
            return slots[index];
        }
        if (!mayCreate) {
            return null;
        }

        // Missing and creation requested: the probe itself becomes the child.
        if (this.storeSize < ARRAY_LENGTH_LIMIT) {
            // Room left in the array: append, count, and restore sort order.
            slots[this.storeSize] = probe;
            this.storeSize++;
            Arrays.sort(slots, 0, this.storeSize);
            return probe;
        }

        // Array is full: switch to map storage.
        Map target = getChildrenMap();
        migrate(slots, target);
        target.put(keyChar, probe);
        // The count must be incremented before the array reference is released.
        this.storeSize++;
        this.childrenArray = null;
        return probe;
    }


    
    /**
     * Returns the child array, allocating it with {@code ARRAY_LENGTH_LIMIT}
     * slots on first use. The null check is made once without the lock and
     * repeated while holding this node's monitor before allocating.
     *
     * @return the value of the child array field after the optional allocation
     */
    private DictSegment[] getChildrenArray() {
        if (this.childrenArray != null) {
            return this.childrenArray;
        }
        synchronized (this) {
            if (null == this.childrenArray) {
                this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
            }
        }
        return this.childrenArray;
    }


    
    /**
     * Returns the child map, allocating it on first use with an initial
     * capacity of twice {@code ARRAY_LENGTH_LIMIT} and a load factor of 0.8.
     * The null check is made once without the lock and repeated while holding
     * this node's monitor before allocating.
     *
     * @return the value of the child map field after the optional allocation
     */
    private Map getChildrenMap() {
        if (this.childrenMap != null) {
            return this.childrenMap;
        }
        synchronized (this) {
            if (null == this.childrenMap) {
                this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2, 0.8f);
            }
        }
        return this.childrenMap;
    }


    
    /**
     * Copies every non-null segment of the array into the map, keyed by the
     * segment's character.
     *
     * @param segmentArray source array; null slots are skipped
     * @param segmentMap   destination map
     */
    private void migrate(DictSegment[] segmentArray, Map segmentMap) {
        for (int i = 0; i < segmentArray.length; i++) {
            DictSegment child = segmentArray[i];
            if (child == null) {
                continue;
            }
            segmentMap.put(child.nodeChar, child);
        }
    }


    
    /**
     * Orders segments by the character stored on each node.
     *
     * @param o the segment to compare with
     * @return the result of comparing this node's character with the other's
     */
    public int compareTo(DictSegment o) {
        final Character mine = this.nodeChar;
        final Character theirs = o.nodeChar;
        return mine.compareTo(theirs);
    }

OS：眼睛不行了！这段就随便理解一下吧！后面怎么使用才是重要的！

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/zaji/5637436.html

本体开发日记07-我与java分词组件的爱恨情仇-DictSegment类

发表评论

评论列表（0条）