⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dicttreeimpl.java

📁 这是关于中文分词的有关程序
💻 JAVA
字号:
/*
 * Copyright 2002-2005 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Created on 2005-12-28
 * author 谢骋超
 * 
 */
package cn.edu.zju.dartsplitter.impl;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;

import cn.edu.zju.dartsplitter.DictTree;
import cn.edu.zju.dartsplitter.dao.DictDAO;
import cn.edu.zju.dartsplitter.data.DictNode;
import cn.edu.zju.dartsplitter.data.DictValue;
import cn.edu.zju.dartsplitter.exceptions.DictSerializeException;

/**
 * {@link DictTree} implementation that keeps one tree of {@link DictNode}s per
 * one-character prefix in an in-memory map. The trees are either built from the
 * configured {@link DictDAO}s ({@code rebuild == true}) or loaded from a
 * Java-serialized file ({@code rebuild == false}).
 * <p>
 * This class performs no synchronization of its own.
 *
 * @author xiecc
 * @email xieccy@gmail.com xieccy@yahoo.com
 * homepage:  http://blog.itpub.net/xiecc
 * projectpage: http://ccnt.zju.edu.cn/projects
 */
public class DictTreeImpl implements DictTree {

    /**
     * Logger for this class
     */
    private static final Logger logger = Logger.getLogger(DictTreeImpl.class);

    /** Root nodes, keyed by {@link DictNode#getValue()} (see {@link #putNode}). */
    private Map<String, DictNode> dictMap = new HashMap<String, DictNode>();

    /** When true, trees are built from the DAOs; otherwise they are loaded from {@link #fileName}. */
    private boolean rebuild;

    /** Path of the serialized dictionary map. */
    private String fileName;

    /** Dictionary sources that are queried when building the trees. */
    private List<DictDAO> dictDAOList;

    /** Set once the serialized map has been read successfully, so the file is read only once. */
    private boolean preload = false;

    /** Cached union of the prefixes of all DAOs; released after {@link #saveDictMap()}. */
    private Set<String> prefixSet;

    /** Next id handed to a newly created {@link DictNode}. */
    private int maxNodeId = 1;

    /**
     * Default maximum length of a dictionary word. Longer words are skipped while
     * building: nobody looks up words of more than 20 characters, and they would
     * make the tree unnecessarily deep.
     */
    private int maxWordLength = 20;

    /**
     * @return Returns the maxWordLength.
     */
    public int getMaxWordLength() {
        return maxWordLength;
    }

    /**
     * @param maxWordLength
     *            The maxWordLength to set.
     */
    public void setMaxWordLength(int maxWordLength) {
        this.maxWordLength = maxWordLength;
    }

    /**
     * @return Returns the dictDAOList.
     */
    public List<DictDAO> getDictDAOList() {
        return dictDAOList;
    }

    /**
     * @param dictDAOList
     *            The dictDAOList to set.
     */
    public void setDictDAOList(List<DictDAO> dictDAOList) {
        this.dictDAOList = dictDAOList;
        // The cached prefixes were collected from the previous DAO list.
        this.prefixSet = null;
    }

    /**
     * @return Returns the rebuild.
     */
    public boolean isRebuild() {
        return rebuild;
    }

    /**
     * @param rebuild
     *            The rebuild to set.
     */
    public void setRebuild(boolean rebuild) {
        this.rebuild = rebuild;
    }

    /**
     * @return Returns the dictMap.
     */
    public Map<String, DictNode> getDictMap() {
        return dictMap;
    }

    /**
     * @param dictMap
     *            The dictMap to set.
     */
    public void setDictMap(Map<String, DictNode> dictMap) {
        this.dictMap = dictMap;
    }

    /**
     * Looks up the root node registered for a prefix.
     *
     * @param strPrefix
     *            the prefix to look up
     * @return the registered node, or {@code null} if there is none
     */
    public DictNode getDictNode(String strPrefix) {
        return dictMap.get(strPrefix);
    }

    /**
     * Registers a node in the map under its own value.
     *
     * @param dictNode
     *            the node to register
     */
    public void putNode(DictNode dictNode) {
        dictMap.put(dictNode.getValue(), dictNode);
    }

    /**
     * Collects the dictionary values for a prefix from every configured DAO.
     * <p>
     * The result is always a fresh list owned by the caller: the lists returned
     * by the DAOs are never modified, and a DAO that returns {@code null} is
     * skipped.
     *
     * @param strPrefix
     *            the prefix to query
     * @return the combined values; empty (never {@code null}) when no DAO is
     *         configured or none returns a value
     */
    public List<DictValue> getDictValues(String strPrefix) {
        List<DictValue> dictValues = new ArrayList<DictValue>();
        if (null == dictDAOList) {
            return dictValues;
        }
        for (DictDAO dictDAO : dictDAOList) {
            List<DictValue> daoValues = dictDAO.getDictValues(strPrefix);
            if (null != daoValues) {
                dictValues.addAll(daoValues);
            }
        }
        return dictValues;
    }

    /**
     * Makes sure the serialized map has been loaded and returns the root node
     * registered for the prefix.
     *
     * @param strPrefix
     *            the prefix to look up
     * @return the root node, or {@code null} if the map holds none for the prefix
     */
    public DictNode loadDictNodes(String strPrefix) {
        loadDictMap();
        return getDictNode(strPrefix);
    }

    /**
     * A prefix is valid when it is exactly one {@code char} long and that
     * char is a letter.
     */
    private boolean isValidPrefix(String strPrefix) {
        if (null == strPrefix) {
            return false;
        }
        if (strPrefix.length() != 1) {
            logger.debug("wrong prefix, lengh!=1: "+strPrefix);
            return false;
        }
        return Character.isLetter(strPrefix.charAt(0)); // digits are deliberately not accepted
    }

    /**
     * Returns the tree for a prefix, building and registering it from the DAO
     * values if it is not in the map yet.
     *
     * @param strPrefix
     *            a one-letter prefix
     * @return the root node of the tree, or {@code null} if the prefix is
     *         invalid or no DAO has a value for it
     * @see cn.edu.zju.dartsplitter.DictTree
     */
    public DictNode buildSubNodes(String strPrefix) {
        if (!isValidPrefix(strPrefix)) { // also rejects null
            return null;
        }
        DictNode rootNode = getDictNode(strPrefix);
        if (null != rootNode) {
            return rootNode;
        }
        List<DictValue> dictValueList = getDictValues(strPrefix);
        if (dictValueList.isEmpty()) {
            return null;
        }
        rootNode = new DictNode(1, strPrefix, DictNode.EMPTY_NODE, maxNodeId++);
        putNode(rootNode);

        for (DictValue dictValue : dictValueList) {
            if (dictValue.length() > getMaxWordLength()) {
                continue;
            }
            if (dictValue.getValue().equals(strPrefix)) {
                // The first character is a word on its own.
                rootNode.setEnd(true);
                rootNode.setUseCount(dictValue.getUseCount());
            }
            // The leading prefix is already represented by rootNode, so skip it.
            dictValue.setCurPos(strPrefix.length());
            DictNode parentNode = rootNode;
            int level = 1;
            while (true) {
                level++;
                String curStr = dictValue.nextValue();
                if (null == curStr) {
                    break;
                }
                DictNode curDictNode = parentNode.getChildNode(curStr);
                if (null == curDictNode) {
                    curDictNode = new DictNode(level, curStr, parentNode,
                            maxNodeId++);
                    parentNode.addChildNode(curDictNode);
                }
                if (dictValue.isEnd()) {
                    curDictNode.setEnd(true);
                    curDictNode.setUseCount(dictValue.getUseCount());
                }
                parentNode = curDictNode;
            }
        }
        return rootNode;
    }

    /**
     * Resolves the node for {@code strPrefix} below {@code parentNode}. For a
     * level-0 parent the root tree is built or loaded (depending on
     * {@link #isRebuild()}); otherwise the child is looked up on the parent.
     *
     * @param strPrefix
     *            the value of the node to resolve
     * @param parentNode
     *            the node to resolve from; must not be {@code null}
     * @return the resolved node, or {@code null} if there is none
     * @see cn.edu.zju.dartsplitter.DictTree
     */
    public DictNode buildOrGetSubNodes(String strPrefix, DictNode parentNode) {
        if (parentNode.getLevel() == 0) {
            if (isRebuild()) {
                return buildSubNodes(strPrefix);
            } else {
                return loadDictNodes(strPrefix);
            }
        }
        return parentNode.getChildNode(strPrefix);
    }

    /*
     * (non-Javadoc)
     * 
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return this.dictMap.toString();
    }

    /**
     * Same as {@link #getDictNode(String)}.
     *
     * @param strPrefix
     *            the prefix to look up
     * @return the registered root node, or {@code null} if there is none
     */
    public DictNode getRootNode(String strPrefix) {
        return getDictNode(strPrefix);
    }

    /**
     * @return Returns the fileName.
     */
    public String getFileName() {
        return fileName;
    }

    /**
     * @param fileName
     *            The fileName to set.
     */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /**
     * Builds the tree of every known prefix and serializes the resulting map to
     * {@link #getFileName()}.
     *
     * @throws DictSerializeException
     *             if the file cannot be opened or written
     */
    public void saveDictMap() {
        int i = 0;
        maxNodeId = 1;
        logger.debug("start build Dict");
        for (String strPrefix : getAllPrefixes()) {
            i++;
            logger.debug("start put node : "+i+" prefix: "+strPrefix);            
            DictNode dictNode = buildSubNodes(strPrefix);
            if (null != dictNode) {
                putNode(dictNode);
            }
        }
        logger.debug("end build Dict!");
        logger.debug("dict node size: "+i);

        this.prefixSet = null; // release the prefixes as soon as we are done with them

        logger.debug("start save Dict!");
        try {
            FileOutputStream fos = new FileOutputStream(fileName);
            try {
                ObjectOutputStream oos = new ObjectOutputStream(fos);
                oos.writeObject(this.dictMap);
                // Push everything buffered by the object stream to the file
                // before the file is closed.
                oos.flush();
            } finally {
                fos.close();
            }
        } catch (FileNotFoundException e) {
            throw new DictSerializeException("file not find: " + fileName, e);
        } catch (IOException e) {
            throw new DictSerializeException(
                    "IOException on write serialize file: " + fileName, e);
        }
        logger.debug("end save Dict!");
    }

    /**
     * Returns the union of the prefixes of all configured DAOs. The result is
     * cached until {@link #saveDictMap()} or {@link #setDictDAOList(List)}
     * discards it.
     *
     * @return the prefix set; empty when no DAO is configured
     */
    public Set<String> getAllPrefixes() {
        if (null != prefixSet) {
            return prefixSet;
        }
        if (null == dictDAOList) {
            // Nothing to query yet; do not cache so a later DAO list is picked up.
            return new HashSet<String>();
        }
        prefixSet = new HashSet<String>();
        for (DictDAO dictDAO : dictDAOList) {
            List<String> prefixList = dictDAO.getAllPrefixes();
            if (null != prefixList) {
                prefixSet.addAll(prefixList);
            }
        }
        return prefixSet;
    }

    /**
     * Loads the serialized map from {@link #getFileName()} the first time it is
     * called successfully; later calls return the map already in memory.
     * <p>
     * NOTE(review): this uses Java native deserialization, so the file must come
     * from a trusted source.
     *
     * @return the loaded map, or {@code null} if the file does not exist (the
     *         current map is left untouched in that case)
     * @throws DictSerializeException
     *             if the file cannot be read or contains unknown classes
     */
    public Map<String, DictNode> loadDictMap() {
        if (preload) {
            if (this.dictMap.size() == 0) {
                logger.warn("dict Map size is 0! something is wrong!");
            }
            return this.dictMap;
        }
        try {
            FileInputStream fis = new FileInputStream(fileName);
            try {
                ObjectInputStream ois = new ObjectInputStream(fis);
                // The file is written by saveDictMap(), which stores exactly this type.
                @SuppressWarnings("unchecked")
                Map<String, DictNode> loadedMap = (Map<String, DictNode>) ois.readObject();
                this.dictMap = loadedMap;
                preload = true;
            } finally {
                fis.close();
            }
            if (this.dictMap.size() == 0) {
                logger.warn("dict Map size is 0! something is wrong!");
            }
            return this.dictMap;
        } catch (FileNotFoundException e) {
            logger.debug("file not find on deserliaze: " + fileName);
            return null;
        } catch (IOException e) {
            throw new DictSerializeException(
                    "IOException on read serialize file: " + fileName, e);
        } catch (ClassNotFoundException e) {
            throw new DictSerializeException(
                    "class not found on deserialize: " + DictNode.class, e);
        }
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -