⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmenter.jav.old

📁 「我是中國人」
💻 OLD
📖 第 1 页 / 共 3 页
字号:
package org.apache.lucene.analysis.cw;

import java.lang.*;
import java.io.*;
import java.util.*;
import java.util.logging.*;
/* 
   Originally Written by Erik Peterson
   erik AT mandarintools.com
   
   Modified by Siu Ying
   siu DOT ying AT gmail DOT com 
*/

public class Segmenter
{
    // Shared java.util.logging logger, named after this class.
    static Logger logger = Logger.getLogger(Segmenter.class.getName());
    // The single shared instance; created on the first getSegmenter() call.
    static Segmenter mysegmenter;
    // Word dictionary. The constructor stores each lexicon word with the value "1"
    // and each leading fragment that is not itself a listed word with the value "2";
    // alternatively the whole map is deserialized from the swap file.
    private TreeMap zhwords;
    // Character tables filled by loadset() from the data/*surname, *foreign,
    // *numbers and *notname resource files respectively.
    private TreeSet csurname, cforeign, cnumbers, cnotname;
    // Charset name used when encoding text for debug log output (set to "UTF-8"
    // by the constructor).
    private String debugencoding;

    // When true, isNumber() logs each tested word and its result. The constructor
    // initializes it to false.
    private boolean debug;

    // Character-form selectors accepted by the constructor: they decide which
    // character tables and which lexicon resource are loaded.
    public final static int TRAD = 0;
    public final static int SIMP = 1;
    public final static int BOTH = 2;

    /**
     * Returns the shared Segmenter, building it on first use.
     *
     * The instance is created for both character forms (BOTH) with the word
     * dictionary loaded. Access is synchronized on the class, so at most one
     * instance is ever constructed.
     *
     * @param swapfilename path of the serialized-dictionary swap file; only
     *                     consulted by the call that actually creates the
     *                     instance, ignored by every later call
     * @return the singleton Segmenter
     */
    public static synchronized Segmenter getSegmenter(String swapfilename)
    {
        if (mysegmenter != null)
        {
            return mysegmenter;
        }
        mysegmenter = new Segmenter(BOTH, true, swapfilename);
        return mysegmenter;
    }

    /**
     * Builds a segmenter: loads the character tables and, optionally, the word
     * dictionary.
     *
     * The dictionary is taken from the serialized swap file when that file
     * exists, is readable and contains a TreeMap; otherwise it is built from
     * the bundled lexicon text resource and then written to the swap file so
     * the next start-up is faster. A lexicon that could not be read completely
     * is never written to the swap file, so a transient failure cannot be
     * cached permanently.
     *
     * @param charform     TRAD, SIMP or BOTH; any other value is treated as BOTH
     * @param loadwordfile when false only the character tables are loaded and
     *                     the dictionary is left empty
     * @param swapfilename path of the swap file; null disables the swap file
     */
    private Segmenter(int charform, boolean loadwordfile, String swapfilename)
    {
        debug = false;
        debugencoding = "UTF-8";

        csurname = new TreeSet();
        cforeign = new TreeSet();
        cnumbers = new TreeSet();
        cnotname = new TreeSet();

        // Simplified tables are needed for SIMP and BOTH (and any unknown value).
        if (charform != TRAD)
        {
            loadset(cnumbers, "data/snumbers_u8.txt");
            loadset(cforeign, "data/sforeign_u8.txt");
            loadset(csurname, "data/ssurname_u8.txt");
            loadset(cnotname, "data/snotname_u8.txt");
        }
        // Traditional tables are needed for TRAD and BOTH (and any unknown value).
        if (charform != SIMP)
        {
            loadset(cnumbers, "data/tnumbers_u8.txt");
            loadset(cforeign, "data/tforeign_u8.txt");
            loadset(csurname, "data/tsurname_u8.txt");
            loadset(cnotname, "data/tnotname_u8.txt");
        }

        zhwords = new TreeMap();

        if (!loadwordfile)
        {
            return;
        }

        long start = System.currentTimeMillis();

        File swapfile = (swapfilename == null) ? null : new File(swapfilename);

        boolean lexiconComplete = false;
        TreeMap cached = readCachedWords(swapfile);
        if (cached != null)
        {
            zhwords = cached;
        }
        else
        {
            lexiconComplete = loadWordsFromLexicon(charform);
        }

        logger.info("Loaded in " + (System.currentTimeMillis() - start) + " ms");

        // Only cache a dictionary that was freshly and completely built.
        if (lexiconComplete && swapfile != null)
        {
            writeCachedWords(swapfile);
        }
    }

    /**
     * Reads the serialized dictionary from the swap file.
     *
     * NOTE(review): this uses Java native deserialization, so the swap file
     * path must only ever point at a file this application wrote itself.
     *
     * @return the cached map, or null when there is no usable swap file
     */
    private TreeMap readCachedWords(File swapfile)
    {
        if (swapfile == null || !swapfile.exists() || !swapfile.canRead())
        {
            return null;
        }
        try
        {
            logger.info("Loading data from object file");
            InputStream swapdata = new FileInputStream(swapfile);
            try
            {
                Object cached = new ObjectInputStream(swapdata).readObject();
                if (cached instanceof TreeMap)
                {
                    return (TreeMap) cached;
                }
                logger.warning("Error reading object file: Format not expected: "
                               + (cached == null ? "null" : cached.getClass().getName()));
            }
            finally
            {
                // Closing the file stream releases the handle even when reading failed.
                swapdata.close();
            }
        }
        catch (java.io.IOException ioe)
        {
            logger.warning("Error reading object file: " + ioe);
        }
        catch (ClassNotFoundException cne)
        {
            logger.warning("Error reading object file: Format not expected: " + cne);
        }
        return null;
    }

    /**
     * Fills zhwords from the bundled lexicon resource for the given character form.
     *
     * Lines containing '#', blank lines and words of five or more characters
     * are skipped. Every accepted word is stored with the value "1"; each of
     * its leading fragments of two or more characters that is not already
     * present is stored with the value "2".
     *
     * @return true when the whole lexicon was read without error
     */
    private boolean loadWordsFromLexicon(int charform)
    {
        String resource;
        if (charform == SIMP)
        {
            resource = "simplexu8.txt";
        }
        else if (charform == TRAD)
        {
            resource = "tradlexu8.txt";
        }
        else
        {
            // BOTH, and any unknown value, mirroring the character-table fallback.
            resource = "bothlexu8.txt";
        }

        InputStream worddata = getClass().getResourceAsStream(resource);
        if (worddata == null)
        {
            logger.warning("Word list resource not found: " + resource);
            return false;
        }

        int count = 0;
        try
        {
            try
            {
                BufferedReader in = new BufferedReader(new InputStreamReader(worddata, "UTF-8"));
                String newword;
                while ((newword = in.readLine()) != null)
                {
                    // A blank line must not become the empty word "".
                    if (newword.length() == 0 || newword.length() >= 5 || newword.indexOf("#") != -1)
                    {
                        continue;
                    }

                    zhwords.put(newword.intern(), "1");

                    // Register proper prefixes of length >= 2 that are not already known.
                    for (int len = 2; len < newword.length(); len++)
                    {
                        String fragment = newword.substring(0, len).intern();
                        if (!zhwords.containsKey(fragment))
                        {
                            zhwords.put(fragment, "2");
                        }
                    }

                    if (count++ % 20000 == 0)
                    {
                        logger.info("" + count);
                    }
                }
            }
            finally
            {
                worddata.close();
            }
            return true;
        }
        catch (IOException e)
        {
            logger.warning("IOException: " + e);
            return false;
        }
    }

    /**
     * Serializes zhwords to the swap file. Failures are logged, and a file that
     * may have been only partially written is removed so it is not read back later.
     */
    private void writeCachedWords(File swapfile)
    {
        try
        {
            FileOutputStream f = new FileOutputStream(swapfile);
            try
            {
                ObjectOutputStream s = new ObjectOutputStream(f);
                s.writeObject(zhwords);
                // Push buffered object data into the file stream before it is closed.
                s.flush();
            }
            finally
            {
                f.close();
            }
        }
        catch (java.io.FileNotFoundException fe)
        {
            logger.warning("Error finding object file: " + fe);
        }
        catch (java.io.IOException ioe)
        {
            logger.warning("Error writing object file: " + ioe);
            swapfile.delete();
        }
    }

    /**
     * Loads a set of character data from a UTF-8 classpath resource.
     *
     * Each line that is non-empty and contains no '#' is added to the target
     * set. A missing resource or a read failure is logged and leaves the set
     * with whatever was added up to that point; the underlying stream is
     * always closed.
     *
     * @param targetset  set that receives the entries
     * @param sourcefile resource name, resolved relative to this class
     */
    private void loadset(TreeSet targetset, String sourcefile)
    {
        InputStream setdata = getClass().getResourceAsStream(sourcefile);
        if (setdata == null)
        {
            // getResourceAsStream signals "not found" with null, not an exception.
            logger.warning("Data file not found: " + sourcefile);
            return;
        }

        try
        {
            try
            {
                BufferedReader in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
                String dataline;
                while ((dataline = in.readLine()) != null)
                {
                    if ((dataline.length() == 0) || (dataline.indexOf("#") > -1))
                    {
                        continue;
                    }
                    targetset.add(dataline.intern());
                }
            }
            finally
            {
                setdata.close();
            }
        }
        catch (IOException e)
        {
            logger.warning("Exception loading data file " + sourcefile + ": " + e);
        }
    }

    /**
     * Tests whether every character of the word is in the numbers table.
     *
     * An empty word yields true, because no character fails the test.
     *
     * @param testword word to test; must not be null
     * @return true when all characters are contained in cnumbers
     */
    public boolean isNumber(String testword)
    {
        boolean result = true;
        for (int i = 0; i < testword.length(); i++)
        {
            // The set compares strings by value, so no interning is needed for the lookup.
            if (!cnumbers.contains(testword.substring(i, i + 1)))
            {
                result = false;
                break;
            }
        }

        if (debug)
        {
            // Log the string directly: encoding it to UTF-8 bytes and decoding
            // with the platform charset would garble it on non-UTF-8 platforms.
            logger.info(testword + " " + result);
        }

        return result;
    }

    /**
     * Tests whether every character of the word is in the foreign-character table.
     *
     * An empty word yields true, because no character fails the test.
     *
     * @param testword word to test; must not be null
     * @return true when all characters are contained in cforeign
     */
    public boolean isAllForeign(String testword)
    {
        int pos = 0;
        while (pos < testword.length())
        {
            String single = testword.substring(pos, pos + 1).intern();
            if (!cforeign.contains(single))
            {
                return false;
            }
            pos++;
        }
        return true;
    }

    /**
     * Tests whether the word is free of characters from the CJK Unified
     * Ideographs Unicode block.
     *
     * Only that single block is checked. An empty word yields true.
     *
     * @param testword word to test; must not be null
     * @return false as soon as one character lies in CJK_UNIFIED_IDEOGRAPHS,
     *         true otherwise
     */
    public boolean isNotCJK(String testword)
    {
        int pos = 0;
        while (pos < testword.length())
        {
            Character.UnicodeBlock block = Character.UnicodeBlock.of(testword.charAt(pos));
            if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS)
            {
                return false;
            }
            pos++;
        }
        return true;
    }

    public String stemWord(String word)
    {
        String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
        String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc",
                                        "\u5230", "\u5185", "\u5916", "\u4eec"};
        String[] infix  = new String[] {"\u5f97", "\u4e0d"};
        int i;

        StringBuffer unstemmed = new StringBuffer(word);

        for (i = 0; i < prefix.length; i++)
        {
            if (unstemmed.substring(0, 1).equals(prefix[i]) == true &&
                    (zhwords.get(unstemmed.substring(1, unstemmed.length()).intern()) != null ||
                     unstemmed.length() == 2))
            {
                logger.info("Stemmed prefix");
                try
                {
                    logger.info(new String(unstemmed.toString().getBytes(debugencoding)));
                }
                catch (Exception a)
                { }
                ;
                unstemmed.deleteCharAt(0);
                return unstemmed.toString();

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -