⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmenter.jav.old

📁 「我是中國人」
💻 OLD
📖 第 1 页 / 共 3 页
字号:
            }
        }


        for (i = 0; i < suffix.length; i++)
        {
            if (unstemmed.substring(unstemmed.length()-1, unstemmed.length()).equals(suffix[i]) == true &&
                    (zhwords.get(unstemmed.substring(0, unstemmed.length()-1).intern()) != null ||
                     unstemmed.length() == 2))
            {
                logger.info("Stemmed suffix");
                try
                {
                    logger.info(new String(unstemmed.toString().getBytes(debugencoding)));
                }
                catch (Exception a)
                { }
                ;
                unstemmed.deleteCharAt(unstemmed.length()-1);
                return unstemmed.toString();
            }
        }

        for (i = 0; i < infix.length; i++)
        {
            if (unstemmed.length() == 3 && unstemmed.substring(1, 2).equals(infix[i]) == true &&
                    zhwords.get(new String(unstemmed.substring(0, 1) + unstemmed.substring(2, 3)).intern()) != null)
            {
                logger.info("Stemmed infix");
                unstemmed.deleteCharAt(1);
                return unstemmed.toString();
            }
        }

        return unstemmed.toString();
    }


    /**
     * Segments one line of text into words, writing {@code separator} between
     * adjacent words, and returns the resulting line.
     *
     * <p>The line is scanned left to right, one character at a time. A character
     * takes part in word building when it belongs to the
     * {@code CJK_UNIFIED_IDEOGRAPHS} Unicode block or when {@code isNumber}
     * accepts it as a one-character string; every other character is copied to
     * the output unchanged and terminates the word in progress.
     *
     * <p>The word in progress is extended by the current character when, checked
     * in this order:
     * <ol>
     *   <li>word + character is mapped to {@code "1"} in {@code zhwords}
     *       (described by the original code as "word is in lexicon");</li>
     *   <li>the word passes {@code isAllForeign}, the character is in
     *       {@code cforeign}, and the two-character string starting at the
     *       current position is not a key of {@code zhwords};</li>
     *   <li>the word passes {@code isNumber} and the character is in
     *       {@code cnumbers};</li>
     *   <li>word + character is mapped to {@code "2"} in {@code zhwords} and
     *       word + character + next character is also a key of {@code zhwords}
     *       (described by the original code as "starts a word in the lexicon").</li>
     * </ol>
     * Otherwise the word in progress is flushed to the output and a new word is
     * started with the current character.
     *
     * @param cline     the line to segment; must not be {@code null}
     * @param separator the text written between adjacent words
     * @return the line with separators inserted at the detected word boundaries
     */
    public String segmentLine(String cline, String separator)
    {
        StringBuffer currentword = new StringBuffer();
        StringBuffer outline = new StringBuffer();
        int clength = cline.length();

        for (int i = 0; i < clength; i++)
        {
            char currentchar = cline.charAt(i);
            boolean segmentable =
                Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || isNumber(cline.substring(i, i + 1));

            if (!segmentable)
            {
                // Not a word character: close the word in progress, then copy
                // the character through untouched.
                if (currentword.length() > 0)
                {
                    outline.append(currentword.toString());
                    if (!Character.isWhitespace(currentchar))
                    {
                        outline.append(separator);
                    }
                    currentword.setLength(0);
                }
                outline.append(currentchar);
                continue;
            }

            if (currentword.length() == 0)
            {
                // First character of a new word. Separate it from whatever was
                // copied before it, unless that was whitespace already.
                if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1)))
                {
                    outline.append(separator);
                }
                currentword.append(currentchar);
                logCurrentWordForDebug(currentword);
                continue;
            }

            // Build and intern the candidate key once; it is needed by both the
            // "1" and the "2" lexicon checks. intern() is kept because the
            // lexicon's key comparison is not visible from this method.
            String candidate = (currentword.toString() + currentchar).intern();
            Object entry = zhwords.get(candidate);

            // NOTE(review): substring(i, i + 2) is already valid when
            // i + 2 == clength, so the "i + 2 < clength" guard also rules out
            // the second-to-last character. Kept as in the original; confirm
            // whether "<=" was intended.
            boolean extendsWord =
                // 1. word + character is flagged "1" in the lexicon
                "1".equals(entry)
                // 2. possibly a transliteration of a foreign name
                || (isAllForeign(currentword.toString())
                    && cforeign.contains(String.valueOf(currentchar).intern())
                    && i + 2 < clength
                    && !zhwords.containsKey(cline.substring(i, i + 2).intern()))
                // 3. keep consecutive number characters together
                || (isNumber(currentword.toString())
                    && cnumbers.contains(String.valueOf(currentchar).intern()));

            if (extendsWord)
            {
                currentword.append(currentchar);
                logCurrentWordForDebug(currentword);
            }
            else if ("2".equals(entry)
                     && i + 1 < clength
                     && zhwords.containsKey((candidate + cline.charAt(i + 1)).intern()))
            {
                // 4. word + character is flagged "2" and one more character
                // still yields a lexicon key. The debug line is written before
                // the append, as in the original.
                logCurrentWordForDebug(currentword);
                currentword.append(currentchar);
            }
            else
            {
                // Nothing matched: flush the finished word and start anew.
                logCurrentWordForDebug(currentword);
                outline.append(currentword.toString());
                if (!Character.isWhitespace(currentchar))
                {
                    outline.append(separator);
                }
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        }

        // Flush whatever word was still being built when the line ended.
        outline.append(currentword.toString());

        return outline.toString();
    }

    /**
     * Writes the word in progress to the logger when debugging is enabled,
     * re-encoding it through {@code debugencoding} first.
     *
     * @param word the word currently being built
     */
    private void logCurrentWordForDebug(StringBuffer word)
    {
        if (!debug)
        {
            return;
        }
        try
        {
            logger.info(new String(word.toString().getBytes(debugencoding)));
        }
        catch (Exception ignored)
        {
            // Debug output is best-effort: a failure to encode or log the word
            // must never interrupt segmentation.
        }
    }


    public LinkedList segmentLine(String cline)
    {
        StringBuffer currentword = new StringBuffer();
        StringBuffer outline = new StringBuffer();
        int i, clength;
        char currentchar;
        String separator = "";
        LinkedList offsets = new LinkedList();
        Map tokenpair = new HashMap();
        
        int currentoffset = 0;
        offsets.add(new Integer(1));
        
        clength = cline.length();

        for (i = 0; i < clength; i++)
        {
            currentchar = cline.charAt(i);
            if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
                    isNumber(cline.substring(i, i+1)) == true)
            {

                // Character in CJK block
                if (currentword.length() == 0)
                {  // start looking for next word
                    //logger.info("current word length 0");
                    if (i > 0 && (Character.isWhitespace(cline.charAt(i-1)) == false))
                    {
                        outline.append(separator);
                        currentoffset += separator.length();
                        if (separator.length() > 0)
                        {
                            offsets.add(new Integer(currentoffset));
                        }

                    }
                    currentword.append(currentchar);
                }
                else
                {
                    if (zhwords.containsKey(new String(currentword.toString() + currentchar).intern()) == true &&
                            ((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("1") == true)
                    {
                        // word is in lexicon
                        currentword.append(currentchar);
                    }
                    else if (isNumber(currentword.toString()) &&
                             cnumbers.contains(new String(new char[] {currentchar}).intern())
                             /* && (i + 2 < clength) &&
                             (zhwords.containsKey(cline.substring(i, i+2).intern()) == false) */ ) 
                    {
                        // Put all consecutive number characters together
                        currentword.append(currentchar);
                        if (debug)
                        {
                            try
                            {
                                logger.info(new String(currentword.toString().getBytes(debugencoding)));
                            }
                            catch (Exception a)
                            { }
                            ;
                        }
                    }
                    else if (isAllForeign(currentword.toString()) &&
                             cforeign.contains(new String(new char[] {currentchar}).intern()) &&
                             i + 2 < clength &&
                             (zhwords.containsKey(cline.substring(i, i+2).intern()) == false))
                    {
                        // Possible a transliteration of a foreign name
                        currentword.append(currentchar);
                    }
                    else if ((zhwords.containsKey(new String(currentword.toString() + currentchar).intern())) &&
                             (((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("2") == true) &&
                             i + 1 < clength &&
                             (zhwords.containsKey(new String(currentword.toString() + currentchar + cline.charAt(i+1)).intern()) == true))
                    {
                        // Starts a word in the lexicon
                        currentword.append(currentchar);

                    }
                    else
                    { // Start anew
                        outline.append(currentword.toString());
                        currentoffset += currentword.length();
                        offsets.add(new Integer(currentoffset));

                        if (Character.isWhitespace(currentchar) == false)
                        {
                            outline.append(separator);
                            currentoffset += separator.length();
                            if (separator.length() > 0)
                            {
                                offsets.add(new Integer(currentoffset));
                            }
                        }
                        currentword.setLength(0);
                        currentword.append(currentchar);
                    }
                }

            }
            else

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -