⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmenter.jav.old

📁 「我是中國人」
💻 OLD
📖 第 1 页 / 共 3 页
字号:
            {  // Not chinese character
                //logger.info("not cjk");
                if (currentword.length() > 0)
                {
                    outline.append(currentword.toString());
                    currentoffset += currentword.length();
                    offsets.add(new Integer(currentoffset));

                    if (Character.isWhitespace(currentchar) == false)
                    {
                        outline.append(separator);
                        currentoffset += separator.length();
                        if (separator.length() > 0)
                        {
                            offsets.add(new Integer(currentoffset));
                        }
                    }
                    currentword.setLength(0);
                }

                while ((i < clength) &&
                        (Character.UnicodeBlock.of(cline.charAt(i)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS))
                {
                    currentword.append(cline.charAt(i));
                    i++;
                }
                i--;
                outline.append(currentword.toString());
                currentoffset += currentword.length();
                offsets.add(new Integer(currentoffset));
                currentword.setLength(0);
            }
        }

        outline.append(currentword.toString());
        if (currentword.length() > 0)
        {
            currentoffset += currentword.length();
            offsets.add(new Integer(currentoffset));
        }

        return offsets;
    }


    /**
     * Adds a word to the {@code zhwords} table.
     * <p>
     * The word itself is always stored with the value {@code "1"}. When the word is
     * three to six characters long, each leading prefix of length 2 up to
     * {@code length - 1} is additionally stored with the value {@code "2"}, but only
     * if that prefix is not already a key, so an existing entry is never overwritten
     * by a prefix. Words shorter than three or longer than six characters add no
     * prefix entries.
     * <p>
     * NOTE(review): presumably {@code "1"} marks a complete word and {@code "2"} a
     * prefix of a longer word; confirm against the lookup code.
     *
     * @param newword the word to add; must not be {@code null}
     */
    public void addword(String newword)
    {
        final int wordlen = newword.length();
        zhwords.put(newword.intern(), "1");

        // Only words of 3..6 characters contribute prefix entries.
        if (wordlen < 3 || wordlen > 6)
        {
            return;
        }

        for (int prefixlen = 2; prefixlen < wordlen; prefixlen++)
        {
            String prefix = newword.substring(0, prefixlen).intern();
            if (!zhwords.containsKey(prefix))
            {
                zhwords.put(prefix, "2");
            }
        }
    }


    /**
     * Segments the contents of one text file and writes the result next to it.
     * <p>
     * Every line of {@code inputfile} is read using {@code encoding} and appended,
     * followed by a single space, to one buffer. The buffered text is passed to
     * {@code segmentLine}; the string returned by {@code segmentLine(text, " ")} is
     * written, followed by a line separator, to {@code inputfile + ".seg"} using the
     * same encoding. The offset list and the segmented text are also logged at
     * info level.
     * <p>
     * I/O problems are logged as warnings and never thrown:
     * <ul>
     *   <li>an unsupported encoding or a file that cannot be opened aborts the call;</li>
     *   <li>a read error stops reading, and whatever was read so far is still
     *       segmented and written;</li>
     *   <li>if nothing was read, the method returns without writing (the output
     *       file has already been created at that point and stays empty).</li>
     * </ul>
     * Both streams are always closed, independently of each other, before the
     * method returns.
     *
     * @param inputfile path of the file to segment
     * @param encoding  charset name used for both reading and writing
     */
    public void segmentFile(String inputfile, String encoding)
    {
        String outfile = inputfile + ".seg";
        InputStream srcdata = null;
        FileOutputStream dstdata = null;
        BufferedReader in = null;
        BufferedWriter outbuffer = null;
        StringBuffer buffer = new StringBuffer();

        try{
            // The raw streams are kept in their own variables so that they can
            // still be closed when wrapping them in a reader/writer fails.
            try{
                srcdata = new FileInputStream(inputfile);
                in = new BufferedReader(new InputStreamReader(srcdata, encoding));
                dstdata = new FileOutputStream(outfile);
                outbuffer = new BufferedWriter(new OutputStreamWriter(dstdata, encoding));
            }catch(UnsupportedEncodingException uee){
                logger.warning("Encoding not supported, abort: " + uee);
                return;
            }catch(FileNotFoundException fnfe){
                logger.warning("File not found, abort: " + fnfe);
                return;
            }

            try{
                String dataline;
                while ((dataline = in.readLine()) != null){
                    buffer.append(dataline).append(' ');
                }
            }catch(IOException ioe){
                // Best effort: keep going with whatever was read before the error.
                logger.warning("Error reading from stream: " + ioe);
            }

            String output = buffer.toString();
            if (output.length() == 0) return;

            // The offset list is only needed for the log line below.
            LinkedList list = segmentLine(output);
            String seg_result = segmentLine(output, " ");

            logger.info("Offset: " + list);
            logger.info("Output: " + seg_result);

            try{
                outbuffer.write(seg_result);
                outbuffer.newLine();
            }catch(IOException ioe){
                logger.warning("Error writing to file: " + ioe);
            }
        }finally{
            System.out.flush();

            // Close input and output separately: a failure on one side must not
            // prevent the other from being closed.
            try{
                if (in != null){
                    in.close();
                }else if (srcdata != null){
                    srcdata.close();
                }
            }catch(IOException ioe){
                logger.warning("Error closing " + inputfile + ": " + ioe);
            }

            // Closing the writer flushes the buffered result, so a failure here
            // means the output may be incomplete and is worth reporting.
            try{
                if (outbuffer != null){
                    outbuffer.close();
                }else if (dstdata != null){
                    dstdata.close();
                }
            }catch(IOException ioe){
                logger.warning("Error closing " + outfile + ": " + ioe);
            }
        }
    }

    /**
     * Prints the command-line usage summary to standard error, one entry per
     * {@code println}, and then terminates the JVM with exit status 0.
     * <p>
     * NOTE(review): {@code main} also accepts {@code -s}, {@code -t} and
     * {@code -h}, which this text does not mention.
     */
    public static void printHelp()
    {
        final String[] usage = {
            "Usage:\njava -jar Segmenter.jar [-b|-g|-8] inputfile.txt",
            "\t-b Big5, -g GB2312, -8 UTF-8",
            "  Segmented text will be saved to inputfile.txt.seg"
        };

        for (int line = 0; line < usage.length; line++)
        {
            System.err.println(usage[line]);
        }
        System.exit(0);
    }

    /**
     * Command-line entry point.
     * <p>
     * Arguments are scanned in order. Recognised switches:
     * <ul>
     *   <li>{@code -b}: encoding {@code BIG5}, character form {@code TRAD} (the defaults)</li>
     *   <li>{@code -g}: encoding {@code GBK}, character form {@code SIMP}</li>
     *   <li>{@code -8}: encoding {@code UTF8}, character form {@code BOTH}</li>
     *   <li>{@code -s} / {@code -t}: character form {@code SIMP} / {@code TRAD} only</li>
     *   <li>{@code -h}: calls {@code printHelp()}</li>
     * </ul>
     * Every other argument is taken as an input path. With no input path an error
     * is logged and {@code printHelp()} is called. Paths that do not exist are
     * reported and skipped; directories are expanded by appending their entries to
     * the end of the work list (so nested directories are expanded too); every
     * remaining path is handed to {@code segmentFile} with the selected encoding.
     * <p>
     * NOTE(review): {@code charform} is assigned here but not used anywhere else
     * in this method.
     *
     * @param argv command-line arguments
     */
    public static void main(String[] argv)
    {
        final boolean debug = false;
        Vector inputfiles = new Vector();
        String encoding = "BIG5";
        int charform = Segmenter.TRAD;

        // Split the command line into switches and input paths.
        for (int argidx = 0; argidx < argv.length; argidx++)
        {
            String arg = argv[argidx];

            if (arg.equals("-b"))
            {
                if (debug)
                {
                    logger.info("Setting to Big5, TRAD");
                }
                encoding = "BIG5";
                charform = Segmenter.TRAD;
            }
            else if (arg.equals("-g"))
            {
                if (debug)
                {
                    logger.info("Setting to GB, SIMP");
                }
                encoding = "GBK";
                charform = Segmenter.SIMP;
            }
            else if (arg.equals("-8"))
            {
                encoding = "UTF8";
                charform = Segmenter.BOTH;
            }
            else if (arg.equals("-s"))
            {
                if (debug)
                {
                    logger.info("Setting to SIMP");
                }
                charform = Segmenter.SIMP;
            }
            else if (arg.equals("-t"))
            {
                if (debug)
                {
                    logger.info("Setting to TRAD");
                }
                charform = Segmenter.TRAD;
            }
            else if (arg.equals("-h"))
            {
                printHelp();
            }
            else
            {
                inputfiles.add(arg);
            }
        }

        if (inputfiles.isEmpty())
        {
            logger.info("ERROR: Please specify name of Chinese text file to segment.\n");
            printHelp();
        }

        logger.info("Loading segmenter word list.  One moment please.");
        Segmenter mainsegmenter = Segmenter.getSegmenter("zword.obj");
        logger.info("Total keys " + mainsegmenter.zhwords.size());

        // The list may grow while it is walked: directory entries are appended
        // to the end and picked up by later iterations.
        for (int fileidx = 0; fileidx < inputfiles.size(); fileidx++)
        {
            String path = (String) inputfiles.get(fileidx);
            File candidate = new File(path);

            if (!candidate.exists())
            {
                logger.info("ERROR: Source file " + path + " does not exist.\n");
            }
            else if (candidate.isDirectory())
            {
                String[] children = candidate.list();
                if (children != null)
                {
                    for (int child = 0; child < children.length; child++)
                    {
                        inputfiles.add(path + File.separator + children[child]);
                    }
                }
            }
            else
            {
                logger.info("Segmenting " + path + " with encoding " + encoding);
                mainsegmenter.segmentFile(path, encoding);
            }
        }
    }
    
    // accessors
    /**
     * Returns the {@code csurname} set. The field itself is returned, not a copy.
     *
     * @return the {@code csurname} set
     */
    public TreeSet getCsurname()
    {
        return this.csurname;
    }

    /**
     * Returns the {@code cforeign} set. The field itself is returned, not a copy.
     *
     * @return the {@code cforeign} set
     */
    public TreeSet getCforeign()
    {
        return this.cforeign;
    }
    
    /**
     * Returns the {@code cnumbers} set. The field itself is returned, not a copy.
     *
     * @return the {@code cnumbers} set
     */
    public TreeSet getCnumbers()
    {
        return this.cnumbers;
    }
    
    /**
     * Returns the {@code cnotname} set. The field itself is returned, not a copy.
     *
     * @return the {@code cnotname} set
     */
    public TreeSet getCnotname()
    {
        return this.cnotname;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -