⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 chinesesegmenter.java

📁 自己写的一段分词程序，共有两部分：第一部分是词库的，第二部分是概略的
💻 JAVA
📖 第 1 页 / 共 2 页
字号:


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.*;
import java.util.Locale;
import java.util.TreeMap;
import java.util.TreeSet;

public class ChineseSegmenter {

   private static ChineseSegmenter segmenter = null;

 
   private TreeMap zhwords;

   private TreeSet cforeign, cnumbers, csign, cenglish;

 
   public final static int TRAD = 0;

   public final static int SIMP = 1;

   public final static int BOTH = 2;


   private ChineseSegmenter(int charform, boolean loadwordfile) {
      
      cforeign = new TreeSet();
      cnumbers = new TreeSet();
      csign = new TreeSet();

      if (charform == SIMP) {
         loadset(cnumbers, "data/snumbers_u8.txt");
         loadset(cforeign, "data/sforeign_u8.txt");
      } else if (charform == TRAD) {
         loadset(cnumbers, "data/tnumbers_u8.txt");
         loadset(cforeign, "data/tforeign_u8.txt");
      } else { // BOTH
         loadset(cnumbers, "data/snumbers_u8.txt");
         loadset(cforeign, "data/sforeign_u8.txt");
         loadset(cnumbers, "data/tnumbers_u8.txt");
         loadset(cforeign, "data/tforeign_u8.txt");
      }

         loadset(csign, "data/snotname_u8.txt");
         loadset(cenglish, "data/english_u8.txt");
      // zhwords = new Hashtable(120000);
      zhwords = new TreeMap();

      if (!loadwordfile) {
         return;
      }

      String newword = null;
      try {
         InputStream worddata = null;
         if (charform == SIMP) {
            worddata = getClass().getResourceAsStream("simplexu8.txt");
         } else if (charform == TRAD) {
            worddata = getClass().getResourceAsStream("tradlexu8.txt");
         } else if (charform == BOTH) {
            worddata = getClass().getResourceAsStream("bothlexu8.txt");
         }
         BufferedReader in = new BufferedReader(new InputStreamReader(
               worddata, "UTF8"));
         while ((newword = in.readLine()) != null) {
            if ((newword.indexOf("#") == -1) && (newword.length() < 5)) {

               zhwords.put(newword.intern(), "1");

               if (newword.length() == 3) {
                  if (zhwords.containsKey(newword.substring(0, 2)
                        .intern()) == false) {
                     zhwords.put(newword.substring(0, 2).intern(), "2");
                  }
               }

               if (newword.length() == 4) {
                  if (zhwords.containsKey(newword.substring(0, 2)
                        .intern()) == false) {
                     zhwords.put(newword.substring(0, 2).intern(), "2");
                  }
                  if (zhwords.containsKey(newword.substring(0, 3)
                        .intern()) == false) {
                     zhwords.put(newword.substring(0, 3).intern(), "2");
                  }
               }
            }
         }
         in.close();
      } catch (IOException e) {
         e.printStackTrace();
      }

   }
   
   /**
    * Discards the shared segmenter so that the next factory call builds a
    * new one.
    */
   public static synchronized void reset() {
      segmenter = null;
   }

   /**
    * Returns the shared segmenter, creating a simplified-Chinese one with
    * its word list loaded if none exists yet. Also sets the JVM default
    * locale to {@link Locale#SIMPLIFIED_CHINESE} on every call.
    *
    * <p>If a shared instance already exists it is returned as is,
    * whatever character form it was built with; call reset() first to
    * force a new one.
    *
    * @return the shared segmenter instance
    */
   public static synchronized ChineseSegmenter getGBSegmenter() {
      Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
      if (segmenter != null) {
         return segmenter;
      }
      segmenter = new ChineseSegmenter(SIMP, true);
      return segmenter;
   }

   /**
    * Returns the shared segmenter, creating a traditional-Chinese one with
    * its word list loaded if none exists yet. Also sets the JVM default
    * locale to {@link Locale#TRADITIONAL_CHINESE} on every call.
    *
    * <p>If a shared instance already exists it is returned as is,
    * whatever character form it was built with; call reset() first to
    * force a new one.
    *
    * @return the shared segmenter instance
    */
   public static synchronized ChineseSegmenter getBig5Segmenter() {
      Locale.setDefault(Locale.TRADITIONAL_CHINESE);
      if (segmenter != null) {
         return segmenter;
      }
      segmenter = new ChineseSegmenter(TRAD, true);
      return segmenter;
   }

   /**
    * Reads a UTF-8 class-path resource line by line and adds each line to
    * the given set. Empty lines and lines containing '#' are skipped.
    *
    * <p>Loading is best-effort: any failure (missing resource, read error,
    * null target set) is reported on stderr and the method returns
    * normally. The underlying stream is closed on every path.
    *
    * @param targetset  set that receives the loaded lines
    * @param sourcefile resource name, resolved relative to this class
    */
   private void loadset(TreeSet targetset, String sourcefile) {
      InputStream setdata = null;
      BufferedReader in = null;
      try {
         setdata = getClass().getResourceAsStream(sourcefile);
         if (setdata == null) {
            // Report a missing resource as such, not as an opaque NPE.
            throw new FileNotFoundException("resource not found: "
                  + sourcefile);
         }
         in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
         String dataline;
         while ((dataline = in.readLine()) != null) {
            if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) {
               continue;
            }
            targetset.add(dataline.intern());
         }
      } catch (Exception e) {
         System.err.println("Exception loading data file" + sourcefile + " "
               + e);
         e.printStackTrace();
      } finally {
         // Close on success and on failure; previously a mid-read
         // exception left the stream open.
         try {
            if (in != null) {
               in.close();
            } else if (setdata != null) {
               setdata.close();
            }
         } catch (IOException ignored) {
            // Nothing useful can be done if close itself fails.
         }
      }

   }

   /**
    * Reports whether every single-char substring of the argument is
    * present in the numbers set.
    *
    * @param testword text to examine; an empty string yields {@code true}
    * @return {@code false} as soon as one char is not in the set,
    *         {@code true} otherwise
    */
   public boolean isNumber(String testword) {
      int pos = 0;
      while (pos < testword.length()) {
         String single = testword.substring(pos, pos + 1).intern();
         if (!cnumbers.contains(single)) {
            return false;
         }
         pos++;
      }
      return true;
   }

   /**
    * Reports whether every single-char substring of the argument is
    * present in the sign set.
    *
    * @param testword text to examine; an empty string yields {@code true}
    * @return {@code false} as soon as one char is not in the set,
    *         {@code true} otherwise
    */
   public boolean isSign(String testword) {
      int pos = 0;
      while (pos < testword.length()) {
         String single = testword.substring(pos, pos + 1).intern();
         if (!csign.contains(single)) {
            return false;
         }
         pos++;
      }
      return true;
   }

   /**
    * Reports whether every single-char substring of the argument is
    * present in the english set.
    *
    * @param testword text to examine; an empty string yields {@code true}
    * @return {@code false} as soon as one char is not in the set,
    *         {@code true} otherwise
    */
   public boolean isEnglish(String testword) {
      int pos = 0;
      while (pos < testword.length()) {
         String single = testword.substring(pos, pos + 1).intern();
         if (!cenglish.contains(single)) {
            return false;
         }
         pos++;
      }
      return true;
   }
   
   /**
    * Reports whether every single-char substring of the argument is
    * present in the foreign set.
    *
    * @param testword text to examine; an empty string yields {@code true}
    * @return {@code false} as soon as one char is not in the set,
    *         {@code true} otherwise
    */
   public boolean isAllForeign(String testword) {
      int pos = 0;
      while (pos < testword.length()) {
         String single = testword.substring(pos, pos + 1).intern();
         if (!cforeign.contains(single)) {
            return false;
         }
         pos++;
      }
      return true;
   }

   public boolean isNotCJK(String testword) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -