📄 utility.java

📁 基于词典的分词工具,用与对文本文件的分词
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
					while (i + 1 < str.length() && "０１２３４５６７８９".indexOf(str.substring(i + 1, i + 2)) != -1)

						i++;
				}
			}

			if (i >= str.length())
				return true;

			while (i < str.length() && GFString.cint(str.substring(i, i + 1)) >= 0
					&& GFString.cint(str.substring(i, i + 1)) <= 9)
				i++;
			// Get middle delimiter such as .
			if (i < str.length()) {
				String s = str.substring(i, i + 1);
				if ("∶·．／".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// 98．1％
					i++;
					while (i + 1 < str.length() && "0123456789".indexOf(str.substring(i + 1, i + 2)) != -1)
						i++;
				}
			}

			if (i < str.length()) {

				if ("百千万亿佰仟％‰".indexOf(str.substring(i, i + 1)) == -1 && !"%".equals(str.substring(i, i + 1)))
					i--;
			}
			if (i >= str.length())
				return true;
		}
		return false;
	}

	/**
	 * Reports whether the byte string consists solely of "index" characters.
	 * <p>
	 * The scan first skips a run of two-byte characters whose lead byte (read
	 * as unsigned) is 162, then skips a run of single-byte ASCII letters
	 * ({@code A-Z}, {@code a-z}). The result is {@code true} only when these
	 * two runs together consume the whole array.
	 *
	 * @param sString the bytes to examine; must not be {@code null}
	 * @return {@code true} if every byte was consumed by the two scans
	 *         (including the empty array), {@code false} otherwise
	 */
	public static boolean isAllIndex(byte[] sString) {
		int nLen = sString.length;
		int i = 0;

		// Two-byte characters with lead byte 162.
		// NOTE(review): presumably the GB2312 row holding index/numeral
		// symbols - confirm against the encoding the callers use.
		while (i < nLen - 1 && getUnsigned(sString[i]) == 162) {
			i += 2;
		}
		if (i >= nLen)
			return true;

		// Single-byte ASCII letters. The bounds check has to guard BOTH
		// letter ranges: without the outer parentheses the lowercase test
		// was evaluated even when i == nLen and read past the array end.
		while (i < nLen
				&& ((sString[i] >= 'A' && sString[i] <= 'Z') || (sString[i] >= 'a' && sString[i] <= 'z'))) {
			i++;
		}

		return i >= nLen;
	}

	/**
	 * Reports whether the string consists solely of two-byte letter
	 * characters.
	 * <p>
	 * The string is converted to bytes and walked two bytes at a time. A pair
	 * is accepted when its first byte (unsigned) is 163 and its second byte
	 * (unsigned) lies in 193..218 or 225..250.
	 *
	 * @param str the text to examine; may be {@code null}
	 * @return {@code true} if accepted pairs cover every byte of the encoded
	 *         string (including the empty string); {@code false} otherwise or
	 *         when {@code str} is {@code null}
	 */
	public static boolean isAllLetter(String str) {
		if (str == null)
			return false;

		// NOTE(review): getBytes() uses the platform default charset. The byte
		// values tested below look like the GB2312/GBK codes of the full-width
		// letters A-Z and a-z - confirm the runtime charset matches.
		byte[] b = str.getBytes();

		// The scan indexes the byte array, so the bound must be the byte
		// count. Using str.length() (a char count) stopped the scan after
		// roughly half of the encoded bytes and rejected single characters.
		int nLen = b.length;
		int i = 0;
		while (i < nLen - 1
				&& getUnsigned(b[i]) == 163
				&& ((getUnsigned(b[i + 1]) >= 193 && getUnsigned(b[i + 1]) <= 218) || (getUnsigned(b[i + 1]) >= 225 && getUnsigned(b[i + 1]) <= 250))) {
			i += 2;
		}

		return i >= nLen;
	}

	/**
	 * Reports whether the byte string is made up entirely of two-byte
	 * delimiter characters.
	 * <p>
	 * The array is walked two bytes at a time; a pair is accepted when its
	 * first byte, read as unsigned, is 161 or 163. The second byte of each
	 * pair is not inspected.
	 *
	 * @param sString the bytes to examine; must not be {@code null}
	 * @return {@code true} when accepted pairs cover the whole array
	 *         (including the empty array), {@code false} otherwise
	 */
	public static boolean isAllDelimiter(byte[] sString) {
		final int total = sString.length;
		int pos = 0;

		for (; pos < total - 1; pos += 2) {
			boolean leadIs161 = getUnsigned(sString[pos]) == 161;
			if (!leadIs161 && getUnsigned(sString[pos]) != 163) {
				// First pair that is not a delimiter ends the scan.
				break;
			}
		}

		// Anything left over (a rejected pair or a dangling single byte)
		// means the input was not all delimiters.
		return pos >= total;
	}

	/**
	 * Looks up a value in an int table using binary search.
	 * <p>
	 * The search only gives meaningful results when {@code table} is sorted
	 * in ascending order. If the value occurs more than once, the index of
	 * any one of the occurrences may be returned.
	 *
	 * @param val   the value to look for
	 * @param table the table to search; may be {@code null}
	 * @return the index of {@code val} in {@code table}, or {@code -1} when
	 *         it is not present or {@code table} is {@code null}
	 */
	public static int binarySearch(int val, int[] table) {
		if (table == null) {
			return -1;
		}

		int start = 0;
		int end = table.length - 1;

		while (start <= end) {
			// start + (end - start) / 2 cannot overflow, unlike
			// (start + end) / 2 when both bounds are large.
			int mid = start + (end - start) / 2;

			if (table[mid] == val) {
				return mid;
			} else if (table[mid] < val) {
				start = mid + 1;
			} else {
				end = mid - 1;
			}
		}

		return -1; // not found
	}

	/**
	 * Decides whether a word is treated as foreign.
	 * <p>
	 * A non-null word is reported as foreign when it is longer than two
	 * characters, or when its foreign-character count (see
	 * {@link #getForeignCharCount(String)}) is at least half of its length,
	 * using integer division. As a consequence the empty string is reported
	 * as foreign.
	 *
	 * @param word the word to classify; may be {@code null}
	 * @return {@code true} if the word meets either condition above;
	 *         {@code false} otherwise or when {@code word} is {@code null}
	 */
	public static boolean isForeign(String word) {
		if (word == null) {
			return false;
		}

		final int foreignChars = getForeignCharCount(word);
		final int length = word.length();

		// NOTE(review): the thresholds look like they were written for a
		// byte length rather than a char count - confirm before changing.
		boolean longerThanTwo = length > 2;
		return longerThanTwo || foreignChars >= length / 2;
	}

	/**
	 * Decides whether every character of a word is a foreign
	 * (transliteration) character.
	 * <p>
	 * The foreign-character count from
	 * {@link #getForeignCharCount(String)} is a count of characters, so the
	 * word is all-foreign exactly when that count equals the word's length.
	 *
	 * @param sWord the word to classify; may be {@code null}
	 * @return {@code true} if all characters are foreign (the empty string
	 *         qualifies); {@code false} otherwise or when {@code sWord} is
	 *         {@code null}
	 */
	public static boolean isAllForeign(String sWord) {
		// Consistent with isForeign: a null word is simply "not foreign"
		// instead of a NullPointerException.
		if (sWord == null) {
			return false;
		}

		int nForeignCount = getForeignCharCount(sWord);

		// Compare char count with char count. The former "2 * count" form
		// assumed a two-bytes-per-character length and therefore rejected
		// fully foreign words while accepting half-foreign ones.
		return nForeignCount == sWord.length();
	}

	/**
	 * Decides whether a word is written entirely with Chinese numeral
	 * characters.
	 * <p>
	 * Every character must belong to the numeral set, with two exceptions:
	 * the two-character sequence {@code "分之"} (as in {@code "百分之五"}) is
	 * accepted wherever it occurs, and the first character may instead be one
	 * of the prefix characters {@code "几数第上成"}.
	 *
	 * @param word the word to examine; may be {@code null}
	 * @return {@code true} if every character is accepted (the empty string
	 *         qualifies); {@code false} otherwise or when {@code word} is
	 *         {@code null}
	 */
	public static boolean isAllChineseNum(String word) {
		if (word == null) {
			return false;
		}

		String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·．／点";
		String prefix = "几数第上成";

		for (int i = 0; i < word.length(); i++) {
			// Match "分之" exactly at position i. Searching from i onwards
			// (indexOf) let unrelated leading characters slip through and
			// skipped one character too many.
			if (word.startsWith("分之", i)) {
				i++; // together with the loop increment this skips both characters
				continue;
			}

			char c = word.charAt(i);
			boolean isNumeral = chineseNum.indexOf(c) != -1;
			boolean isLeadingPrefix = i == 0 && prefix.indexOf(c) != -1;
			if (!isNumeral && !isLeadingPrefix) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Counts the foreign (transliteration) characters in a word.
	 * <p>
	 * The word is measured against the {@code TRANS_ENGLISH},
	 * {@code TRANS_JAPANESE} and {@code TRANS_RUSSIAN} character sets via
	 * {@link #getCharCount(String, String)}, and the largest of the three
	 * counts is returned.
	 *
	 * @param sWord the word to measure; {@code null} yields {@code 0}
	 * @return the highest per-character-set match count
	 */
	public static int getForeignCharCount(String sWord) {
		int englishHits = getCharCount(TRANS_ENGLISH, sWord);
		int japaneseHits = getCharCount(TRANS_JAPANESE, sWord);
		int russianHits = getCharCount(TRANS_RUSSIAN, sWord);

		return Math.max(englishHits, Math.max(japaneseHits, russianHits));
	}

	/**
	 * Counts how many characters of a word belong to a character set.
	 * <p>
	 * Each character of {@code word} is counted once if it occurs anywhere
	 * in {@code charSet}; repeated characters in {@code word} are counted
	 * each time they appear.
	 *
	 * @param charSet the characters to look for
	 * @param word    the text to scan; may be {@code null}
	 * @return the number of characters in {@code word} that occur in
	 *         {@code charSet}, or {@code 0} when {@code word} is {@code null}
	 */
	public static int getCharCount(String charSet, String word) {
		if (word == null) {
			return 0;
		}

		int hits = 0;
		for (char ch : word.toCharArray()) {
			if (charSet.indexOf(ch) >= 0) {
				hits++;
			}
		}
		return hits;
	}

	/**
	 * Determines which foreign language a word most resembles.
	 * <p>
	 * The word is measured against the English, Russian and Japanese
	 * transliteration character sets with
	 * {@link #getCharCount(String, String)}. The type with the highest count
	 * wins; a later candidate replaces an earlier one only when its count is
	 * strictly higher, so ties resolve in the order English, Russian,
	 * Japanese.
	 *
	 * @param sWord the word to classify; {@code null} yields
	 *              {@code TT_ENGLISH} because every count is then zero
	 * @return {@code TT_ENGLISH}, {@code TT_RUSSIAN} or {@code TT_JAPANESE}
	 */
	public int GetForeignType(String sWord) {
		final int englishHits = getCharCount(TRANS_ENGLISH, sWord);
		final int russianHits = getCharCount(TRANS_RUSSIAN, sWord);
		final int japaneseHits = getCharCount(TRANS_JAPANESE, sWord);

		// English is the default; the others must beat the running best.
		int bestHits = englishHits;
		int bestType = TT_ENGLISH;

		if (russianHits > bestHits) {
			bestHits = russianHits;
			bestType = TT_RUSSIAN;
		}
		if (japaneseHits > bestHits) {
			bestType = TT_JAPANESE;
		}

		return bestType;
	}

	/**
	 * Reads {@code len} bytes from a stream into a newly allocated array.
	 * <p>
	 * This is a best-effort read: if the stream fails or ends early, the
	 * exception's stack trace is printed and the array is still returned,
	 * with the positions that were not read left at zero.
	 *
	 * @param in  the stream to read from; may be {@code null}
	 * @param len the number of bytes to read
	 * @return an array of length {@code len}, or {@code null} when
	 *         {@code in} is {@code null} or {@code len} is not positive
	 */
	public static byte[] readBytes(DataInputStream in, int len) {
		if (in == null || len <= 0) {
			return null;
		}

		byte[] b = new byte[len];
		try {
			// One bulk read instead of len separate readByte() calls.
			in.readFully(b);
		} catch (IOException e) {
			// Deliberately not rethrown: callers expect an array back even
			// when the input is short, and the signature has no throws clause.
			e.printStackTrace();
		}

		return b;
	}

	/**
	 * Splits a word (given as bytes) into a stem and a trailing postfix.
	 * <p>
	 * Multi-character postfixes from {@code POSTFIX_MUTIPLE} are tried first;
	 * when none is selected, the last two bytes of the word are checked
	 * against {@code POSTFIX_SINGLE}. The results are written into the
	 * caller-supplied output arrays, each followed by a zero byte.
	 *
	 * @param sWord    the word to split
	 * @param sWordRet output buffer that receives the word without its
	 *                 postfix, followed by a zero byte; it needs room for
	 *                 at least {@code sWord.length - postfixLength + 1} bytes
	 * @param sPostfix output buffer that receives the postfix, followed by a
	 *                 zero byte
	 * @return always {@code true}
	 */
	public static boolean PostfixSplit(byte[] sWord, byte[] sWordRet, byte[] sPostfix) {
		// Byte forms of the postfix tables (getBytes() uses the platform
		// default charset).
		byte[] sSinglePostfix = POSTFIX_SINGLE.getBytes();
		// The initial [9]-wide rows are placeholders; each row is replaced
		// by the bytes of the corresponding postfix in the loop below.
		byte[][] sMultiPostfix = new byte[POSTFIX_MUTIPLE.length][9];
		for (int i = 0; i < sMultiPostfix.length; i++)
			sMultiPostfix[i] = POSTFIX_MUTIPLE[i].getBytes();
		int nPostfixLen = 0, nWordLen = sWord.length;
		int i = 0;

		// Advance through the multi-character postfixes until the first byte
		// of the current entry is zero or strncmp stops returning false for
		// the tail of the word. Since nWordLen == sWord.length, the third
		// bytesCopy argument reduces to sMultiPostfix[i].length.
		// NOTE(review): there is no explicit bound on i; the loop appears to
		// rely on a terminating entry in POSTFIX_MUTIPLE - confirm.
		while (sMultiPostfix[i][0] != 0
				&& strncmp(GFCommon.bytesCopy(sWord, nWordLen - sMultiPostfix[i].length, sWord.length - nWordLen
						+ sMultiPostfix[i].length), 0, sMultiPostfix[i], sMultiPostfix[i].length) == false) {// Try to get the postfix of an address
			i++;
		}
		// NOTE(review): the last argument is the number of postfix entries
		// (sMultiPostfix.length), not the length of the selected entry -
		// confirm this is intended.
		GFCommon.bytesCopy(sPostfix, sMultiPostfix[i], 0, sMultiPostfix.length);
		nPostfixLen = sMultiPostfix[i].length;// Get the length of place
		// postfix

		// No multi-character postfix selected: fall back to the last two
		// bytes of the word and look them up among the single postfixes.
		if (nPostfixLen == 0) {
			sPostfix[2] = 0;
			strncpy(sPostfix, GFCommon.bytesCopy(sWord, nWordLen - 2, 2), 2);
			if (CC_Find(sSinglePostfix, sPostfix))
				nPostfixLen = 2;
		}

		// Copy the word without its postfix and zero-terminate both outputs.
		strncpy(sWordRet, sWord, nWordLen - nPostfixLen);
		sWordRet[nWordLen - nPostfixLen] = 0;// Get the place name which have
		// erasing the postfix
		sPostfix[nPostfixLen] = 0;
		return true;
	}

	/**
	 * 比较第二个字节数组是否在第一个中出现
	 * 
	 * @param b1
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -