⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utility.java

📁 基于词典的分词工具,用于对文本文件的分词
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
	 * @param b2
	 * @return the index of the first occurrence, or -1 if there is none
	 */
	public static int strstr(byte[] b1, byte[] b2) {
		// Locates the first occurrence of the byte sequence b2 inside b1.
		// A null argument or an empty pattern is reported as "not found" (-1).
		if (b1 == null || b2 == null || b2.length == 0) {
			return -1;
		}

		// Only start positions that leave room for the whole pattern can match.
		int lastStart = b1.length - b2.length;
		for (int i = 0; i <= lastStart; i++) {
			// The match state is tracked per start position, so a failed attempt
			// at an earlier position cannot hide a later genuine match.
			boolean matched = true;
			for (int j = 0; j < b2.length; j++) {
				if (b1[i + j] != b2[j]) {
					matched = false;
					break;
				}
			}
			if (matched) {
				return i;
			}
		}

		return -1;
	}

	/**
	 * Finds the first position of a byte value within an array.
	 *
	 * @param bs array to scan; may be null
	 * @param b  byte value to look for
	 * @return index of the first element equal to {@code b}, or -1 when
	 *         {@code bs} is null or does not contain the value
	 */
	public static int strchr(byte[] bs, byte b) {
		if (bs == null) {
			return -1;
		}
		int pos = 0;
		// Advance until the value is found or the array is exhausted.
		while (pos < bs.length && bs[pos] != b) {
			pos++;
		}
		return pos < bs.length ? pos : -1;
	}

	/**
	 * Compares two byte arrays index by index over the positions
	 * {@code startIndex} (inclusive) to {@code len} (exclusive).
	 * <p>
	 * The return value signals a difference, not equality: {@code true} is
	 * returned as soon as a differing pair of bytes is found. {@code false} is
	 * returned when no difference is found in that range, and also when the
	 * comparison is not attempted (either array is null, {@code len} is not
	 * positive, or either array is shorter than {@code len}).
	 * <p>
	 * NOTE(review): the earlier summary of this method described it as an
	 * equality test, which is the opposite of what the code returns. Callers
	 * should be checked before the polarity is changed.
	 *
	 * @param b1         first array; may be null
	 * @param startIndex first index compared in both arrays
	 * @param b2         second array; may be null
	 * @param len        exclusive upper bound of the compared indices
	 * @return {@code true} if a differing byte pair was found, otherwise
	 *         {@code false}
	 */
	public static boolean strncmp(byte[] b1, int startIndex, byte[] b2, int len) {
		boolean comparable = b1 != null && b2 != null && len > 0
				&& b1.length >= len && b2.length >= len;
		if (!comparable) {
			return false;
		}

		int pos = startIndex;
		while (pos < len) {
			if (b1[pos] != b2[pos]) {
				return true;
			}
			pos++;
		}
		return false;
	}

	/**
	 * Converts a signed byte to its unsigned value.
	 *
	 * @param b the byte to convert
	 * @return the value of {@code b} read as an unsigned 8-bit number, in the
	 *         range 0 to 255
	 */
	public static int getUnsigned(byte b) {
		// Masking the sign-extended int with 0xFF keeps only the low eight bits,
		// which handles positive, zero and negative bytes without a branch.
		return b & 0xFF;
	}

	/**
	 * Copies the first {@code len} bytes of {@code src} to the start of
	 * {@code dest}.
	 * <p>
	 * Nothing is copied when either array is null or when {@code len} exceeds
	 * the length of either array; no error is reported in those cases.
	 *
	 * @param dest destination array; may be null
	 * @param src  source array; may be null
	 * @param len  number of bytes to copy
	 */
	public static void strncpy(byte[] dest, byte[] src, int len) {
		if (dest == null || src == null) {
			return;
		}
		if (len > dest.length || len > src.length) {
			return;
		}
		// A non-positive length copies nothing.
		if (len > 0) {
			System.arraycopy(src, 0, dest, 0, len);
		}
	}

	/**
	 * Computes the ID of a Chinese character in the 6768-entry GB code table.
	 * <p>
	 * The string is encoded as GBK and the ID is derived from the unsigned
	 * values of the first two bytes as
	 * {@code (first - 176) * 94 + (second - 161)}. The bytes are not checked
	 * for belonging to a single double-byte character, so input that does not
	 * start with such a character yields a meaningless (possibly negative)
	 * value.
	 *
	 * @param str text whose leading character is looked up; may be null
	 * @return the computed ID, or -1 when {@code str} is null, empty, or
	 *         encodes to fewer than two bytes
	 * @throws IllegalStateException if the runtime does not provide the GBK
	 *                               charset
	 */
	public static int CC_ID(String str) {
		if (str == null || str.length() == 0) {
			return -1;
		}

		byte[] b;
		try {
			// The formula assumes GB double-byte codes, so the encoding is named
			// explicitly instead of relying on the platform default charset.
			b = str.getBytes("GBK");
		} catch (UnsupportedEncodingException e) {
			throw new IllegalStateException("GBK charset is not available", e);
		}

		// A single-byte character has no second byte to read.
		if (b.length < 2) {
			return -1;
		}
		return ((b[0] & 0xFF) - 176) * 94 + ((b[1] & 0xFF) - 161);
	}

	/**
	 * Derives the first byte value of a Chinese character from its ID.
	 *
	 * @param id the Chinese character ID
	 * @return {@code id / 94 + 176}, using integer division
	 */
	public static int CC_CHAR1(int id) {
		final int quotient = id / 94;
		return 176 + quotient;
	}

	/**
	 * Derives the second byte value of a Chinese character from its ID.
	 *
	 * @param id the Chinese character ID
	 * @return {@code id % 94 + 161}
	 */
	public static int CC_CHAR2(int id) {
		final int remainder = id % 94;
		return 161 + remainder;
	}

	/**
	 * Appends the first {@code len} bytes of {@code src} to the content of
	 * {@code dest}, where the content of {@code dest} ends at its first zero
	 * byte.
	 * <p>
	 * The append is all-or-nothing: {@code dest} is left untouched unless all
	 * {@code len} bytes fit between the first zero byte and the end of the
	 * array.
	 *
	 * @param dest destination buffer; may be null
	 * @param src  bytes to append; may be null
	 * @param len  number of bytes of {@code src} to append
	 * @return the index in {@code dest} at which the appended bytes start, or
	 *         -1 when nothing was appended (null argument, non-positive
	 *         {@code len}, {@code len} larger than {@code src}, no zero byte in
	 *         {@code dest}, or not enough room after it)
	 */
	public static int strcat(byte[] dest, byte[] src, int len) {
		if (dest == null || src == null || len <= 0 || len > src.length) {
			return -1;
		}

		for (int i = 0; i < dest.length; i++) {
			if (dest[i] != 0) {
				continue;
			}
			// First zero byte found: this is where the appended data begins.
			if (dest.length - i < len) {
				return -1;
			}
			System.arraycopy(src, 0, dest, i, len);
			return i;
		}

		return -1;
	}

	/**
	 * Copies all bytes of {@code src} to the start of {@code dest}.
	 *
	 * @param dest destination array; may be null
	 * @param src  source array; may be null
	 * @return the number of bytes copied, or -1 when nothing was copied (null
	 *         argument, empty {@code src}, or {@code dest} too short)
	 */
	public static int strcpy(byte[] dest, byte[] src) {
		// Guard here: reading src.length on a null source would throw before the
		// three-argument overload gets a chance to reject it.
		if (src == null) {
			return -1;
		}
		return strcpy(dest, src, src.length);
	}

	/**
	 * Copies the first {@code len} bytes of {@code src} to the start of
	 * {@code dest}.
	 * <p>
	 * The copy is all-or-nothing: the bounds are checked up front so
	 * {@code dest} is never left partially written.
	 *
	 * @param dest destination array; may be null
	 * @param src  source array; may be null
	 * @param len  number of bytes to copy
	 * @return {@code len} when the bytes were copied, or -1 when nothing was
	 *         copied (null argument, non-positive {@code len}, or {@code len}
	 *         larger than either array)
	 */
	public static int strcpy(byte[] dest, byte[] src, int len) {
		if (dest == null || src == null || len <= 0) {
			return -1;
		}
		if (len > dest.length || len > src.length) {
			return -1;
		}
		System.arraycopy(src, 0, dest, 0, len);
		return len;
	}

	/**
	 * Returns the GB Chinese character that corresponds to an ID.
	 * <p>
	 * The two code bytes are {@code id / 94 + 176} and {@code id % 94 + 161}
	 * (the same arithmetic as {@code CC_CHAR1} and {@code CC_CHAR2}), decoded
	 * as GBK.
	 *
	 * @param id character ID, expected in the range 0 to 6767
	 * @return the decoded one-character string, or {@code null} when
	 *         {@code id} is out of range or GBK decoding is unsupported (the
	 *         stack trace is printed in the latter case)
	 */
	public static String getGB(int id) {
		if (id < 0 || id >= 6768) {
			return null;
		}

		byte[] code = { (byte) (id / 94 + 176), (byte) (id % 94 + 161) };
		try {
			return new String(code, "GBK");
		} catch (UnsupportedEncodingException ex) {
			ex.printStackTrace();
			return null;
		}
	}

	/**
	 * Tells whether a string encodes to exactly one byte in the platform
	 * default charset.
	 *
	 * @param s string to test; may be null
	 * @return {@code true} when {@code s} is non-null and {@code s.getBytes()}
	 *         has length 1, otherwise {@code false}
	 */
	public static boolean isSingle(String s) {
		if (s == null) {
			return false;
		}
		int encodedLength = s.getBytes().length;
		return encodedLength == 1;
	}

	/**
	 * Returns a copy of the leading part of an array, cut off at the first
	 * zero found after index 0.
	 * <p>
	 * The element at index 0 is always kept, even when it is zero.
	 *
	 * @param src array to trim; may be null
	 * @return a new array holding the kept prefix, or {@code null} when
	 *         {@code src} is null or empty
	 */
	public static int[] removeInvalid(int[] src) {
		if (src == null || src.length == 0) {
			return null;
		}

		// Index 0 is unconditionally part of the result; scan from index 1.
		int end = 1;
		while (end < src.length && src[end] != 0) {
			end++;
		}

		int[] kept = new int[end];
		System.arraycopy(src, 0, kept, 0, end);
		return kept;
	}

	/**
	 * Decides whether a string looks like the number part of a year
	 * expression.
	 * <p>
	 * With {@code len = snum.length()} and {@code first} being the first
	 * character, the string is accepted when any of these holds:
	 * <ul>
	 * <li>{@code isAllSingleByte(snum)} and either {@code len} is 4, or
	 * {@code len} is 2 and {@code GFString.cint(first)} is 0 or greater than
	 * 4;</li>
	 * <li>{@code isAllNum(snum)} and either {@code len} is at least 6, or
	 * {@code len} is 4 and {@code first} is one of the characters in
	 * "056789";</li>
	 * <li>{@code len} is at least 2 and {@code getCharCount} over the Chinese
	 * numeral characters equals {@code len};</li>
	 * <li>{@code len} is 4 and {@code getCharCount} over the thousand/zero
	 * characters is 2;</li>
	 * <li>{@code len} is 1 and {@code getCharCount} over the two thousand
	 * characters is 1;</li>
	 * <li>{@code len} is 2, {@code getCharCount} over the ten Heavenly Stems is
	 * 1 for the whole string and {@code getCharCount} over the twelve Earthly
	 * Branches is 1 for the second character.</li>
	 * </ul>
	 * NOTE(review): {@code isAllSingleByte}, {@code isAllNum},
	 * {@code getCharCount} and {@code GFString.cint} are defined elsewhere;
	 * their exact semantics should be confirmed there.
	 *
	 * @param snum the candidate string; may be null
	 * @return {@code true} if one of the rules above matches; {@code false}
	 *         otherwise, including for a null or empty string
	 */
	public static boolean isYearTime(String snum) {
		// An empty string has no first character to inspect, so it is rejected
		// up front instead of failing inside substring(0, 1).
		if (snum == null || snum.length() == 0) {
			return false;
		}

		int len = snum.length();
		String first = snum.substring(0, 1);

		// Four-character or two-character forms made of single-byte characters,
		// such as 1992, 98 or 06.
		if (isAllSingleByte(snum)
				&& (len == 4 || (len == 2 && (GFString.cint(first) > 4 || GFString.cint(first) == 0))))
			return true;
		if (isAllNum(snum) && (len >= 6 || (len == 4 && "056789".indexOf(first) != -1)))
			return true;
		// Years spelled digit by digit with Chinese numerals.
		if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2)
			return true;
		// Four-character forms such as "two thousand zero two".
		if (len == 4 && getCharCount("千仟零○", snum) == 2)
			return true;
		if (len == 1 && getCharCount("千仟", snum) == 1)
			return true;
		// Sexagenary-cycle years: one Heavenly Stem followed by one Earthly Branch.
		if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1
				&& getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1)
			return true;

		return false;
	}

	/**
	 * Checks whether every character of a string occurs in a set of
	 * characters.
	 *
	 * @param aggr the set of allowed characters, given as a string; may be
	 *             null
	 * @param str  the string whose characters are checked; may be null
	 * @return {@code true} when both arguments are non-null and every
	 *         character of {@code str} occurs in {@code aggr} (an empty
	 *         {@code str} therefore yields {@code true}); {@code false}
	 *         otherwise
	 */
	public static boolean isInAggregate(String aggr, String str) {
		if (aggr == null || str == null) {
			return false;
		}

		// Only the characters actually present in str are checked; nothing is
		// appended to it, so the outcome does not depend on aggr containing
		// any particular extra character.
		for (int i = 0; i < str.length(); i++) {
			if (aggr.indexOf(str.charAt(i)) == -1) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Checks whether a string consists only of half-width characters, judged
	 * by each character encoding to exactly one byte in the platform default
	 * charset.
	 *
	 * @param str string to test; may be null
	 * @return {@code false} when {@code str} is null or contains a character
	 *         that does not encode to a single byte; {@code true} otherwise,
	 *         including for the empty string
	 */
	public static boolean isDBCCase(String str) {
		if (str == null) {
			return false;
		}

		final int total = str.length();
		for (int pos = 0; pos < total; pos++) {
			int encodedLength = String.valueOf(str.charAt(pos)).getBytes().length;
			if (encodedLength != 1) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Checks whether a string consists only of full-width characters, judged
	 * by each character encoding to exactly two bytes in GBK.
	 *
	 * @param str string to test; may be null
	 * @return {@code true} when {@code str} is non-empty and every character
	 *         encodes to two GBK bytes; {@code false} otherwise, including for
	 *         a null or empty string
	 * @throws IllegalStateException if the runtime does not provide the GBK
	 *                               charset
	 */
	public static boolean isSBCCase(String str) {
		if (str == null || str.length() == 0) {
			return false;
		}

		try {
			for (int i = 0; i < str.length(); i++) {
				// The two-byte test only makes sense for a double-byte encoding,
				// so GBK is named explicitly instead of using the platform
				// default charset. Only the characters of str are examined.
				if (str.substring(i, i + 1).getBytes("GBK").length != 2) {
					return false;
				}
			}
		} catch (UnsupportedEncodingException e) {
			throw new IllegalStateException("GBK charset is not available", e);
		}
		return true;
	}

	/**
	 * Checks whether a string is a single hyphen used as a delimiter.
	 * <p>
	 * Both the ASCII hyphen-minus and the full-width hyphen-minus (U+FF0D) are
	 * accepted.
	 * <p>
	 * NOTE(review): this method previously compared against the ASCII hyphen
	 * twice. The second comparison is presumed to have been the full-width
	 * form, lost in a character-set conversion - confirm against the upstream
	 * source.
	 *
	 * @param str string to test; may be null
	 * @return {@code true} if {@code str} is exactly one of the accepted
	 *         hyphens, otherwise {@code false}
	 */
	public static boolean isDelimiter(String str) {
		// equals() on the literal is null-safe, so no separate null check is needed.
		return "-".equals(str) || "\uFF0D".equals(str);
	}

	/**
	 * Tells whether a word begins with the unknown-word marker prefix.
	 *
	 * @param word word to test; may be null
	 * @return {@code true} when {@code word} is non-null and starts with the
	 *         marker, otherwise {@code false}
	 */
	public static boolean isUnknownWord(String word) {
		if (word == null) {
			return false;
		}
		return word.startsWith("未##");
	}

	/**
	 * Splits a candidate Chinese personal name into a surname part and a
	 * given-name part, using the dictionary obtained from
	 * {@code personTagger.getUnknownDict()}.
	 * <p>
	 * {@code null} is returned when either argument is null, when the word is
	 * not 2 to 4 characters long, when any atom of the word is classified as
	 * neither {@code CT_CHINESE} nor {@code CT_OTHER}, when the frequency-based
	 * rejection rule near the end fires, or when the word is two characters
	 * long and {@code personTagger.isGivenName(word)} is true.
	 *
	 * @param word         the candidate name; may be null
	 * @param personTagger tagger supplying the dictionary and the given-name
	 *                     test; may be null
	 * @return a {@code PersonName} whose first name is set to the detected
	 *         surname (possibly null) and whose last name is set to the given
	 *         name, or {@code null} when the word is rejected
	 */
	public static PersonName chineseNameSplit(String word, PosTagger personTagger) {
		PersonName result = null;

		if (word != null && personTagger != null) {
			Dictionary personDict = personTagger.getUnknownDict();
			int len = word.length();
			// Only words of 2 to 4 characters are considered.
			if (len < 2 || len > 4)
				return null;
			String[] atoms = GFString.atomSplit(word);
			// Every atom must be classified as CT_CHINESE or CT_OTHER.
			for (String s : atoms) {
				if (Utility.charType(s) != Utility.CT_CHINESE && Utility.charType(s) != Utility.CT_OTHER)
					return null;
			}

			// Surname detection: try the two-character prefix first, then fall
			// back to the one-character prefix, then to "no surname".
			// NOTE(review): the second argument 1 of isExist/getFreq is presumably
			// the surname tag and 2 the given-name tag - confirm in Dictionary.
			String surName = null;
			int surNameLen = 2;
			if (len > 2)
				surName = word.substring(0, surNameLen);
			else if (len == 2)
				surName = word;
			if (!personDict.isExist(surName, 1)) {
				surNameLen = 1;
				if (len > 1)
					surName = word.substring(0, surNameLen);
				// Not reachable: len is at least 2 at this point.
				else if (len == 1)
					surName = word;
				if (!personDict.isExist(surName, 1)) {
					surName = null;
					surNameLen = 0;
				}
			}
			// The given name is whatever follows the surname.
			String giveName = word.substring(surNameLen);
			if (len > 3) {
				// For four-character words, if the character right after the
				// surname is also found in the dictionary under 1, the given
				// name starts one character later; that character then belongs
				// to neither surName nor giveName.
				String temp = word.substring(surNameLen, surNameLen + 1);
				if (personDict.isExist(temp, 1)) {

					giveName = word.substring(surNameLen + 1);
				}
			}

			// NOTE(review): surName can be null here; how getFreq treats a null
			// key is not visible from this method.
			double freq = personDict.getFreq(surName, 1);
			// NOTE(review): giveName is empty when a two-character word matched
			// as a two-character surname, in which case substring(0, 1) throws
			// StringIndexOutOfBoundsException.
			String temp = giveName.substring(0, 1);
			double freq2 = personDict.getFreq(temp, 2);

			// Rejection rule, applied only when the surname is not two
			// characters long. Because && binds tighter than ||, the
			// alternatives are:
			//   (a) no surname was found and the word is longer than 2;
			//   (b) the given name is longer than 2;
			//   (c) getForeignCharCount(word) >= 3 and both frequencies are below
			//       the thresholds derived from two reference entries;
			//   (d) freq < 10 and getForeignCharCount(giveName) equals
			//       (len - surNameLen) / 2 (integer division).
			if (surNameLen != 2
					&& ((surNameLen == 0 && len > 2) || giveName.length() > 2 || getForeignCharCount(word) >= 3
							&& freq < personDict.getFreq("张", 1) / 40 && freq2 < personDict.getFreq("华", 2) / 20 || (freq < 10 && getForeignCharCount(giveName) == (len - surNameLen) / 2)))
				return null;
			if (len == 2 && personTagger.isGivenName(word))
				return null;
			result = new PersonName();
			// The surname goes into the first-name field and the given name into
			// the last-name field.
			result.setFirstName(surName);
			result.setLastName(giveName);
		}
		return result;

	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -