📄 utility.java

📁 基于词典的分词工具,用于对文本文件的分词
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
package org.ictclas4j.utility;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.PersonName;
import org.ictclas4j.segment.PosTagger;

public class Utility {
	// Number of Chinese characters, including 5 empty positions between
	// 3756-3761. 6768 = 72 * 94: the 72 lead-byte values (176-247) that charType
	// treats as Chinese times the 94 trail-byte values (161-254) enumerated by
	// CC_Generate.
	public static final int CC_NUM = 6768;

	// NOTE(review): presumably the maximum length of a single word; the unit
	// (bytes or characters) is not visible in this part of the file - confirm.
	public static final int WORD_MAXLENGTH = 100;

	// WT_* codes. NOTE(review): presumably "word type" values; they are not used
	// in the visible part of the file - confirm against callers.
	public static final int WT_DELIMITER = 0;

	public static final int WT_CHINESE = 1;

	public static final int WT_OTHER = 2;

	// CT_* codes: character types. Every value from CT_SINGLE upwards is a
	// possible result of charType(String).
	public static final int CT_SENTENCE_BEGIN = 1;// Sentence begin

	public static final int CT_SENTENCE_END = 4;// Sentence ending

	public static final int CT_SINGLE = 5;// Single-byte character (lead byte < 128) that is not a delimiter

	public static final int CT_DELIMITER = CT_SINGLE + 1;// Delimiter: listed ASCII punctuation, or lead byte 161 / other lead byte 163

	public static final int CT_CHINESE = CT_SINGLE + 2;// Chinese character (lead byte 176-247)

	public static final int CT_LETTER = CT_SINGLE + 3;// Double-byte letter (lead byte 163, trail byte 193-218 or 225-250)

	public static final int CT_NUM = CT_SINGLE + 4;// Double-byte digit (lead byte 163, trail byte 176-185)

	public static final int CT_INDEX = CT_SINGLE + 5;// Index/numbering symbol (lead byte 162)

	public static final int CT_OTHER = CT_SINGLE + 12;// Anything else, including null or empty input

	// NOTE(review): the two limits below are not referenced in the visible part
	// of the file; their exact meaning should be confirmed against callers.
	public static final int MAX_WORDS = 650;

	public static final int MAX_SEGMENT_NUM = 10;

	// Single-character suffixes, stored as one string with one suffix per
	// character. NOTE(review): judging by the entries these are place-name
	// suffixes - confirm against callers.
	public static final String POSTFIX_SINGLE = "坝邦堡杯城池村单岛道堤店洞渡队法峰府冈港阁宫沟国海号河湖环集江奖礁角街井郡坑口矿里岭楼路门盟庙弄牌派坡铺旗桥区渠泉人山省市水寺塔台滩坛堂厅亭屯湾文屋溪峡县线乡巷型洋窑营屿语园苑院闸寨站镇州庄族陂庵町";

	// Multi-character counterparts of POSTFIX_SINGLE. The array ends with an
	// empty string. NOTE(review): presumably an end-of-list sentinel - confirm
	// before removing it.
	public static final String[] POSTFIX_MUTIPLE = { "半岛", "草原", "城市", "大堤", "大公国", "大桥", "地区", "帝国", "渡槽", "港口",
			"高速公路", "高原", "公路", "公园", "共和国", "谷地", "广场", "国道", "海峡", "胡同", "机场", "集镇", "教区", "街道", "口岸", "码头", "煤矿",
			"牧场", "农场", "盆地", "平原", "丘陵", "群岛", "沙漠", "沙洲", "山脉", "山丘", "水库", "隧道", "特区", "铁路", "新村", "雪峰", "盐场", "盐湖",
			"渔场", "直辖市", "自治区", "自治县", "自治州", "" };

	// Character tables, one character per entry, named after the TT_ENGLISH,
	// TT_RUSSIAN and TT_JAPANESE translation types. NOTE(review): presumably the
	// characters that typically occur in transliterated English, Russian and
	// Japanese names - confirm against callers.
	public static final String TRANS_ENGLISH = "·—阿埃艾爱安昂敖奥澳笆芭巴白拜班邦保堡鲍北贝本比毕彼别波玻博勃伯泊卜布才采仓查差柴彻川茨慈次达大戴代丹旦但当道德得的登迪狄蒂帝丁东杜敦多额俄厄鄂恩尔伐法范菲芬费佛夫福弗甫噶盖干冈哥戈革葛格各根古瓜哈海罕翰汗汉豪合河赫亨侯呼胡华霍基吉及加贾坚简杰金京久居君喀卡凯坎康考柯科可克肯库奎拉喇莱来兰郎朗劳勒雷累楞黎理李里莉丽历利立力连廉良列烈林隆卢虏鲁路伦仑罗洛玛马买麦迈曼茅茂梅门蒙盟米蜜密敏明摩莫墨默姆木穆那娜纳乃奈南内尼年涅宁纽努诺欧帕潘畔庞培佩彭皮平泼普其契恰强乔切钦沁泉让热荣肉儒瑞若萨塞赛桑瑟森莎沙山善绍舍圣施诗石什史士守斯司丝苏素索塔泰坦汤唐陶特提汀图土吐托陀瓦万王旺威韦维魏温文翁沃乌吾武伍西锡希喜夏相香歇谢辛新牙雅亚彦尧叶依伊衣宜义因音英雍尤于约宰泽增詹珍治中仲朱诸卓孜祖佐伽娅尕腓滕济嘉津赖莲琳律略慕妮聂裴浦奇齐琴茹珊卫欣逊札哲智兹芙汶迦珀琪梵斐胥黛";

	public static final String TRANS_RUSSIAN = "·阿安奥巴比彼波布察茨大德得丁杜尔法夫伏甫盖格哈基加坚捷金卡科可克库拉莱兰勒雷里历利连列卢鲁罗洛马梅蒙米姆娜涅宁诺帕泼普奇齐乔切日萨色山申什斯索塔坦特托娃维文乌西希谢亚耶叶依伊以扎佐柴达登蒂戈果海赫华霍吉季津柯理琳玛曼穆纳尼契钦丘桑沙舍泰图瓦万雅卓兹";

	public static final String TRANS_JAPANESE = "安奥八白百邦保北倍本比滨博步部彩菜仓昌长朝池赤川船淳次村大代岛稻道德地典渡尔繁饭风福冈高工宫古谷关广桂贵好浩和合河黑横恒宏后户荒绘吉纪佳加见健江介金今进井静敬靖久酒菊俊康可克口梨理里礼栗丽利立凉良林玲铃柳隆鹿麻玛美萌弥敏木纳南男内鸟宁朋片平崎齐千前浅桥琴青清庆秋丘曲泉仁忍日荣若三森纱杉山善上伸神圣石实矢世市室水顺司松泰桃藤天田土万望尾未文武五舞西细夏宪相小孝新星行雄秀雅亚岩杨洋阳遥野也叶一伊衣逸义益樱永由有佑宇羽郁渊元垣原远月悦早造则泽增扎宅章昭沼真政枝知之植智治中忠仲竹助椎子佐阪坂堀荻菅薰浜濑鸠筱";

	// Translation (transliteration) type. NOTE(review): presumably selects one
	// of the TRANS_ENGLISH / TRANS_RUSSIAN / TRANS_JAPANESE tables - confirm.
	public static final int TT_ENGLISH = 0;

	public static final int TT_RUSSIAN = 1;

	public static final int TT_JAPANESE = 2;

	// Separator characters. The "_C_" strings hold full-width (Chinese)
	// punctuation and the "_E_" strings hold ASCII punctuation; "SENTENCE"
	// strings hold sentence-ending marks and "SUB_SENTENCE" strings hold
	// clause-level marks.
	public static final String SEPERATOR_C_SENTENCE = "。！？：；…";

	public static final String SEPERATOR_C_SUB_SENTENCE = "、，（）“”‘’";

	public static final String SEPERATOR_E_SENTENCE = "!?:;";

	public static final String SEPERATOR_E_SUB_SENTENCE = ",()\"'";

	// Whitespace: line feed, carriage return, ASCII space and the full-width
	// (ideographic) space.
	public static final String SEPERATOR_LINK = "\n\r 　";

	// Sentence begin and ending string
	public static final String SENTENCE_BEGIN = "始##始";

	public static final String SENTENCE_END = "末##末";

	// Separator between two words
	public static final String WORD_SEGMENTER = "@";

	// NOTE(review): the limits below are not referenced in the visible part of
	// the file; their meaning is inferred from the names only - confirm.
	public static final int MAX_WORDS_PER_SENTENCE = 120;

	public static final int MAX_UNKNOWN_PER_SENTENCE = 200;

	public static final int MAX_POS_PER_WORD = 20;

	public static final int LITTLE_FREQUENCY = 6;

	// Kinds of tagging. NOTE(review): presumably consumed by the POS tagger -
	// confirm against callers.
	public enum TAG_TYPE {
		TT_NORMAL, TT_PERSON, TT_PLACE, TT_TRANS_PERSON
	};

	public static final int MAX_FREQUENCE = 2079997;// = 1993123 + 86874

	// An earlier note gave 7528283+329805, which does not add up to the value
	// above; 1993123+86874 does.

	public static final int MAX_SENTENCE_LEN = 2000;

	public static final double INFINITE_VALUE = 10000.00;

	// Smoothing parameter
	public static final double SMOOTH_PARAM = 0.1;

	// Placeholder strings for unknown-word classes. The character after "##"
	// reads person, place, number, time and string respectively.
	public static final String UNKNOWN_PERSON = "未##人";

	public static final String UNKNOWN_SPACE = "未##地";

	public static final String UNKNOWN_NUM = "未##数";

	public static final String UNKNOWN_TIME = "未##时";

	public static final String UNKNOWN_LETTER = "未##串";

	/**
	 * Writes a listing of all two-byte code pairs to {@code fileName}.
	 *
	 * <p>
	 * For every lead value 161..254 and every trail value 161..254 one line of
	 * the form {@code <lead><trail>,<lead>,<trail>} is written, with all values
	 * in decimal (the first line is {@code 161161,161,161}); 94 * 94 lines in
	 * total. An existing file is overwritten.
	 *
	 * @param fileName
	 *            path of the file to create
	 * @return {@code true} if the whole listing was written; {@code false} if
	 *         {@code fileName} is {@code null}, the file could not be opened for
	 *         writing, or a write error occurred
	 */
	public static boolean gbGenerate(String fileName) {
		if (fileName == null)
			return false;

		PrintWriter out;
		try {
			out = new PrintWriter(new FileOutputStream(new File(fileName)));
		} catch (FileNotFoundException e) {
			// The file cannot be created or opened: report the failure to the
			// caller instead of claiming success.
			e.printStackTrace();
			return false;
		}

		try {
			for (int i = 161; i < 255; i++)
				for (int j = 161; j < 255; j++)
					out.println("" + i + j + "," + i + "," + j);
			// PrintWriter never throws on I/O failure; checkError() flushes the
			// stream and reports whether any write went wrong.
			return !out.checkError();
		} finally {
			// Always release the file handle, whatever happened above.
			out.close();
		}
	}

	/***************************************************************************
	 * 
	 * Func Name : CC_Generate
	 * 
	 * Description: Generate the Chinese Char List file
	 * 
	 * 
	 * Parameters : sFilename: the file name for the output CC List
	 * 
	 * Returns : public static boolean Author : Kevin Zhang History : 1.create
	 * 2002-1-8
	 **************************************************************************/
	/**
	 * Writes the Chinese character code list to {@code fileName}.
	 *
	 * <p>
	 * For every lead value 176..254 and every trail value 161..254 one line of
	 * the form {@code <lead><trail>,<lead>,<trail>} is written, with all values
	 * in decimal (the first line is {@code 176161,176,161}); 79 * 94 lines in
	 * total. An existing file is overwritten.
	 *
	 * @param fileName
	 *            path of the file to create
	 * @return {@code true} if the whole list was written; {@code false} if
	 *         {@code fileName} is {@code null}, the file could not be opened for
	 *         writing, or a write error occurred
	 */
	public static boolean CC_Generate(String fileName) {
		if (fileName == null)
			return false;

		PrintWriter out;
		try {
			out = new PrintWriter(new FileOutputStream(new File(fileName)));
		} catch (FileNotFoundException e) {
			// The file cannot be created or opened: report the failure to the
			// caller instead of claiming success.
			e.printStackTrace();
			return false;
		}

		try {
			for (int i = 176; i < 255; i++)
				for (int j = 161; j < 255; j++)
					out.println("" + i + j + "," + i + "," + j);
			// PrintWriter never throws on I/O failure; checkError() flushes the
			// stream and reports whether any write went wrong.
			return !out.checkError();
		} finally {
			// Always release the file handle, whatever happened above.
			out.close();
		}
	}

	/***************************************************************************
	 * 
	 * Func Name : CC_Find
	 * 
	 * Description: Find a Chinese sub-string in the Chinese String
	 * 
	 * 
	 * Parameters : string:Null-terminated string to search
	 * 
	 * strCharSet:Null-terminated string to search for
	 * 
	 * Returns : String Author : Kevin Zhang History : 1.create 2002-1-8
	 **************************************************************************/
	/**
	 * Tests whether {@code strCharSet} occurs in {@code string} on a two-byte
	 * character boundary.
	 *
	 * <p>
	 * The match offset comes from {@code strstr}. An odd offset is rejected
	 * because, in a buffer made up only of two-byte characters, such a match
	 * would start on the second byte of one character and end on the first byte
	 * of the next. NOTE(review): this assumes {@code string} holds two-byte
	 * characters only - confirm against callers.
	 *
	 * @param string
	 *            bytes to search in; may be {@code null}
	 * @param strCharSet
	 *            bytes to search for; may be {@code null}
	 * @return {@code true} only if a match is found at an even offset;
	 *         {@code false} if either argument is {@code null}, nothing is
	 *         found, or the match lies at an odd offset
	 */
	public static boolean CC_Find(final byte[] string, final byte[] strCharSet) {
		// Nothing can be found in, or with, a missing buffer.
		if (string == null || strCharSet == null)
			return false;

		int index = strstr(string, strCharSet);
		// A negative index means "not found" and must not be reported as a hit.
		return index >= 0 && index % 2 == 0;
	}

	/***************************************************************************
	 * 
	 * Func Name : charType
	 * 
	 * Description: Judge the type of sChar or (sChar,sChar+1)
	 * 
	 * 
	 * Parameters : str: the string to classify; only its leading character is
	 * examined
	 * 
	 * Returns : int : the type of char Author : Kevin Zhang History : 1.create
	 * 2002-1-8
	 **************************************************************************/
	/**
	 * Classifies the leading character of {@code str} by its GBK byte values.
	 *
	 * <p>
	 * Only the first one or two encoded bytes are examined, in this order:
	 * <ul>
	 * <li>lead byte below 128: {@link #CT_DELIMITER} for the ASCII punctuation
	 * listed in the code, otherwise {@link #CT_SINGLE};</li>
	 * <li>lead byte 162: {@link #CT_INDEX};</li>
	 * <li>lead byte 163, trail byte 176-185: {@link #CT_NUM};</li>
	 * <li>lead byte 163, trail byte 193-218 or 225-250: {@link #CT_LETTER};</li>
	 * <li>lead byte 161, or any other lead byte 163: {@link #CT_DELIMITER};</li>
	 * <li>lead byte 176-247: {@link #CT_CHINESE};</li>
	 * <li>anything else: {@link #CT_OTHER}.</li>
	 * </ul>
	 *
	 * @param str
	 *            the string to classify; may be {@code null} or empty
	 * @return one of the {@code CT_*} constants; {@link #CT_OTHER} for
	 *         {@code null} or empty input
	 */
	public static int charType(String str) {
		if (str == null || str.length() == 0)
			return CT_OTHER;

		// The ranges below are GB2312/GBK code values, so encode explicitly
		// instead of relying on the platform default charset (under UTF-8 every
		// double-byte test would look at the wrong bytes).
		byte[] b;
		try {
			b = str.getBytes("GBK");
		} catch (UnsupportedEncodingException e) {
			// GBK is not installed in this JRE: fall back to the platform
			// default, as this method did before.
			b = str.getBytes();
		}
		if (b.length == 0)
			return CT_OTHER;

		byte b1 = b[0];
		byte b2 = b.length > 1 ? b[1] : 0;
		int lead = getUnsigned(b1);
		int trail = getUnsigned(b2);

		if (lead < 128) {
			if ("\"!,.?()[]{}+=".indexOf((char) b1) != -1)
				return CT_DELIMITER;
			return CT_SINGLE;
		}
		if (lead == 162)
			return CT_INDEX;
		if (lead == 163) {
			if (trail > 175 && trail < 186)
				return CT_NUM;
			if ((trail >= 193 && trail <= 218) || (trail >= 225 && trail <= 250))
				return CT_LETTER;
			// Every remaining character with lead byte 163 counts as a delimiter.
			return CT_DELIMITER;
		}
		if (lead == 161)
			return CT_DELIMITER;
		if (lead >= 176 && lead <= 247)
			return CT_CHINESE;

		return CT_OTHER;
	}

	/***************************************************************************
	 * 
	 * Func Name : GetCCPrefix
	 * 
	 * Description: Get the max Prefix string made up of Chinese Char
	 * 
	 * 
	 * Parameters : sSentence: the original sentence which includes Chinese or
	 * Non-Chinese char
	 * 
	 * Returns : the end of the sub-sentence Author : Kevin Zhang History :
	 * 1.create 2002-1-8
	 **************************************************************************/
	/**
	 * Returns the length in bytes of the longest prefix of {@code sSentence}
	 * that consists of complete two-byte Chinese characters.
	 *
	 * <p>
	 * A two-byte unit counts as Chinese when its lead byte is in the range
	 * 176-247; the trail byte is not inspected.
	 *
	 * @param sSentence
	 *            the encoded sentence; may be {@code null}
	 * @return the offset of the first byte after the Chinese prefix; always
	 *         even and never greater than {@code sSentence.length}; 0 for
	 *         {@code null} or empty input
	 */
	public static int getCCPrefix(byte[] sSentence) {
		if (sSentence == null)
			return 0;

		int nLen = sSentence.length;
		int nCurPos = 0;
		// Require both bytes of the character to be present, so that a dangling
		// lead byte at the end cannot push the result past the array.
		while (nCurPos + 1 < nLen) {
			int lead = getUnsigned(sSentence[nCurPos]);
			if (lead <= 175 || lead >= 248)
				break;
			nCurPos += 2;// Get next Chinese Char
		}
		return nCurPos;
	}

	/***************************************************************************
	 * 
	 * Func Name : isAllChinese
	 * 
	 * Description: Judge whether the string is made up entirely of Chinese
	 * characters
	 * 
	 * 
	 * Parameters : str: the string to test
	 * 
	 * Returns : true if the string is judged to be all Chinese, false otherwise
	 * (including when str is null)
	 * 
	 * Author : Kevin Zhang History : 1.create 2002-1-24
	 **************************************************************************/
	/**
	 * Tells whether every character of {@code str} is a two-byte Chinese
	 * character.
	 *
	 * <p>
	 * The string is encoded as GBK and walked two bytes at a time; each lead
	 * byte must be in the range 176-247. Any single-byte character (for example
	 * ASCII) therefore makes the result {@code false}. The trail byte is not
	 * inspected.
	 *
	 * @param str
	 *            the string to test; may be {@code null}
	 * @return {@code true} if {@code str} is empty or consists only of Chinese
	 *         characters; {@code false} otherwise, including for {@code null}
	 */
	public static boolean isAllChinese(String str) {
		if (str == null)
			return false;

		// The lead-byte range is a GB2312/GBK code range, so encode explicitly
		// instead of relying on the platform default charset.
		byte[] b;
		try {
			b = str.getBytes("GBK");
		} catch (UnsupportedEncodingException e) {
			// GBK is not installed in this JRE: fall back to the platform default.
			b = str.getBytes();
		}

		int i = 0;
		while (i + 1 < b.length) {
			int lead = getUnsigned(b[i]);
			if (lead <= 175 || lead >= 248)
				return false;
			i += 2;
		}
		// A leftover byte means the string ended with a single-byte character.
		return i >= b.length;
	}

	/***************************************************************************
	 * 
	 * Func Name : isAllNonChinese
	 * 
	 * Description: Judge whether the byte string contains no Chinese characters
	 * 
	 * 
	 * Parameters : sString: the encoded string, which may include Chinese or
	 * Non-Chinese char
	 * 
	 * Returns : true if no character with a lead byte in the Chinese range
	 * (176-247) is found, false otherwise
	 * 
	 * Author : Kevin Zhang History : 1.create 2002-1-24
	 **************************************************************************/
	/**
	 * Tells whether the encoded string contains no Chinese character.
	 *
	 * <p>
	 * The bytes are scanned character by character: a negative byte value
	 * starts a two-byte character, anything else is a single byte. The scan
	 * stops with {@code false} at the first character whose lead byte is in the
	 * range 176-247.
	 *
	 * @param sString
	 *            the encoded string; must not be {@code null}
	 * @return {@code true} if no Chinese lead byte was met, {@code false}
	 *         otherwise
	 */
	public static boolean isAllNonChinese(byte[] sString) {
		for (int pos = 0; pos < sString.length;) {
			final byte current = sString[pos];
			final int value = getUnsigned(current);
			if (value > 175 && value < 248)
				return false;
			// Skip the trail byte as well when this is a two-byte character.
			pos += (current < 0) ? 2 : 1;
		}
		return true;
	}

	/***************************************************************************
	 * 
	 * Func Name : IsAllSingleByte
	 * 
	 * Description: Judge the string is all made up of Single Byte Char
	 * 
	 * 
	 * Parameters : str: the string to test
	 * 
	 * Returns : true if every character is a single-byte character, false
	 * otherwise (including when str is null)
	 * 
	 * Author : Kevin Zhang History : 1.create 2002-1-24
	 **************************************************************************/
	/**
	 * Tells whether {@code str} consists only of single-byte (ASCII)
	 * characters, i.e. characters with a code below 128.
	 *
	 * @param str
	 *            the string to test; may be {@code null}
	 * @return {@code true} if {@code str} is empty or every character has a
	 *         code below 128; {@code false} otherwise, including for
	 *         {@code null}
	 */
	public static boolean isAllSingleByte(String str) {
		if (str == null)
			return false;

		// Test the characters rather than the encoded bytes: a Java byte is
		// signed, so an encoded byte can never compare as >= 128, and the byte
		// count depends on the platform charset. A character below 128 is a
		// single byte in every ASCII-compatible encoding.
		for (int i = 0; i < str.length(); i++) {
			if (str.charAt(i) >= 128)
				return false;
		}
		return true;
	}

	/***************************************************************************
	 * 
	 * Func Name : IsAllNum
	 * 
	 * Description: Judge the string is all made up of Num Char
	 * 
	 * 
	 * Parameters : sSentence: the original sentence which includes Chinese or
	 * Non-Chinese char
	 * 
	 * Returns : the end of the sub-sentence Author : Kevin Zhang History :
	 * 1.create 2002-1-24
	 **************************************************************************/
	public static boolean isAllNum(String str) {

		if (str != null) {
			int i = 0;
			String temp = str + " ";
			// 判断开头是否是+-之类的符号
			if ("±+—-＋".indexOf(temp.substring(0, 1)) != -1)
				i++;
			/** 如果是全角的０１２３４５６７８９ 字符* */
			while (i < str.length() && "０１２３４５６７８９".indexOf(str.substring(i, i + 1)) != -1)
				i++;

			// Get middle delimiter such as .
			if (i < str.length()) {
				String s = str.substring(i, i + 1);
				if ("∶·．／".indexOf(s) != -1 || ".".equals(s) || "/".equals(s)) {// 98．1％
					i++;
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -