📄 gfstring.java

📁 基于词典的分词工具,用于对文本文件的分词
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
				result = p1 + newstr;
			else {
				String p2 = src.substring(index + len);
				result = p1 + newstr + p2;
			}
		}
		return result;
	}

	/**
	 * Reports whether the string, once encoded with the platform default
	 * charset, contains a zero byte.
	 *
	 * @param msg
	 *            text to inspect; may be {@code null}
	 * @return {@code true} if any encoded byte equals 0; {@code false}
	 *         otherwise, including when {@code msg} is {@code null}
	 */
	public static boolean hasZero(String msg) {
		if (msg == null) {
			return false;
		}

		byte[] encoded = msg.getBytes();
		int pos = 0;
		while (pos < encoded.length) {
			if (encoded[pos] == 0) {
				return true;
			}
			pos++;
		}
		return false;
	}

	/**
	 * Checks whether every byte of the string (platform default charset) is
	 * an ASCII digit or an ASCII letter.
	 *
	 * @param str
	 *            text to check; may be {@code null}
	 * @return {@code false} for {@code null} or when any byte falls outside
	 *         '0'-'9', 'A'-'Z' and 'a'-'z'; {@code true} otherwise. Note that
	 *         the empty string yields {@code true}.
	 */
	public static boolean isAlphanumeric(String str) {
		if (str == null) {
			return false;
		}

		byte[] encoded = str.getBytes();
		for (int i = 0; i < encoded.length; i++) {
			byte b = encoded[i];
			boolean digit = b >= 48 && b <= 57;
			boolean upper = b >= 65 && b <= 90;
			boolean lower = b >= 97 && b <= 122;
			if (!(digit || upper || lower)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Strips one trailing administrative suffix character (province, city,
	 * district, county, township, town or village) from a place name.
	 * <p>
	 * Only the final character is considered, so a suffix character that also
	 * occurs earlier in the name does not prevent stripping. Names of one or
	 * two characters are returned unchanged.
	 *
	 * @param placename
	 *            place name, possibly ending in a suffix character; may be
	 *            {@code null}
	 * @return the name without its trailing suffix character, or the input
	 *         itself when nothing was stripped (including {@code null})
	 */
	public static String removePlacenameSuffix(String placename) {
		String[] suffix = { "省", "市", "区", "县", "乡", "镇", "村" };
		// Short names (length <= 2) are never stripped.
		if (placename != null && placename.length() > 2) {
			for (String s : suffix) {
				// endsWith looks at the tail only; indexOf would find an
				// earlier occurrence first and miss the real suffix.
				if (placename.endsWith(s)) {
					return placename.substring(0, placename.length() - s.length());
				}
			}
		}

		return placename;
	}

	/**
	 * Appends an administrative suffix character to a place name unless the
	 * name already ends with it.
	 * <p>
	 * The name is returned unchanged when it is {@code null}, shorter than two
	 * characters, or when {@code suffix} is {@code null} or not exactly one
	 * character long.
	 *
	 * @param placename
	 *            place name to extend; may be {@code null}
	 * @param suffix
	 *            single-character suffix to append
	 * @return the place name, guaranteed to end with {@code suffix} when the
	 *         preconditions above hold
	 */
	public static String addPlacenameSuffix(String placename, String suffix) {
		if (placename != null && placename.length() > 1 && suffix != null && suffix.length() == 1) {
			// Test the tail only; an earlier occurrence of the suffix
			// character must not trigger a duplicate append.
			if (!placename.endsWith(suffix)) {
				placename += suffix;
			}
		}

		return placename;
	}

	/**
	 * Tells whether {@code str1} sorts strictly before {@code str2} when
	 * their encoded bytes (platform default charset) are compared one by one
	 * as unsigned values. For example, "abc" is before "adc".
	 * <p>
	 * The comparison covers the bytes both strings have in common. If those
	 * are all equal the result is {@code false}, even when one string is a
	 * proper prefix of the other.
	 *
	 * @param str1
	 *            candidate for the earlier position; may be {@code null}
	 * @param str2
	 *            string to compare against; may be {@code null}
	 * @return {@code true} if the first differing byte of {@code str1} is
	 *         smaller; {@code false} otherwise or when either argument is
	 *         {@code null}
	 */
	public static boolean isBefore(String str1, String str2) {
		if (str1 == null || str2 == null) {
			return false;
		}

		byte[] b1 = str1.getBytes();
		byte[] b2 = str2.getBytes();
		// Bound the loop by byte count, not character count: multi-byte
		// characters would otherwise leave trailing bytes uncompared.
		int len = Math.min(b1.length, b2.length);

		for (int i = 0; i < len; i++) {
			// Mask to unsigned so bytes >= 0x80 sort after ASCII.
			int u1 = b1[i] & 0xFF;
			int u2 = b2[i] & 0xFF;
			if (u1 != u2) {
				return u1 < u2;
			}
		}
		return false;
	}

 

	/**
	 * Checks whether an 11-character number starts with one of the prefixes
	 * this class treats as China Unicom: 130, 131, 132, 133, 153 or 156.
	 *
	 * @param sim
	 *            phone number to test; may be {@code null}
	 * @return {@code true} only if {@code sim} has exactly 11 characters and
	 *         begins with one of the listed prefixes
	 */
	public static boolean isUnicommMobile(String sim) {
		if (sim == null || sim.length() != 11) {
			return false;
		}

		String[] prefixes = { "130", "131", "132", "133", "153", "156" };
		for (String prefix : prefixes) {
			if (sim.startsWith(prefix)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Checks whether an 11-character number starts with one of the prefixes
	 * this class treats as China Mobile: 134 through 139, 158 or 159.
	 *
	 * @param sim
	 *            phone number to test; may be {@code null}
	 * @return {@code true} only if {@code sim} has exactly 11 characters and
	 *         begins with one of the listed prefixes
	 */
	public static boolean isChinaMobile(String sim) {
		if (sim == null || sim.length() != 11) {
			return false;
		}

		String[] prefixes = { "134", "135", "136", "137", "138", "139", "159", "158" };
		for (String prefix : prefixes) {
			if (sim.startsWith(prefix)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Returns the single character that immediately follows position
	 * {@code index} in {@code str}.
	 * <p>
	 * A negative index is treated as "before the start" and yields the first
	 * character. An index at or beyond the last character has no successor
	 * and yields {@code null} instead of throwing.
	 *
	 * @param str
	 *            source text; may be {@code null} or empty
	 * @param index
	 *            position whose successor is wanted
	 * @return a one-character string, or {@code null} when {@code str} is
	 *         {@code null}/empty or no following character exists
	 */
	public static String getNextString(String str, int index) {
		if (str == null || str.length() == 0) {
			return null;
		}
		if (index < 0) {
			return str.length() > 1 ? str.substring(0, 1) : str;
		}
		// Covers the last position and anything past the end of the string.
		if (index >= str.length() - 1) {
			return null;
		}
		return str.substring(index + 1, index + 2);
	}

	/**
	 * Splits a string into one-character strings, one per {@code char}, in
	 * order. For example "a1b" becomes {"a", "1", "b"}.
	 *
	 * @param str
	 *            text to split; may be {@code null}
	 * @return an array with one element per {@code char} of {@code str}
	 *         (empty for the empty string), or {@code null} when {@code str}
	 *         is {@code null}
	 */
	public static String[] atomSplit(String str) {
		if (str == null) {
			return null;
		}

		char[] units = str.toCharArray();
		String[] atoms = new String[units.length];
		int pos = 0;
		for (char unit : units) {
			atoms[pos++] = String.valueOf(unit);
		}
		return atoms;
	}

	/**
	 * Checks whether the text contains something that looks like a phone
	 * number.
	 * <p>
	 * The text is passed through {@code quan2banGBK} (presumably a
	 * full-width to half-width conversion; confirm against that helper) and
	 * split into single characters. The first unbroken run of digits and the
	 * characters '-', '/', '(' and ')' is collected; a run of at least seven
	 * characters is then tested with {@code isMobileNo} and {@code isTelNo}.
	 *
	 * @param str
	 *            text to scan; may be {@code null}
	 * @return {@code true} if the first such run is accepted by
	 *         {@code isMobileNo} or {@code isTelNo}; {@code false} otherwise,
	 *         including for {@code null} or text shorter than seven characters
	 */
	public static boolean hasTelNo(String str) {
		if (str == null || str.length() < 7) {
			return false;
		}

		StringBuilder candidate = new StringBuilder();
		for (String atom : atomSplit(quan2banGBK(str))) {
			boolean phoneChar = "-".equals(atom) || "/".equals(atom) || "(".equals(atom) || ")".equals(atom)
					|| isNumeric(atom);
			if (phoneChar) {
				candidate.append(atom);
			} else if (candidate.length() > 0) {
				// The first run has ended; later runs are ignored.
				break;
			}
		}

		if (candidate.length() < 7) {
			return false;
		}
		String run = candidate.toString();
		return isMobileNo(run) || isTelNo(run);
	}

	/**
	 * Finds the first occurrence of a part-of-speech tag that is a complete
	 * token, i.e. that is followed by a space or ends the string.
	 *
	 * @param str
	 *            segmented, tagged text; may be {@code null}
	 * @param pos
	 *            tag to look for; may be {@code null}
	 * @return the index of the first qualifying occurrence, or -1 when there
	 *         is none or either argument is {@code null}
	 */
	public static int findPos(String str, String pos) {
		if (str == null || pos == null) {
			return -1;
		}

		int from = 0;
		while (from < str.length()) {
			int index = str.indexOf(pos, from);
			if (index == -1) {
				// No further occurrences; stop instead of rescanning.
				break;
			}
			int end = index + pos.length();
			if (end == str.length() || str.startsWith(" ", end)) {
				return index;
			}
			// Jump past the rejected occurrence rather than advancing one
			// character at a time and finding the same match again.
			from = index + 1;
		}
		return -1;
	}

	/**
	 * Extracts the keyword from a part-of-speech tagged token by dropping
	 * everything from the first '/' onward, e.g. "word/bs /sh" gives "word".
	 *
	 * @param str
	 *            tagged keyword; may be {@code null}
	 * @return the text before the first '/', or {@code null} when
	 *         {@code str} is {@code null}, contains no '/', or starts with
	 *         '/'
	 */
	public static String getPOSKey(String str) {
		if (str == null) {
			return null;
		}

		int slash = str.indexOf('/');
		return slash > 0 ? str.substring(0, slash) : null;
	}

	/**
	 * Splits tagged text on single spaces while keeping a keyword together
	 * with all of its part-of-speech tags.
	 * <p>
	 * A token that starts with '/' is treated as an additional tag and is
	 * appended, separated by a space, to the previous entry. For example
	 * {@code "ab/bs /sh to cd/bs /cm"} becomes {@code "ab/bs /sh"},
	 * {@code "to"}, {@code "cd/bs /cm"}. A leading tag with no preceding
	 * entry becomes an entry of its own.
	 *
	 * @param str
	 *            tagged text; may be {@code null}
	 * @return the grouped entries in order, or {@code null} when {@code str}
	 *         is {@code null}
	 */
	public static String[] splitByPOS(String str) {
		if (str == null) {
			return null;
		}

		ArrayList<String> groups = new ArrayList<String>();
		for (String token : str.split(" ")) {
			int last = groups.size() - 1;
			if (token.startsWith("/") && last >= 0) {
				groups.set(last, groups.get(last) + " " + token);
			} else {
				groups.add(token);
			}
		}
		return groups.toArray(new String[groups.size()]);
	}

	/**
	 * Builds an upper-case romanized string for the given text by looking up
	 * each Chinese character in {@code bopoMap}; every other character is
	 * copied through.
	 * <p>
	 * Note that the final {@code toUpperCase()} applies to the whole result,
	 * so copied-through letters are upper-cased as well.
	 * 
	 * @param cstr
	 *            text to convert; may be {@code null}
	 * @return the converted, upper-cased string, or {@code null} when
	 *         {@code cstr} is {@code null}
	 */
	public static String getBopomofo(String cstr) {
		String bopomofo = null;

		if (cstr != null) {
			bopomofo = "";
			String[] atoms = atomSplit(cstr);
			for (String atom : atoms) {
				if (isAllChinese(atom)) {
					// NOTE(review): uses the platform default charset and reads
					// exactly two bytes; presumably expects a GBK/GB2312
					// default encoding - confirm.
					byte[] b = atom.getBytes();
					// Combine the two bytes as unsigned values into a 16-bit
					// code, then subtract 65536 to obtain a negative code.
					int id = (256 + b[0]) * 256 + (256 + b[1]) - 256 * 256;

					// id1/id2 delimit the half-open range [id1, id2) being
					// tested; the first range starts at -20319.
					int id1 = -20319;
					int id2 = 0;
					// Key seen on the previous iteration, if any.
					String last = null;
					// NOTE(review): the range walk relies on bopoMap's key
					// iteration order; presumably values ascend in that order
					// - verify against the map's declaration.
					Iterator itr = bopoMap.keySet().iterator();
					while (itr.hasNext()) {
						String py = (String) itr.next();
						id2 = bopoMap.get(py);
						if (id >= id1 && id < id2) {
							// Code lies below the current key's value: emit the
							// previous key, or the current one on the first
							// iteration.
							bopomofo += last == null ? py : last;
							break;
						} else {
							last = py;
							id1 = id2;
						}
					}
					// If no range matches, nothing is appended for this
					// character.

				} else
					bopomofo += atom;
			}

			bopomofo = bopomofo.toUpperCase();
		}

		return bopomofo;
	}

	/**
	 * Compares two strings character by character using the numeric codes
	 * returned by {@code getID}.
	 * <p>
	 * {@code null} sorts before any non-null string and equals another
	 * {@code null}. When all characters in the common prefix have equal
	 * codes, the shorter string sorts first.
	 *
	 * @param s1
	 *            first string; may be {@code null}
	 * @param s2
	 *            second string; may be {@code null}
	 * @return a negative value, zero, or a positive value as {@code s1} sorts
	 *         before, equal to, or after {@code s2}; for differing characters
	 *         the value is the difference of their {@code getID} codes
	 */
	public static int compareTo(String s1, String s2) {
		if (s1 == null) {
			return s2 == null ? 0 : -1;
		}
		if (s2 == null) {
			return 1;
		}

		int shared = Math.min(s1.length(), s2.length());
		for (int i = 0; i < shared; i++) {
			int left = getID(String.valueOf(s1.charAt(i)));
			int right = getID(String.valueOf(s2.charAt(i)));
			int diff = left - right;
			if (diff != 0) {
				return diff;
			}
		}

		if (s1.length() == s2.length()) {
			return 0;
		}
		return s1.length() > s2.length() ? 1 : -1;
	}

	/**
	 * Maps a sequential index to the character whose GBK byte pair is
	 * (176 + id / 94, 161 + id % 94).
	 *
	 * @param id
	 *            index in the range 0 to 6767 inclusive
	 * @return the one-character string decoded from those two bytes with the
	 *         "GBK" charset, or {@code null} when {@code id} is out of range
	 *         or the charset is unavailable (the exception's stack trace is
	 *         printed in that case)
	 */
	public static String getGB(int id) {
		if (id < 0 || id >= 6768) {
			return null;
		}

		int row = id / 94;
		int col = id % 94;
		byte[] code = { (byte) (row + 176), (byte) (col + 161) };

		String hanzi = null;
		try {
			hanzi = new String(code, "GBK");
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return hanzi;
	}

	/**
	 * Computes the sequential index of a single Chinese character from its
	 * GBK byte pair: (lead - 176) * 94 + (trail - 161). This is the inverse
	 * of the mapping used by {@code getGB}.
	 * <p>
	 * The character is always encoded as GBK, independent of the platform
	 * default charset.
	 *
	 * @param s
	 *            a one-character string accepted by {@code isAllChinese}
	 * @return the computed index, or -1 when {@code s} is {@code null}, is
	 *         not exactly one character, is rejected by {@code isAllChinese},
	 *         or does not encode to two GBK bytes
	 * @throws IllegalStateException
	 *             if the JVM does not provide the GBK charset
	 */
	public static int getGBID(String s) {
		if (s == null || s.length() != 1 || !isAllChinese(s)) {
			return -1;
		}

		byte[] b;
		try {
			// Explicit charset: the arithmetic below is only meaningful for
			// GBK byte pairs.
			b = s.getBytes("GBK");
		} catch (UnsupportedEncodingException e) {
			throw new IllegalStateException("GBK charset is not available", e);
		}
		if (b.length != 2) {
			return -1;
		}

		int high = b[0] & 0xFF;
		int low = b[1] & 0xFF;
		return (high - 176) * 94 + (low - 161);
	}

	/**
	 * Returns a numeric sort code for a single character based on its GBK
	 * encoding.
	 * <p>
	 * A character that encodes to two GBK bytes yields
	 * {@code lead * 256 + trail} with both bytes taken as unsigned values.
	 * Otherwise the first encoded byte is returned as-is. The character is
	 * always encoded as GBK, independent of the platform default charset.
	 *
	 * @param s
	 *            a one-character string
	 * @return the sort code, or -1 when {@code s} is {@code null} or not
	 *         exactly one character long
	 * @throws IllegalStateException
	 *             if the JVM does not provide the GBK charset
	 */
	public static int getID(String s) {
		if (s == null || s.length() != 1) {
			return -1;
		}

		byte[] b;
		try {
			b = s.getBytes("GBK");
		} catch (UnsupportedEncodingException e) {
			throw new IllegalStateException("GBK charset is not available", e);
		}

		if (b.length == 2) {
			// Mask rather than add 256: GBK trail bytes may be below 0x80,
			// and adding 256 to those would break the ordering.
			return (b[0] & 0xFF) * 256 + (b[1] & 0xFF);
		}
		return b[0];
	}

	 

	/**
	 * Extracts the leading area-code portion of a telephone number accepted
	 * by {@code isTelNo}.
	 * <p>
	 * The length of the returned prefix depends on the total length:
	 * <ul>
	 * <li>10 characters: the first 3.</li>
	 * <li>11 characters: the first 3 when the number starts with "01" or
	 * "02", otherwise the first 4.</li>
	 * <li>12 characters: the first 5 when the number starts with one of a
	 * fixed list of prefixes, otherwise the first 4. The "094" prefix only
	 * counts when "0943" does not occur anywhere in the number.</li>
	 * </ul>
	 *
	 * @param telno
	 *            telephone number to inspect
	 * @return the prefix, or {@code null} when {@code isTelNo} rejects the
	 *         number, it has seven or fewer characters, or its length is not
	 *         10, 11 or 12
	 */
	public static String getTelcode(String telno) {
		if (!isTelNo(telno) || telno.length() <= 7) {
			return null;
		}

		int len = telno.length();
		if (len == 10) {
			return telno.substring(0, 3);
		}
		if (len == 11) {
			boolean threeDigit = telno.startsWith("01") || telno.startsWith("02");
			return telno.substring(0, threeDigit ? 3 : 4);
		}
		if (len == 12) {
			String[] fiveDigitPrefixes = { "098", "092", "086", "084", "0827", "0829", "0822", "0824", "080", "07437",
					"0483", "0788" };
			// "094" qualifies only if "0943" is absent from the whole number.
			boolean fiveDigit = telno.startsWith("094") && telno.indexOf("0943") == -1;
			for (int i = 0; !fiveDigit && i < fiveDigitPrefixes.length; i++) {
				fiveDigit = telno.startsWith(fiveDigitPrefixes[i]);
			}
			return telno.substring(0, fiveDigit ? 5 : 4);
		}
		return null;
	}

	/**
	 * Finds the word that a part-of-speech tag at a given position belongs
	 * to.
	 * <p>
	 * The text up to and including {@code indexPos} is split on spaces and
	 * scanned from the last token backwards. Tokens that start with '/' are
	 * skipped. The first token with '/' after its first character supplies
	 * the word (the part before that '/'). A token without any '/' ends the
	 * search with no result.
	 *
	 * @param src
	 *            tagged source text; may be {@code null}
	 * @param indexPos
	 *            position of the tag; must be greater than 0 and less than
	 *            {@code src.length() - 1}
	 * @return the word, or {@code null} when none is found or the arguments
	 *         are outside the accepted range
	 */
	public static String getPosWord(String src, int indexPos) {
		if (src == null || indexPos <= 0 || indexPos >= src.length() - 1) {
			return null;
		}

		String[] tokens = src.substring(0, indexPos + 1).split(" ");
		int i = tokens.length - 1;
		while (i >= 0) {
			String token = tokens[i];
			int slash = token.indexOf('/');
			if (slash < 0) {
				return null;
			}
			if (slash > 0) {
				return token.substring(0, slash);
			}
			i--;
		}
		return null;
	}
 

	/**
	 * Returns the first unbroken run of numeric characters in the text, as
	 * classified by {@code isNumeric}.
	 * <p>
	 * Leading non-numeric characters are skipped, however many there are, and
	 * collection stops at the first non-numeric character after the run has
	 * started.
	 *
	 * @param str
	 *            text to scan; may be {@code null}
	 * @return the first numeric run, or {@code null} when {@code str} is
	 *         {@code null} or contains no numeric character
	 */
	public static String getFirstInt(String str) {
		if (str == null) {
			return null;
		}

		StringBuilder digits = new StringBuilder();
		for (String atom : atomSplit(str)) {
			if (isNumeric(atom)) {
				digits.append(atom);
			} else if (digits.length() > 0) {
				// The run has ended. Only stop once a digit has been seen,
				// otherwise text with two or more leading non-digits would
				// never reach its number.
				break;
			}
		}

		return digits.length() > 0 ? digits.toString() : null;
	}

	/**
	 * Checks whether the text contains characters considered undisplayable
	 * (garbled).
	 * <p>
	 * Each character is encoded with the platform default charset and judged
	 * by its bytes:
	 * <ul>
	 * <li>one byte: garbled when outside the printable ASCII range 32-126;</li>
	 * <li>two bytes: garbled when outside the GBK code space, i.e. lead byte
	 * outside 0x81-0xFE, trail byte outside 0x40-0xFE, or trail byte equal to
	 * 0x7F.</li>
	 * </ul>
	 * Characters that encode to any other number of bytes are not checked.
	 *
	 * @param msg
	 *            text to inspect; may be {@code null}
	 * @return {@code true} if at least one character is judged garbled;
	 *         {@code false} otherwise or when {@code msg} is {@code null}
	 */
	public static boolean hasDisorderChar(String msg) {
		if (msg != null) {
			String[] atoms = atomSplit(msg);
			for (int i = 0; i < atoms.length; i++) {
				byte[] bs = atoms[i].getBytes();
				if (bs.length == 1) {
					if (bs[0] < 32 || bs[0] > 126)
						return true;
				} else if (bs.length == 2) {
					// The trail-byte lower bound is hexadecimal 0x40, and the
					// xx7F column is not part of GBK.
					if (GFCommon.getUnsigned(bs[0]) < 0x81 || GFCommon.getUnsigned(bs[0]) > 0xFE || GFCommon.getUnsigned(bs[1]) < 0x40
							|| GFCommon.getUnsigned(bs[1]) == 0x7F || GFCommon.getUnsigned(bs[1]) > 0xFE)
						return true;
				}

			}
		}

		return false;
	}

	/**
	 * Formats a duration given in milliseconds as hours, minutes and seconds
	 * using Chinese unit labels. Fractions of a second are discarded.
	 *
	 * @param millisTime
	 *            duration in milliseconds
	 * @return the text {@code <hours>小时<minutes>分钟<seconds>秒}
	 */
	public static String formatTime(long millisTime) {
		long totalSeconds = millisTime / 1000;
		long hours = totalSeconds / 3600;
		long minutes = totalSeconds % 3600 / 60;
		long seconds = totalSeconds % 60;
		return hours + "小时" + minutes + "分钟" + seconds + "秒";
	}
	/**
	 * Reads a text file line by line using the platform default charset.
	 * <p>
	 * The underlying stream is always closed before returning, and any I/O
	 * failure is propagated unchanged so its message and type are preserved.
	 *
	 * @param fileName
	 *            path of the file to read; may be {@code null}
	 * @return the file's lines in order (without line terminators), or
	 *         {@code null} when {@code fileName} is {@code null} or the file
	 *         does not exist
	 * @throws IOException
	 *             if the file cannot be opened or read
	 */
	public static ArrayList<String> readTxtFile2(String fileName) throws IOException {
		if (fileName == null) {
			return null;
		}
		File file = new File(fileName);
		if (!file.exists()) {
			return null;
		}

		ArrayList<String> result = new ArrayList<String>();
		FileInputStream fin = new FileInputStream(file);
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(fin));
			String value;
			while ((value = br.readLine()) != null) {
				result.add(value);
			}
		} finally {
			// Closing the file stream releases the only OS resource held by
			// the reader chain.
			fin.close();
		}
		return result;
	}
}
上一页 1 2 34
💿 文件大小 3014 K
👤 上传用户 qq735970242
📂 所属分类多国语言处理
🏷️ 相关标签

#分 #词典
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -