⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segment.java

📁 基于中科院的ICTCLAS实现中文分词系统 开发工具是JAVA.经测试,效果很好
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.gftech.ictclas4j.segment;

import java.util.ArrayList;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import com.gftech.ictclas4j.bean.Atom;
import com.gftech.ictclas4j.bean.PersonName;
import com.gftech.ictclas4j.bean.SegGraph;
import com.gftech.ictclas4j.bean.Sentence;
import com.gftech.ictclas4j.bean.WordResult;
import com.gftech.ictclas4j.run.Config;
import com.gftech.ictclas4j.utility.Dictionary;
import com.gftech.ictclas4j.utility.DynamicArray;
import com.gftech.ictclas4j.utility.Utility;
import com.gftech.util.GFString;

public class Segment {
	// Core dictionary supplied to the constructor; passed on to GraphGenerate,
	// the unknown-word recognisers and the final POS tagging step.
	private Dictionary coreDict;

	// Bigram dictionary supplied to the constructor; passed on to GraphGenerate.
	private Dictionary bigramDict;

	// The text to be segmented.
	private String src;

	private int[] wordPosMapTable;// Absolute position of each word in the table, stored as row * Utility.MAX_SENTENCE_LEN + col

	// Smoothing parameter handed to GraphGenerate.biGenerate; the constructor sets it to 0.1.
	private double smoothParam;

	// Holds the construction timestamp (ms) until split() replaces it with the elapsed time.
	private long spendTime;

	// Segmentation result assembled by split(); stays null when src is null.
	private String splitedWord;

	// Unknown-word recogniser configured from the "data\nr" resource.
	private UnknownSeg unPerson;

	// Unknown-word recogniser configured from the "data\tr" resource.
	private UnknownSeg unTransPerson;

	// Unknown-word recogniser configured from the "data\ns" resource.
	private UnknownSeg unPlace;

	// log4j logger shared by all instances of this class.
	static Logger logger = Logger.getLogger(Segment.class);

	/**
	 * Creates a segmenter for {@code src} and runs the segmentation immediately.
	 *
	 * <p>
	 * Side effects visible in this constructor: log4j is (re)configured from
	 * {@code Config.LOG4J_CONF}, three unknown-word recognisers are configured
	 * from the {@code data\nr}, {@code data\tr} and {@code data\ns} resources,
	 * and {@code split()} is invoked before the constructor returns.
	 *
	 * @param src
	 *            text to segment; may be {@code null}, in which case no
	 *            segmentation result is produced
	 * @param dict
	 *            core dictionary
	 * @param biDict
	 *            bigram dictionary
	 */
	public Segment(String src, Dictionary dict, Dictionary biDict) {
		PropertyConfigurator.configure(Config.LOG4J_CONF);
		this.src = src;
		this.coreDict = dict;
		this.bigramDict = biDict;
		this.smoothParam = 0.1;
		// Start timestamp; split() converts it into the elapsed time.
		this.spendTime = System.currentTimeMillis();

		unPerson = new UnknownSeg();
		unTransPerson = new UnknownSeg();
		unPlace = new UnknownSeg();
		unPerson.configure("data\\nr", Utility.TAG_TYPE.TT_PERSON);
		unTransPerson.configure("data\\tr", Utility.TAG_TYPE.TT_TRANS_PERSON);
		// Fix: the place-name recogniser was configured with TT_TRANS_PERSON,
		// a copy of the line above, so it ran with the wrong tag type.
		// NOTE(review): relies on Utility.TAG_TYPE declaring TT_PLACE - confirm.
		unPlace.configure("data\\ns", Utility.TAG_TYPE.TT_PLACE);

		split();
	}

	/**
	 * Segments {@code src} sentence by sentence and stores the outcome in
	 * {@code splitedWord}; also turns {@code spendTime} from a start timestamp
	 * into the elapsed time in milliseconds.
	 *
	 * <p>
	 * Sentences for which {@code isSeg()} is false are copied through unchanged.
	 * Every other sentence goes through two passes of graph generation and
	 * shortest-path search: the first on the raw atoms, the second after the
	 * person / transliterated-person / place recognisers have updated the graph.
	 * The second-pass words are POS-tagged, adjusted and rendered by
	 * {@code outputResult}.
	 *
	 * <p>
	 * When {@code src} is {@code null} the result stays {@code null}.
	 */
	private void split() {
		String result = null;

		if (src != null) {
			// StringBuilder instead of repeated String += inside the loop.
			StringBuilder out = new StringBuilder();
			SentenceSeg ss = new SentenceSeg(src);
			ArrayList<Sentence> sens = ss.getSens();
			for (Sentence sen : sens) {
				logger.debug(sen);
				if (!sen.isSeg()) {
					// Not marked for segmentation: keep the text as it is.
					out.append(sen.getContent());
					continue;
				}

				AtomSeg as = new AtomSeg(sen.getContent());
				ArrayList<Atom> atoms = as.getAtoms();
				for (Atom atom : atoms) {
					logger.info(atom);
				}

				// First pass: rough segmentation of the raw atoms.
				GraphGenerate gg = new GraphGenerate(coreDict, bigramDict);
				ArrayList<SegGraph> sgs = gg.generate(atoms);
				logger.info(sgs);
				wordPosMapTable = buildWordPosMap(sgs);

				ArrayList<SegGraph> biSgs = gg.biGenerate(sgs, wordPosMapTable, smoothParam);
				logger.info(biSgs);
				NShortPath nsp = new NShortPath(biSgs, 1);
				int[] bipath = nsp.getNShortPath(true);
				int[] unipath = bipath2unipath(wordPosMapTable, bipath);

				ArrayList<SegGraph> segPath = getSegPath(atoms, sgs, unipath);
				ArrayList<WordResult> rs = generateWord(segPath);
				for (WordResult wr : rs) {
					logger.info(wr);
				}

				// Optimise the segmentation result: let the unknown-word
				// recognisers work on the first-pass path.
				DynamicArray optSeg = new DynamicArray();
				optSeg.setSgs(segPath);

				unPerson.recognition(rs, optSeg, coreDict);
				unTransPerson.recognition(rs, optSeg, coreDict);
				unPlace.recognition(rs, optSeg, coreDict);

				// Second pass: same search over the optimised graph.
				ArrayList<SegGraph> optsgs = optSeg.getSgs();
				wordPosMapTable = buildWordPosMap(optsgs);

				ArrayList<SegGraph> biSgs2 = gg.biGenerate(optsgs, wordPosMapTable, smoothParam);
				logger.info("bisgs2:" + biSgs2);

				NShortPath nsp2 = new NShortPath(biSgs2, 1);
				int[] bipath2 = nsp2.getNShortPath(true);
				int[] unipath2 = bipath2unipath(wordPosMapTable, bipath2);

				ArrayList<SegGraph> segPath2 = getSegPath(atoms, optsgs, unipath2);
				ArrayList<WordResult> rs2 = generateWord(segPath2);
				for (WordResult wr : rs2) {
					logger.info(wr);
				}

				unPerson.roleTag.setType(Utility.TAG_TYPE.TT_NORMAL);
				unPerson.roleTag.posTagging(rs2, coreDict, unPerson.unDict);

				// outputResult returns null when the list has two or fewer
				// entries; appending that null used to insert the literal
				// text "null" into the result.
				String tagged = outputResult(adjust(rs2));
				if (tagged != null) {
					out.append(tagged);
				}
			}
			result = out.toString();
		}

		spendTime = System.currentTimeMillis() - spendTime;
		splitedWord = result;
	}

	/**
	 * Builds the position table for a list of graph nodes: entry {@code i} is
	 * {@code row * Utility.MAX_SENTENCE_LEN + col} of node {@code i}.
	 *
	 * @param sgs
	 *            graph nodes to encode
	 * @return one encoded position per node, in list order
	 */
	private int[] buildWordPosMap(ArrayList<SegGraph> sgs) {
		int[] table = new int[sgs.size()];
		for (int i = 0; i < table.length; i++) {
			SegGraph sg = sgs.get(i);
			table[i] = sg.getRow() * Utility.MAX_SENTENCE_LEN + sg.getCol();
		}
		return table;
	}

	/**
	 * Renders a word list as {@code word/tag } pairs, each followed by a single
	 * space.
	 *
	 * <p>
	 * The first and last entries are removed from {@code wrList} before
	 * rendering (presumably the sentence boundary markers - confirm against the
	 * code that builds the list). This removal modifies the caller's list, as
	 * it always has.
	 *
	 * <p>
	 * The tag is decoded from the handle as two characters, high byte then low
	 * byte. Two fixes over the previous version:
	 * <ul>
	 * <li>the absolute value of the handle is used, because negative handles
	 * carry the same tag (generateWord compares {@code Math.abs(handle)} with
	 * the tag constants) and casting a negative value to {@code char} produced
	 * garbage;</li>
	 * <li>a zero low byte, which every single-letter tag has, is no longer
	 * appended as a NUL character.</li>
	 * </ul>
	 *
	 * @param wrList
	 *            words to render; may be {@code null}
	 * @return the rendered text, or {@code null} when {@code wrList} is
	 *         {@code null} or holds two or fewer entries
	 */
	private String outputResult(ArrayList<WordResult> wrList) {
		if (wrList == null || wrList.size() <= 2) {
			return null;
		}

		wrList.remove(0);
		wrList.remove(wrList.size() - 1);

		StringBuilder out = new StringBuilder();
		for (WordResult wr : wrList) {
			int tag = Math.abs(wr.getHandle());
			char high = (char) (tag / 256);
			char low = (char) (tag % 256);

			out.append(wr.getWord()).append('/').append(high);
			if (low != 0) {
				out.append(low);
			}
			out.append(' ');
		}
		return out.toString();
	}

	/**
	 * Maps a path over the bigram graph back to row indices of the word graph.
	 *
	 * <p>
	 * Each element of {@code bipath} is used as an index into
	 * {@code wordPosMapTable}; the encoded position found there is divided by
	 * {@code Utility.MAX_SENTENCE_LEN} to recover the row. The working array has
	 * one slot more than {@code bipath}; that last slot is never written and
	 * therefore stays 0. The array is then passed through
	 * {@code Utility.removeInvalid}.
	 *
	 * @param wordPosMapTable
	 *            encoded positions, indexed by bigram-path element
	 * @param bipath
	 *            path produced over the bigram graph
	 * @return the value returned by {@code Utility.removeInvalid}, or
	 *         {@code null} when either argument is {@code null}
	 */
	private int[] bipath2unipath(int[] wordPosMapTable, int[] bipath) {
		if (bipath == null || wordPosMapTable == null) {
			return null;
		}

		final int count = bipath.length;
		int[] rows = new int[count + 1];
		for (int k = 0; k < count; k++) {
			int encoded = wordPosMapTable[bipath[k]];
			rows[k] = encoded / Utility.MAX_SENTENCE_LEN;
		}

		return Utility.removeInvalid(rows);
	}

	/**
	 * 生成最终的分词路径
	 * 
	 * @param unipath
	 *            由二叉路径生成的唯一路径
	 * @return
	 */
	private ArrayList<WordResult> generateWord(ArrayList<SegGraph> sgs) {

		WordResult[] wordResult = null;
		int index = 0;
		int nPOS = 0;
		double fValue = 0;
		int j = 0;
		boolean isNum = false;

		if (sgs != null) {
			wordResult = new WordResult[sgs.size()];

			for (int i = 0; i < sgs.size(); i++, index++) {
//				String curWord = null;
				String snum = sgs.get(i).getWord();
				nPOS = sgs.get(i).getPos();
				fValue = sgs.get(i).getValue();
				for (j = i; j < sgs.size() - 1; j++) {
					snum += sgs.get(j + 1).getWord();
					if (Utility.isAllNum(snum) || Utility.isAllChineseNum(snum)) {
						isNum = true;
						wordResult[index] = new WordResult();
						wordResult[index].setWord(snum);
					} else
						break;

				}

				i = j;
				if (!isNum) {
					wordResult[index] = new WordResult();
					wordResult[index].setWord(sgs.get(i).getWord());
//					curWord = sgs.get(i).getWord();
				} else {

					WordResult wr = wordResult[index];
					String word = wr.getWord();
					if (word.length() == 2 && "第上成±—+∶·./".indexOf(word) != -1 || word.length() == 1
							&& "+-./".indexOf(word) != -1) {
//						curWord = word;
					}
					// 是一个数字
					else {
						if ("--".equals(word) || "—".equals(word) || "-".equals(word))// The
						// delimiter
						// "--"
						{
							nPOS = 30464;// 'w'*256;Set the POS with 'w'
						} else {// Adding time suffix

							String first = word.substring(0, 1);
							if (index > 0
									&& (Math.abs(wordResult[index - 1].getHandle()) == 27904 || Math
											.abs(wordResult[index - 1].getHandle()) == 29696)
									&& ("—".equals(first) || "-".equals(first)) && (word.length()) > 1) {// 3-4月
								// //27904='m'*256
								// 29696='t'*256
								// Split the sInitChar from the original word
								wordResult[index + 1].setWord(word.substring(1));
								wordResult[index + 1].setValue(wordResult[index].getValue());
								wordResult[index + 1].setHandle(27904);
								wr.setWord(word.substring(0, 1));
								wr.setValue(0);
								wr.setHandle(30464);// 'w'*256;

								// TODO
								// Utility.insertGraph(optGraph,sg,false);
							}
//							int len = word.length();
							String atom = sgs.get(i + 1).getWord();
							if (atom.length() == 1 && "月日时分秒".indexOf(atom) != -1 || "月份".equals(atom)) {// 2001年
								wr.setWord(word + atom);
//								curWord = "未##时";
								nPOS = -29696;// 't'*256;//Set the POS with
								i++;
								// 'm'
							} else if ("年".equals(atom)) {
								if (Utility.isYearTime(word))// strncmp(sAtom,"年",2)==0&&
								{// 1998年,
									wordResult[index].setWord(word + atom);
//									curWord = "未##时";
									nPOS = -29696;// Set the POS with 't'
									i++;
								} else {
//									curWord = "未##数";
									nPOS = -27904;// Set the POS with 'm'

								}
							} else {
								// 早晨/t 五点/t
								if (word.indexOf("点") == word.length() - 1) {
//									curWord = "未##时";
									nPOS = -29696;// Set the POS with 't'
								} else {
									if ("∶·./".indexOf(word.substring(word.length() - 1)) == -1
											&& !".".equals(word.substring(word.length() - 1))
											&& !"/".equals(word.substring(word.length() - 1))) {
//										curWord = "未##数";
										// 'm'*256;Set the POS with 'm'
										nPOS = -27904;
									}
									// Get rid of .example 1.
									else if (word.length() > 1) {
										if (".".equals(word.substring(word.length() - 1))
												|| !"/".equals(word.substring(word.length() - 1)))
											wr.setWord(word.substring(0, word.length() - 1));
										else
											wr.setWord(word.substring(0, word.length() - 2));
//										curWord = "未##数";
										nPOS = -27904;// 'm'*256;Set the POS
										// with 'm'

									}
								}
							}
						}
						fValue = 0;
					}
				}

				wordResult[index].setHandle(nPOS);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -