graphgenerate.java

来自「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 221 行

JAVA

221 行

package com.gftech.ictclas4j.segment;

import java.util.ArrayList;

import org.apache.log4j.Logger;

import com.gftech.ictclas4j.bean.Atom;
import com.gftech.ictclas4j.bean.SegGraph;
import com.gftech.ictclas4j.bean.WordItem;
import com.gftech.ictclas4j.utility.Dictionary;
import com.gftech.ictclas4j.utility.Utility;

public class GraphGenerate {

	private Dictionary dict;

	private Dictionary biDict;

	static Logger logger = Logger.getLogger(GraphGenerate.class);

	public GraphGenerate(Dictionary dict, Dictionary biDict) {
		this.dict = dict;
		this.biDict = biDict;

	}

	/**
	 * 全切分,生成切分图.即找出所有可能的词组
	 * 
	 * @param atoms
	 * @return
	 */
	public ArrayList<SegGraph> generate(ArrayList<Atom> atoms) {
		ArrayList<SegGraph> result = null;
		ArrayList<SegGraph> atomGraph = null;
		SegGraph graph = null;
		Atom atom = null;

		if (atoms != null && atoms.size() > 0 && dict != null) {
			result = new ArrayList<SegGraph>();
			atomGraph = new ArrayList<SegGraph>();
			for (int i = 0; i < atoms.size(); i++) {
				atom = atoms.get(i);
				if (atom.getPos() == Utility.CT_CHINESE)
					graph = new SegGraph(i, i + 1, 0, 0, atom.getWord());
				else {
					String word = atom.getWord();
					double value = Utility.MAX_FREQUENCE;
					int pos = 0;

					switch (atom.getPos()) {
					case Utility.CT_INDEX:
					case Utility.CT_NUM:
						pos = -27904;// 'm'*256
						word = "未##数";
						value = 0;
						break;
					case Utility.CT_DELIMITER:
						pos = 30464;// 'w'*256;
						break;
					case Utility.CT_LETTER:
						pos = -'n' * 256 - 'x';//
						value = 0;
						word = "未##串";
						break;
					case Utility.CT_SINGLE:// 12021-2129-3121
						if (Utility.getCharCount("+-1234567890", atom.getWord()) == atom.getLen()) {
							pos = -27904;// 'm'*256
							word = "未##数";
						} else {
							pos = -'n' * 256 - 'x';//
							word = "未##串";
						}
						value = 0;
						break;
					default:
						pos = atom.getLen();// '?'*256;
						break;
					}

					graph = new SegGraph(i, i + 1, value, pos, word);
				}

				atomGraph.add(graph);
			}

			String word = null;
			for (int i = 0; i < atomGraph.size(); i++) {
				int j = i + 1;
				graph = atomGraph.get(i);
				word = graph.getWord();
				// 如果是“月份”，不要分割
				if (i < atomGraph.size() - 1) {
					SegGraph graph2 = atomGraph.get(i + 1);
					if ("月".equals(word) && "份".equals(graph2.getWord()))
						j++;
				}

				WordItem wi = null;
				for (; j <= atomGraph.size(); j++) {
					int totalFreq = 0;

					wi = dict.getMaxMatch(word);
					if (wi != null) {
						// find it
						if (word.equals(wi.getWord())) {
							ArrayList<WordItem> wis = dict.getHandle(word);
							for (WordItem w : wis)
								totalFreq += w.getFrequency();

							// 1年内，1999年末
							if (word.length() == 2 && i >= 1) {
								SegGraph g2 = atomGraph.get(i - 1);
								if (Utility.isAllNum(g2.getWord()) || Utility.isAllChinese(g2.getWord())
										&& (g2.getWord().indexOf("年") == 0 || g2.getWord().indexOf("月") == 0)) {

									if ("末内中底前间初".indexOf(word.substring(1)) != -1)
										break;
								}
							}
							// 只有一个性词，存贮它
							SegGraph sg = null;
							if (wis.size() == 1) {
								sg = new SegGraph(i, j, totalFreq, wis.get(0).getHandle(), word);
								Utility.insertGraph(result, sg, true);
							}

							else {
								sg = new SegGraph(i, j, totalFreq, 0, word);
								Utility.insertGraph(result, sg, true);
							}

						}

						if (j < atoms.size()) {
							SegGraph graph2 = atomGraph.get(j);
							String word2 = graph2.getWord();
							word += word2;
						} else
							break;
					} else
						break;
				}
			}
		}
		return result;
	}

	/**
	 * 生成二叉图表,每个节点表示相邻两个词组的耦合关系,如:说@的确
	 * 
	 * @param sgs
	 */
	public ArrayList<SegGraph> biGenerate(ArrayList<SegGraph> sgs, int[] wordPosMapTable, double smoothParam) {
		ArrayList<SegGraph> result = null;

		double curFreq;

		if (sgs != null && sgs.size() > 0 && dict != null && biDict != null && wordPosMapTable != null) {
			result = new ArrayList<SegGraph>();

			for (int i = 0; i < sgs.size(); i++) {
				SegGraph sg = sgs.get(i);
				if (sg.getPos() >= 0)
					curFreq = sg.getValue();
				else
					curFreq = dict.getFrequency(sg.getWord(), 2);

				// 得到下面行值和该列值相等的所有元素
				ArrayList<SegGraph> nextSgs = Utility.getNextElements(sgs, i);
				for (SegGraph graph : nextSgs) {
					String twoWords = sg.getWord();
					twoWords += Utility.WORD_SEGMENTER;
					twoWords += graph.getWord();

					// 计算相临两个词之间的平滑值
					// -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
					int twoFreq = biDict.getFrequency(twoWords, 3);
					double temp = (double) 1 / Utility.MAX_FREQUENCE;
					double value = smoothParam * (1 + curFreq) / (Utility.MAX_FREQUENCE + 80000);
					value += (1 - smoothParam) * ((1 - temp) * twoFreq / (1 + curFreq) + temp);
					value = -Math.log(value);

					if (value < 0)
						value += sg.getValue();

					SegGraph sg2 = new SegGraph();
					// 分隔符@前的词在wordPosMapTable中的位置
					int index = sg.getRow() * Utility.MAX_SENTENCE_LEN + sg.getCol();
					int wordIndex = getWordIndex(wordPosMapTable, index);
					sg2.setRow(wordIndex);
					
					// 分隔符@后的词在wordPosMapTable中的位置
					index = graph.getRow() * Utility.MAX_SENTENCE_LEN + graph.getCol();
					wordIndex = getWordIndex(wordPosMapTable, index);
					
					sg2.setCol(wordIndex);
					sg2.setWord(twoWords);
					sg2.setValue(value);
					sg2.setPos(sg.getPos());

					Utility.insertGraph(result, sg2, false);
				}
			}
		}
		return result;
	}

	private int getWordIndex(int[] wordPosMapTable, int value) {
		if (wordPosMapTable != null) {
			for (int i = 0; i < wordPosMapTable.length; i++) {
				if (wordPosMapTable[i] == value)
					return i;
			}
		}

		return -1;
	}

}

graphgenerate.java - 源码说明

本页面展示了「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」中的 graphgenerate.java 源码文件，采用 Java 编程语言编写，共 221 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与ICTCLAS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?