⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dictionary.java

📁 基于java语言的分词系统
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package org.ictclas4j.bean;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.log4j.Logger;
import org.ictclas4j.utility.GFCommon;
import org.ictclas4j.utility.GFString;
import org.ictclas4j.utility.Utility;


public class Dictionary {
	/**
	 * 词典表,共6768个,GB2312编码
	 */
	public ArrayList<WordTable> wts;

	/**
	 * 词典修改表
	 */
	public ArrayList<ModifyTable> mts;

	static Logger logger = Logger.getLogger(Dictionary.class);

	public Dictionary() {
		init();

	}

	public Dictionary(String filename) {
		init();
		load(filename);
	}

	public void init() {
		wts = new ArrayList<WordTable>();
		mts = new ArrayList<ModifyTable>();
		for (int i = 0; i < Utility.CC_NUM; i++) {
			wts.add(new WordTable());
			mts.add(new ModifyTable());
		}
	}

	public boolean load(String filename) {
		return load(filename, false);
	}

	/**
	 * 从词典表中加载词条.共6768个大的数据块(包括5个非汉字字符),每个大数据块包括若干个小数据块,
	 * 每个小数据块为一个词条,该数据块中每个词条都是共一个字开头的.
	 * 
	 * @param filename
	 *            核心词典文件名
	 * @param isReset
	 *            是否要重置
	 * @return
	 */
	public boolean load(String filename, boolean isReset) {
		File file;

		int[] nBuffer = new int[3];

		file = new File(filename);
		if (!file.canRead())
			return false;// fail while opening the file

		try {
			delModified();

			DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
			for (int i = 0; i < Utility.CC_NUM; i++) {
				// logger.debug("块" + i);
				// 词典库在写二进制数据时采用低位优先(小头在前)方式,需要转换一下
				int count = GFCommon.bytes2int(Utility.readBytes(in, 4), false);
				// logger.debug(" count:" + count);
				wts.get(i).setCount(count);
				if (count <= 0)
					continue;

				WordItem[] wis = new WordItem[count];
				for (int j = 0; j < count; j++) {
					nBuffer[0] = GFCommon.bytes2int(Utility.readBytes(in, 4), false);
					nBuffer[1] = GFCommon.bytes2int(Utility.readBytes(in, 4), false);
					nBuffer[2] = GFCommon.bytes2int(Utility.readBytes(in, 4), false);

					// String print = " wordLen:" + nBuffer[1] + " frequency:" +
					// nBuffer[0] + " handle:" + nBuffer[2];

					WordItem ti = new WordItem();
					if (nBuffer[1] > 0)// String length is more than 0
					{
						byte[] word = Utility.readBytes(in, nBuffer[1]);
						ti.setWord(new String(word, "GBK"));

					} else
						ti.setWord("");

					// print += " word:(" + Utility.getGB(i) + ")" +
					// ti.getWord();
					// logger.debug(print);

					if (isReset)// Reset the frequency
						ti.setFreq(0);
					else
						ti.setFreq(nBuffer[0]);
					ti.setLen(nBuffer[1] / 2);
					ti.setHandle(nBuffer[2]);
					wis[j] = ti;
				}
				wts.get(i).setWords(wis);
			}

			in.close();
		} catch (FileNotFoundException e) {
			logger.error(e);
		} catch (IOException e) {
			logger.error(e);
		}
		return true;
	}

	/**
	 * 保存词典表.如果有修改的词条,则先要对词典表进行更新才能把内容写入文件
	 * 
	 * @param filename
	 * @return
	 */
	public boolean save(String filename) {
		File file;
		int j, k;
		int[] nBuffer = new int[3];

		file = new File(filename);
		try {
			DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
			for (int i = 0; i < Utility.CC_NUM; i++) {
				if (mts != null) {// Modification made
					int nCount = wts.get(i).getCount() + mts.get(i).getCount() - mts.get(i).getDelete();
					out.write(GFCommon.int2bytes(nCount, false));

					j = 0;
					k = 0;
					// Output to the file after comparision
					for (; j < mts.get(i).getCount() && k < wts.get(i).getCount();) {
						WordItem mwi = mts.get(i).getWords().get(j);
						WordItem wi = wts.get(i).getWords().get(k);

						if (mwi.getLen() < wi.getLen() || (strEqual(mwi.getWord(), wi.getWord()))
								&& mwi.getHandle() < wi.getHandle()) {
							// Output the modified data to the file
							nBuffer[0] = mwi.getFreq();
							nBuffer[1] = mwi.getLen();
							nBuffer[2] = mwi.getHandle();
							for (int n : nBuffer)
								out.write(GFCommon.int2bytes(n, false));
							if (nBuffer[1] > 0)// String length is more than 0
								out.write(mwi.getWord().getBytes());

							j++;
						} else if (mwi.getFreq() == -1) {
							// The item has been removed,so skip it
							k++;
						} else if (mwi.getLen() > wi.getLen() || strEqual(mwi.getWord(), wi.getWord())
								&& mwi.getHandle() > wi.getHandle()) {
							// Output the index table data to the file
							nBuffer[0] = wi.getFreq();
							nBuffer[1] = wi.getLen();
							nBuffer[2] = wi.getHandle();
							for (int n : nBuffer)
								out.write(GFCommon.int2bytes(n, false));
							if (nBuffer[1] > 0)// String length is more than 0
								out.write(wi.getWord().getBytes());

							k++;// Get next item in the original table.
						}
					}

					if (k < wts.get(i).getCount()) {
						for (; k < wts.get(i).getCount();) {
							WordItem wi = wts.get(i).getWords().get(k);

							// Has been deleted
							if (wi.getFreq() != -1) {
								nBuffer[0] = wi.getFreq();
								nBuffer[1] = wi.getLen();
								nBuffer[2] = wi.getHandle();
								for (int n : nBuffer)
									out.write(GFCommon.int2bytes(n, false));

								// String length is more than 0
								if (nBuffer[1] > 0)
									out.write(wi.getWord().getBytes());
							}

							k++;// Get next item in the original table.
						}
					} else
						// //No Modification,Add the rest data to the file.
						for (; j < mts.get(i).getCount();) {
							WordItem wi = mts.get(i).getWords().get(j);
							nBuffer[0] = wi.getFreq();
							nBuffer[1] = wi.getLen();
							nBuffer[2] = wi.getHandle();
							for (int n : nBuffer)
								out.write(GFCommon.int2bytes(n, false));
							if (nBuffer[1] > 0)// String length is more than 0
								out.write(wi.getWord().getBytes());
						}
				} else {
					out.writeInt(wts.get(i).getCount());
					for (j = 0; j < wts.get(i).getCount(); j++) {
						WordItem wi = wts.get(i).getWords().get(j);
						nBuffer[0] = wi.getFreq();
						nBuffer[1] = wi.getLen();
						nBuffer[2] = wi.getHandle();
						for (int n : nBuffer)
							out.write(GFCommon.int2bytes(n, false));
						if (nBuffer[1] > 0)// String length is more than 0
							out.write(wi.getWord().getBytes());
					}
				}
			}
			out.close();
		} catch (FileNotFoundException e) {
			logger.error(e);
		} catch (IOException e) {
			logger.error(e);
		}
		return true;
	}

	/**
	 * 向词典库中添加词条.添加时只是先把词条放到修改表中,保存时才真正把添加的词条写入词典库中
	 * 
	 * @param word
	 *            词
	 * @param handle
	 *            句柄
	 * @param frequency
	 *            频度
	 * @return
	 */
	public boolean addItem(String word, int handle, int frequency) {

		Preword pw = preProcessing(word);
		if (pw != null & pw.getWord() != null) {
			int found = findInOriginalTable(pw.getIndex(), pw.getRes(), handle);
			if (found >= 0) {
				WordItem wi = wts.get(pw.getIndex()).getWords().get(found);
				if (wi.getFreq() != -1) {
					wi.setFreq(frequency);
					if (mts == null)
						mts = new ArrayList<ModifyTable>(Utility.CC_NUM);

					mts.get(pw.getIndex()).setDelete(mts.get(pw.getIndex()).getDelete() - 1);
				} else
					wi.setFreq(wi.getFreq() + frequency);
				return true;
			}

			if (mts == null)
				mts = new ArrayList<ModifyTable>(Utility.CC_NUM);

			int found2 = findInModifyTable(pw.getIndex(), pw.getRes(), handle);
			if (found2 >= 0) {
				WordItem wi = mts.get(pw.getIndex()).getWords().get(found2);
				wi.setFreq(wi.getFreq() + frequency);
				return true;
			}

			WordItem wi = new WordItem();
			wi.setFreq(frequency);
			wi.setHandle(handle);
			wi.setLen(pw.getRes().length());
			wi.setWord(pw.getRes());

			ModifyTable mt = mts.get(pw.getIndex());
			mt.getWords().add(found2, wi);
			mt.setCount(mt.getCount() + 1);
			return true;
		}
		return false;
	}

	public boolean delItem(String word, int handle) {
		Preword pw = preProcessing(word);
		if (pw != null & pw.getWord() != null) {
			int found = findInOriginalTable(pw.getIndex(), pw.getRes(), handle);
			if (found >= 0) {
				if (mts == null)
					mts = new ArrayList<ModifyTable>(Utility.CC_NUM);

				ModifyTable mt = mts.get(pw.getIndex());
				WordItem wi = mt.getWords().get(found);
				wi.setFreq(-1);
				mt.setCount(mt.getDelete() + 1);

				if (handle == -1) {
					for (int i = found; i < mt.getCount() && strEqual(mt.getWords().get(i).getWord(), pw.getRes()); i++) {
						WordItem wi2 = mt.getWords().get(i);
						wi2.setFreq(-1);
						mt.setDelete(mt.getDelete() + 1);

					}

				}

				return true;
			}

			int found2 = findInModifyTable(pw.getIndex(), pw.getRes(), handle);
			if (found2 >= 0) {
				ModifyTable mt = mts.get(pw.getIndex());
				ArrayList<WordItem> wis = mt.getWords();
				for (int i = found2; i < wis.size(); i++) {
					WordItem wi = wis.get(i);
					if (strEqual(wi.getWord(), pw.getRes()) && (wi.getHandle() == handle || handle < 0)) {
						wis.remove(wi);
						mt.setCount(mt.getCount() - 1);
						i--;
					}
				}

				return true;
			}
		}

		return false;
	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -