📄 lexicaltreeanalyzer.java

📁 spam source codejasen-0.9jASEN - java Anti Spam ENgine.zip 如标题所示
💻 JAVA
字号:
/*
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.linguistics;

import it.unimi.dsi.fastutil.chars.Char2ObjectOpenHashMap;

import java.io.IOException;
import java.util.Arrays;

import org.jasen.core.token.SimpleWordTokenizer;

/**
 * Employes a lexical tree approach to word recognition.
 * <p>
 * Based on a sample corpus, the analyser builds a tree of characters such that each characters in a word is a node in the tree.
 * </p>
 * <p>
 * When a word with a similar character sequence is found, the path to the next character is strengthened
 * </p>
 * @author Jason Polites
 */
public class LexicalTreeAnalyzer {

	private String[] tokens;
	private Char2ObjectOpenHashMap forwardTree;
	private Char2ObjectOpenHashMap reverseTree;
	private static final String ENGLISH_DICTIONARY_PATH = "org/jasen/core/linguistics/dictionaries/english.dic";

	/**
	 * Creates and initialized the analyzer
	 */
	public void initialize() throws IOException {
		// Get the dictionary as a resource stream
		SimpleWordTokenizer t = new SimpleWordTokenizer(this.getClass().getClassLoader().getResourceAsStream(ENGLISH_DICTIONARY_PATH));
		t.tokenize();
		tokens = t.getTokens();
		Arrays.sort(tokens);
		buildTrees();
	}

	/**
	 * Computes the probability that the given sequence of characters is an English word.
	 * <p>
	 * This works on the premise that most English words exhibit a similar set of character sequence patterns 
	 * in both their prefix, body and suffix.
	 * </p>
	 * The value of the word is determined by analysis if the characters in the word
	 * against the values in both the forward and backward lexical trees.
	 * <BR><BR>
	 * The maximium possible value a word can have is 1 (100%), thus for each character in the word
	 * which is correctly positioned in accordance with the rules in the tree, the computed value is
	 * increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
	 * branch of the tree a result of 1/W x W (or 1) will be returned.
	 * <BR><BR>
	 * Where a word fails to match a forward branch perfectly, two things are done:
	 * <OL>
	 * 	<LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total.</LI>
	 * 	<LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree.</LI>
	 * </OL>
	 * @param word The word to be tested
	 * @return A value between 0.0 and 1.0 indicating the probability that the String is an English word.
	 */
	public double computeWordValue(String word) {

		// First, check the dictionary
		word = word.toLowerCase();
		double result;

		if(Arrays.binarySearch(tokens, word) > -1) {
			result = 1.0d;
		}
		else
		{
			long time = System.currentTimeMillis();
			double percentile = (1.0d / (double)word.length());
			result = computeWordValue(forwardTree, word, true, 0, percentile, 0.0d);

			if(result < 0.0d) {
				result = 0.0d;
			}
		}
		return result;
	}


	/**
	 * The value of the word is determined by analysis if the characters in the word
	 * against the values in both the forward and backward lexical trees
	 * <BR><BR>
	 * The maximium possible value a word can have is 1 (100%), thus for each character in the word
	 * which is correctly positioned in accordance with the rules in the tree, the computed value is
	 * increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
	 * branch of the tree a result of 1/W x W (or 1) will be returned
	 * <BR><BR>
	 * Where a word fails to match a forward branch perfectly, two things are done:
	 * <OL>
	 * 	<LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total</LI>
	 * 	<LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree</LI>
	 * </OL>
	 * @param node
	 * @param word
	 * @param index
	 * @return
	 */
	private double computeWordValue(Char2ObjectOpenHashMap node, String word, boolean forward, int index, double percentile, double total) {

		char chr = word.charAt(index);

		Char2ObjectOpenHashMap next = (Char2ObjectOpenHashMap)node.get(chr);

		if(next != null) {

			total += percentile;

			if(forward) {
				index++;

				if(index < word.length()) {
					total = computeWordValue(next, word, forward, index, percentile, total);
				}
			}
			else
			{
				index--;

				if(index >= 0) {
					total = computeWordValue(next, word, forward, index, percentile, total);
				}
			}
		}
		else if(index < word.length() - 1 && forward) {

			for (int i = index; i < word.length(); i++) {
				total -= percentile;
			}

			total = computeWordValue(reverseTree, word, false, (word.length() - 1), percentile, total);
		}

		return total;
	}

	/**
	 * Loops through the list of tokens and builds a character tree
	 *
	 */
	private void buildTrees() {
		// First, create a root node
		forwardTree = new Char2ObjectOpenHashMap();
		reverseTree = new Char2ObjectOpenHashMap();

		String token = null;
		char chr;

		Char2ObjectOpenHashMap currentNode = null;
		Char2ObjectOpenHashMap targetNode = null;


		for (int i = 0; i < tokens.length; i++) {
			token = tokens[i];

			if(token != null) {

				// Build the forward tree
				currentNode = forwardTree;

				for (int j = 0; j < token.length(); j++) {
					chr = token.charAt(j);

					// Attempt to locate the node from the current node
					targetNode = (Char2ObjectOpenHashMap)currentNode.get(chr);

					if(targetNode == null) {
						targetNode = new Char2ObjectOpenHashMap(1);
						//targetNode.setCharacter(chr);
						currentNode.put(chr, targetNode);
					}
					currentNode = targetNode;
				}

				// Build the reverse tree
				currentNode = reverseTree;

				for (int j = token.length() - 1; j >= 0; j--) {
					chr = token.charAt(j);

					// Attempt to locate the node from the current node
					targetNode =(Char2ObjectOpenHashMap)currentNode.get(chr);

					if(targetNode == null) {
						targetNode = new Char2ObjectOpenHashMap(1);
						//targetNode.setCharacter(chr);
						currentNode.put(chr, targetNode);
					}

					currentNode = targetNode;
				}
			}
		}
	}
}
💿 文件大小 5119 K
👤 上传用户 jl315
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#codejasen #source #ENgine #jASEN
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -