📄 ternaryintervalsearchtree.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.util;/*		 * MG4J: Managing Gigabytes for Java** Copyright (C) 2005-2007 Sebastiano Vigna **  This library is free software; you can redistribute it and/or modify it*  under the terms of the GNU Lesser General Public License as published by the Free*  Software Foundation; either version 2.1 of the License, or (at your option)*  any later version.**  This library is distributed in the hope that it will be useful, but*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License*  for more details.**  You should have received a copy of the GNU Lesser General Public License*  along with this program; if not, write to the Free Software*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.AbstractObjectList;import it.unimi.dsi.mg4j.index.PrefixMap;import it.unimi.dsi.mg4j.index.TermMap;import it.unimi.dsi.mg4j.io.FastBufferedReader;import it.unimi.dsi.mg4j.search.Interval;import it.unimi.dsi.mg4j.search.Intervals;import java.io.IOException;import java.io.InputStreamReader;import java.io.Serializable;import java.nio.charset.Charset;import java.util.Collection;import java.util.Iterator;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.UnflaggedOption;import com.martiansoftware.jsap.stringparsers.ForNameStringParser;/** Ternary interval search trees. *  * <p><em>Ternary search trees</em> are a data structure used to store words over an alphabet; they are * a useful alternatives to tries when the alphabet is large. *  * <p>Ternary <em>interval</em> search trees have the additional properties of being able * to locate quickly intervals of words extending a given prefix (where &ldquo;quickly&rdquo; means * that no more successful character comparisons than the prefix length are performed). They do so  * by storing at each node the number of words covered by that node. *  * <p>This implementation exposes a number of interfaces: in particular, the set of words is * seen as a lexicographically ordered {@link it.unimi.dsi.fastutil.objects.ObjectList}. *  * <p>This class is mutable, but for the time it implements only {@link #add(CharSequence)}. Words cannot * be removed. * @deprecated Moved to <code>dsiutils</code>. */@Deprecatedpublic class TernaryIntervalSearchTree extends AbstractObjectList<CharSequence> implements PrefixMap, TermMap, ImmutableExternalTreePrefixDictionary.IntervalApproximator, Serializable {	private static final long serialVersionUID = 1L;	/** A node of the tree. */	private final static class Node implements Serializable {		private static final long serialVersionUID = 1L;		/** A pointer to the left subtree. */		public Node left;		/** A pointer to the middle subtree. */		public Node middle;		/** A pointer to the right subtree. */		public Node right;		/** The nonempty path compressed at this node. */		public char[] path;		/** Whether this node represents a word. */		public boolean isWord;		/** The number of words covered by this node (including the word possibly represented by this node). */		public int numNodes;		/** Creates a new node containing a path specified by a character-sequence fragment.		 * 		 * @param s a character sequence contaning the path of the node.		 * @param offset the starting character of the path.		 * @param length the length of the path.		 * @param isWord whether this node represents a word.		 * @param numNodes the number of words covered by this node.		 */		public Node( final CharSequence s, final int offset, final int length, final boolean isWord, final int numNodes ) {			path = new char[ length ];			MutableString.getChars( s, offset, offset + length, path, 0 );			this.isWord = isWord;			this.numNodes = numNodes;		}		/** Creates a new node containing a path specified by a character-array fragment.		 * 		 * @param a a character array contaning the path of the node.		 * @param offset the starting character of the path.		 * @param length the length of the path.		 * @param isWord whether this node represents a word.		 * @param numNodes the number of words covered by this node.		 */		public Node( final char[] a, final int offset, final int length, final boolean isWord, final int numNodes ) {			path = new char[ length ];			System.arraycopy( a, offset, path, 0, length );			this.isWord = isWord;			this.numNodes = numNodes;		}		/** Removes a prefix from the path of this node.		 * 		 * @param length the length of the prefix to be removed		 */				public void removePathPrefix( final int length ) {			final char[] a = new char[ path.length - length ];			System.arraycopy( path, length, a, 0, a.length );			path = a;		}	}		/** The root of the tree. */	private Node root;		/** The number of nodes in the tree. */	private int size;		/** Creates a new empty ternary search tree. */	public TernaryIntervalSearchTree() {}	/** Creates a new empty ternary search tree and populates it with a given collection of character sequences. 	 *	 * @param c a collection of character sequences. 	 * */	public TernaryIntervalSearchTree( final Collection<? extends CharSequence> c ) {		int n = c.size();		final Iterator<? extends CharSequence> i = c.iterator();		while( n-- != 0 ) 			add( i.next() );	}		public Interval getInterval( final CharSequence s ) {		final int l = s.length();		Node e = root;		int i;		int offset = 0;		int wordsAtLeft = 0;		char c;		char[] path;				while( e != null ) {			path = e.path;			for( i = 0; i < path.length - 1 && offset + i < l && s.charAt( offset + i ) == path[ i ]; i++ );			if ( offset + i == l ) return Interval.valueOf( wordsAtLeft, wordsAtLeft + e.numNodes - 1 );			if ( i < path.length - 1 ) return Intervals.EMPTY_INTERVAL;			offset += i;						c = s.charAt( offset );			if ( c < path[ i ] ) e = e.left;			else if ( c > path[ i ] ) {				if ( e.left != null ) wordsAtLeft += e.left.numNodes;				if ( e.middle != null ) wordsAtLeft += e.middle.numNodes;				if ( e.isWord ) wordsAtLeft++;				e = e.right;			}			else {				offset++;				if ( e.left != null ) wordsAtLeft += e.left.numNodes;				if ( offset == l ) return Interval.valueOf( wordsAtLeft, wordsAtLeft + ( e.isWord ? 1 : 0 ) + ( e.middle == null ? 0 : e.middle.numNodes ) - 1 );				if ( e.isWord ) wordsAtLeft++;				e = e.middle;			}		}		return Intervals.EMPTY_INTERVAL;	}	public Interval getApproximatedInterval( final CharSequence s ) {		final int l = s.length();		Node e = root;		int i;		int offset = 0;		int wordsAtLeft = 0;		char c;		char[] path;				while( e != null ) {			path = e.path;			for( i = 0; i < path.length - 1 && offset + i < l && s.charAt( offset + i ) == path[ i ]; i++ );			if ( offset + i == l ) {				// Our sequence is a proper prefix of path.				return wordsAtLeft > 0 ? Interval.valueOf( wordsAtLeft - 1, wordsAtLeft + e.numNodes - 1 ) : Interval.valueOf( wordsAtLeft, wordsAtLeft + e.numNodes - 1 );								}			if ( i < path.length - 1 ) {				// We stopped the loop prematurely.								if ( s.charAt( offset + i ) < path[ i ] ) return wordsAtLeft > 0 ? Interval.valueOf( wordsAtLeft -1 ) : Intervals.EMPTY_INTERVAL;				else return Interval.valueOf( wordsAtLeft + e.numNodes  - 1 );			}			offset += i;						c = s.charAt( offset );			if ( c < path[ i ] ) e = e.left;			else if ( c > path[ i ] ) {				if ( e.left != null ) wordsAtLeft += e.left.numNodes;				if ( e.middle != null ) wordsAtLeft += e.middle.numNodes;				if ( e.isWord ) wordsAtLeft++;				e = e.right;			}			else {				offset++;				if ( e.left != null ) wordsAtLeft += e.left.numNodes;				if ( offset == l ) return Interval.valueOf( wordsAtLeft - ( e.isWord ? 0 : 1 ), wordsAtLeft + ( e.isWord ? 1 : 0 ) + ( e.middle == null ? 0 : e.middle.numNodes ) - 1 );				if ( e.isWord ) wordsAtLeft++;				e = e.middle;			}		}		return wordsAtLeft > 0 ? Interval.valueOf( wordsAtLeft - 1 ) : Intervals.EMPTY_INTERVAL;	}	public boolean contains( final CharSequence s ) {		return getIndex( s ) != -1;	}		public boolean contains( final Object o ) {		return contains( (CharSequence)o );	}		public CharSequence getTerm ( final int index ) {		final MutableString s = new MutableString();		return getTerm( index, s );	}		public MutableString getTerm( int index, final MutableString s ) {		ensureIndex( index );		s.length( 0 );		Node e = root;						for( ;; ) {						if ( e.left != null ) {				if ( index < e.left.numNodes ) {
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -