📄 immutablebinarytrie.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.util;/*		 * MG4J: Managing Gigabytes for Java** Copyright (C) 2005-2007 Sebastiano Vigna **  This library is free software; you can redistribute it and/or modify it*  under the terms of the GNU Lesser General Public License as published by the Free*  Software Foundation; either version 2.1 of the License, or (at your option)*  any later version.**  This library is distributed in the hope that it will be useful, but*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License*  for more details.**  You should have received a copy of the GNU Lesser General Public License*  along with this program; if not, write to the Free Software*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import it.unimi.dsi.fastutil.booleans.BooleanIterator;import it.unimi.dsi.mg4j.search.Interval;import it.unimi.dsi.mg4j.search.Intervals;import java.io.Serializable;import java.util.List;import java.util.ListIterator;import cern.colt.bitvector.BitVector;import cern.colt.bitvector.QuickBitVector;/** An immutable implementation of binary tries. *  * <P>Instance of this class are built starting from a lexicographically ordered * list of {@link cern.colt.bitvector.BitVector}s representing binary words. Each word * is assigned its position (starting from 0) in the list. The words are then organised in a * binary trie with path compression. *  * <p>Once the trie has been * built, it is possible to ask whether a word <var>w</var> is {@linkplain #get(BooleanIterator) contained in the trie} * (getting back its position in the list), the {@linkplain #getInterval(BooleanIterator) interval given by the words extending <var>w</var>} and the * {@linkplain #getApproximatedInterval(BooleanIterator) approximated interval defined by <var>w</var>}.   * @author Sebastiano Vigna * @since 0.9.2 * @deprecated Moved to <code>dsiutils</code>. */@Deprecatedpublic class ImmutableBinaryTrie implements Serializable {		private final static boolean ASSERTS = false;	public static final long serialVersionUID = 1L;		/** A node in the trie. */	protected static class Node implements Serializable {		private static final long serialVersionUID = 1L;		public Node left, right;		/** An array containing the path compacted in this node (<code>null</code> if there is no compaction at this node). */		final public long[] path;		/** The length of the path compacted in this node (0 if there is no compaction at this node). */		final public int pathLength;		/** If nonnegative, this node represent the <code>word</code>-th word. */		final public int word ;				/** Creates a node representing a word. 		 * 		 * <p>Note that the long array contained in <code>path</code> will be stored inside the node.		 * 		 * @param path the path compacted in this node, or <code>null</code> for the empty path.		 * @param word the index of the word represented by this node.		 */				public Node( final BitVector path, final int word ) {			if ( path == null ) {				this.path = null;				this.pathLength = 0;			}			else {				this.path = path.elements();				this.pathLength = path.size();			}			this.word = word;		}					/** Creates a node that does not represent a word. 		 * 		 * @param path the path compacted in this node, or <code>null</code> for the empty path.		 */		public Node( final BitVector path ) {			this( path, -1 );		}		/** Returns true if this node is a leaf.		 * 		 * @return  true if this node is a leaf.		 */		public boolean isLeaf() {			return right == null && left == null;		}				public String toString() {			return "[" + path + ", " + word + "]";		}			}			/** The root of the trie. */	protected final Node root;	/** The number of words in this trie. */	private int size;		/** Creates a trie from a list of binary words.	 * 	 * @param words a list of distinct, lexicographically increasing binary words.	 */		public ImmutableBinaryTrie( final List<BitVector> words ) {		int numWords = words.size();		// If there are at least two words, let us check that they are an increasing list.		if ( numWords > 1 ) {			BitVector prev = words.get( 0 ), curr;			for( ListIterator<BitVector> i = words.listIterator( 1 ); --numWords != 0; ) {				curr = i.next();				final int minLength = Math.min( prev.size(), curr.size() ); 				for( int j = 0; j < minLength; j++ ) 					if ( prev.get( j ) && ! curr.get ( j ) ) throw new IllegalArgumentException( "The provided bit vector list is not lexicographically increasing" );					else if ( ! prev.get( j ) && curr.get ( j ) ) break;				prev = curr;			}		}		root = buildTrie( words, 0 );	}	/** Builds a trie recursively. 	 * 	 * <p>The trie will contain the suffixes of words in <code>words</code> starting at <code>pos</code>.	 * 	 * @param words a list of words.	 * @param pos a starting position.	 * @return a trie containing the suffixes of words in <code>words</code> starting at <code>pos</code>.	 */			private Node buildTrie( final List<BitVector> words, final int pos ) {		// TODO: on-the-fly check for lexicographical order				if ( words.size() == 0 ) return null;		BitVector first = words.get( 0 ), curr;		int prefix = first.size(), change = -1, j;		// We rule out the case of a single word (it would work anyway, but it's faster)		if ( words.size() == 1 ) return new Node( pos < prefix ? first.partFromTo( pos, prefix - 1 ) : null, size++ );				// 	Find maximum common prefix. change records the point of change (for splitting the word set).		for( ListIterator<BitVector> i = words.listIterator( 1 ); i.hasNext(); ) {			curr = i.next();						if ( curr.size() < prefix ) prefix = curr.size(); 			for( j = pos; j < prefix; j++ ) if ( first.get( j ) != curr.get( j ) ) break;			if ( j < prefix ) {				change = i.previousIndex();				prefix = j;			}		}				final Node n;		if ( prefix == first.size() ) {			// Special case: the first word is the common prefix. We must store it in the node,			// and explicitly search for the actual change point, which is the first			// word with prefix-th bit true.			change = 1;			for( ListIterator<BitVector> i = words.listIterator( 1 ); i.hasNext(); ) {				curr = i.next();				if ( curr.get( prefix ) ) break;				change++;			}							n = new Node( prefix > pos ? first.partFromTo( pos, prefix - 1 ) : null, size++ );			n.left = buildTrie( words.subList( 1, change ), prefix + 1 );			n.right = buildTrie( words.subList( change, words.size() ), prefix + 1 );		}		else {			n = new Node( prefix > pos ? first.partFromTo( pos, prefix - 1 ) : null ); // There's some common prefix			n.left = buildTrie( words.subList( 0, change ), prefix + 1 );			n.right = buildTrie( words.subList( change, words.size() ), prefix + 1 );		}		return n;	}	/** Returns the number of binary words in this trie.	 * 	 * @return the number of binary words in this trie.	 */	public int size() {		return size;	}			/** Return the index of the specified word, or -1 if the word is not this trie.	 * 	 * @param word a word.	 * @return the index of the specified word, or -1 if the word is not this trie.	 * @see #get(BooleanIterator)	 */		public int get( final BitVector word ) {		final int length = word.size();		Node n = root;					int pos = 0; // Current position in word		long[] path;					while( n != null ) {			if ( pos == length ) return n.word;			path = n.path;			if ( path != null ) {				int minLength = Math.min( length - pos, n.pathLength ), i;				for( i = 0; i < minLength; i++ ) if ( word.get( pos + i ) != QuickBitVector.get( path, i ) ) break;				// Incompatible with current path.				if ( i < minLength ) return -1;							pos += i;				// Completely contained in the current path (note that n.word == -1 if this is not a word).				if ( pos == length ) return n.word;			}			n = word.get( pos++ ) ? n.right : n.left;			}		return -1;	}		/** Return the index of the word returned by the given iterator, or -1 if the word is not this trie.	 * 	 * @param iterator a boolean iterator that will be used to find a word in this trie.	 * @return the index of the specified word, or -1 if the word returned by the iterator is not this trie.	 * @see #get(BitVector)	 */		public int get( final BooleanIterator iterator ) {		Node n = root;		int pathLength;		long[] path;				while( n != null ) {			if ( ! iterator.hasNext() ) return n.word;			pathLength = n.pathLength;						if ( pathLength != 0 ) {				int i;				path = n.path;				for( i = 0; i < pathLength && iterator.hasNext(); i++ ) if ( iterator.nextBoolean() != QuickBitVector.get( path, i ) ) break;				// Incompatible with current path.				if ( i < pathLength ) return -1;							// Completely contained in the current path (note that n.word == -1 if this is not a word).				if ( ! iterator.hasNext() ) return n.word;			}			n = iterator.nextBoolean() ? n.right : n.left;						}		return -1;	}	/** Returns an interval given by the smallest and the largest word in the trie starting with the specified word.	 * 	 * @param word a word.	 * @return  an interval given by the smallest and the largest word in the trie 	 * that start with <code>word</code> (thus, the {@linkplain Intervals#EMPTY_INTERVAL empty inteval}	 * if no such words exist).	 * @see #getInterval(BooleanIterator)	 */			public Interval getInterval( final BitVector word ) {		final int length = word.size();		Node n = root;		long[] path;			int pos = 0; // Current position in word				while( n != null ) {			// We found the current path: we go searching for left and right delimiters.			if ( pos == length ) break;			path = n.path;						if ( path != null ) {				int maxLength = Math.min( length - pos, n.pathLength );				int i;				for( i = 0; i < maxLength; i++ ) if ( word.get( pos + i ) != QuickBitVector.get( path, i ) ) break;				// Incompatible with current path--we return the empty interval.				if ( i < maxLength ) return Intervals.EMPTY_INTERVAL;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -