📄 immutablebinarytrie.java
字号:
package it.unimi.dsi.mg4j.util;/* * MG4J: Managing Gigabytes for Java** Copyright (C) 2005-2007 Sebastiano Vigna ** This library is free software; you can redistribute it and/or modify it* under the terms of the GNU Lesser General Public License as published by the Free* Software Foundation; either version 2.1 of the License, or (at your option)* any later version.** This library is distributed in the hope that it will be useful, but* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License* for more details.** You should have received a copy of the GNU Lesser General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import it.unimi.dsi.fastutil.booleans.BooleanIterator;import it.unimi.dsi.mg4j.search.Interval;import it.unimi.dsi.mg4j.search.Intervals;import java.io.Serializable;import java.util.List;import java.util.ListIterator;import cern.colt.bitvector.BitVector;import cern.colt.bitvector.QuickBitVector;/** An immutable implementation of binary tries. * * <P>Instance of this class are built starting from a lexicographically ordered * list of {@link cern.colt.bitvector.BitVector}s representing binary words. Each word * is assigned its position (starting from 0) in the list. The words are then organised in a * binary trie with path compression. * * <p>Once the trie has been * built, it is possible to ask whether a word <var>w</var> is {@linkplain #get(BooleanIterator) contained in the trie} * (getting back its position in the list), the {@linkplain #getInterval(BooleanIterator) interval given by the words extending <var>w</var>} and the * {@linkplain #getApproximatedInterval(BooleanIterator) approximated interval defined by <var>w</var>}. * @author Sebastiano Vigna * @since 0.9.2 * @deprecated Moved to <code>dsiutils</code>. */@Deprecatedpublic class ImmutableBinaryTrie implements Serializable { private final static boolean ASSERTS = false; public static final long serialVersionUID = 1L; /** A node in the trie. */ protected static class Node implements Serializable { private static final long serialVersionUID = 1L; public Node left, right; /** An array containing the path compacted in this node (<code>null</code> if there is no compaction at this node). */ final public long[] path; /** The length of the path compacted in this node (0 if there is no compaction at this node). */ final public int pathLength; /** If nonnegative, this node represent the <code>word</code>-th word. */ final public int word ; /** Creates a node representing a word. * * <p>Note that the long array contained in <code>path</code> will be stored inside the node. * * @param path the path compacted in this node, or <code>null</code> for the empty path. * @param word the index of the word represented by this node. */ public Node( final BitVector path, final int word ) { if ( path == null ) { this.path = null; this.pathLength = 0; } else { this.path = path.elements(); this.pathLength = path.size(); } this.word = word; } /** Creates a node that does not represent a word. * * @param path the path compacted in this node, or <code>null</code> for the empty path. */ public Node( final BitVector path ) { this( path, -1 ); } /** Returns true if this node is a leaf. * * @return true if this node is a leaf. */ public boolean isLeaf() { return right == null && left == null; } public String toString() { return "[" + path + ", " + word + "]"; } } /** The root of the trie. */ protected final Node root; /** The number of words in this trie. */ private int size; /** Creates a trie from a list of binary words. * * @param words a list of distinct, lexicographically increasing binary words. */ public ImmutableBinaryTrie( final List<BitVector> words ) { int numWords = words.size(); // If there are at least two words, let us check that they are an increasing list. if ( numWords > 1 ) { BitVector prev = words.get( 0 ), curr; for( ListIterator<BitVector> i = words.listIterator( 1 ); --numWords != 0; ) { curr = i.next(); final int minLength = Math.min( prev.size(), curr.size() ); for( int j = 0; j < minLength; j++ ) if ( prev.get( j ) && ! curr.get ( j ) ) throw new IllegalArgumentException( "The provided bit vector list is not lexicographically increasing" ); else if ( ! prev.get( j ) && curr.get ( j ) ) break; prev = curr; } } root = buildTrie( words, 0 ); } /** Builds a trie recursively. * * <p>The trie will contain the suffixes of words in <code>words</code> starting at <code>pos</code>. * * @param words a list of words. * @param pos a starting position. * @return a trie containing the suffixes of words in <code>words</code> starting at <code>pos</code>. */ private Node buildTrie( final List<BitVector> words, final int pos ) { // TODO: on-the-fly check for lexicographical order if ( words.size() == 0 ) return null; BitVector first = words.get( 0 ), curr; int prefix = first.size(), change = -1, j; // We rule out the case of a single word (it would work anyway, but it's faster) if ( words.size() == 1 ) return new Node( pos < prefix ? first.partFromTo( pos, prefix - 1 ) : null, size++ ); // Find maximum common prefix. change records the point of change (for splitting the word set). for( ListIterator<BitVector> i = words.listIterator( 1 ); i.hasNext(); ) { curr = i.next(); if ( curr.size() < prefix ) prefix = curr.size(); for( j = pos; j < prefix; j++ ) if ( first.get( j ) != curr.get( j ) ) break; if ( j < prefix ) { change = i.previousIndex(); prefix = j; } } final Node n; if ( prefix == first.size() ) { // Special case: the first word is the common prefix. We must store it in the node, // and explicitly search for the actual change point, which is the first // word with prefix-th bit true. change = 1; for( ListIterator<BitVector> i = words.listIterator( 1 ); i.hasNext(); ) { curr = i.next(); if ( curr.get( prefix ) ) break; change++; } n = new Node( prefix > pos ? first.partFromTo( pos, prefix - 1 ) : null, size++ ); n.left = buildTrie( words.subList( 1, change ), prefix + 1 ); n.right = buildTrie( words.subList( change, words.size() ), prefix + 1 ); } else { n = new Node( prefix > pos ? first.partFromTo( pos, prefix - 1 ) : null ); // There's some common prefix n.left = buildTrie( words.subList( 0, change ), prefix + 1 ); n.right = buildTrie( words.subList( change, words.size() ), prefix + 1 ); } return n; } /** Returns the number of binary words in this trie. * * @return the number of binary words in this trie. */ public int size() { return size; } /** Return the index of the specified word, or -1 if the word is not this trie. * * @param word a word. * @return the index of the specified word, or -1 if the word is not this trie. * @see #get(BooleanIterator) */ public int get( final BitVector word ) { final int length = word.size(); Node n = root; int pos = 0; // Current position in word long[] path; while( n != null ) { if ( pos == length ) return n.word; path = n.path; if ( path != null ) { int minLength = Math.min( length - pos, n.pathLength ), i; for( i = 0; i < minLength; i++ ) if ( word.get( pos + i ) != QuickBitVector.get( path, i ) ) break; // Incompatible with current path. if ( i < minLength ) return -1; pos += i; // Completely contained in the current path (note that n.word == -1 if this is not a word). if ( pos == length ) return n.word; } n = word.get( pos++ ) ? n.right : n.left; } return -1; } /** Return the index of the word returned by the given iterator, or -1 if the word is not this trie. * * @param iterator a boolean iterator that will be used to find a word in this trie. * @return the index of the specified word, or -1 if the word returned by the iterator is not this trie. * @see #get(BitVector) */ public int get( final BooleanIterator iterator ) { Node n = root; int pathLength; long[] path; while( n != null ) { if ( ! iterator.hasNext() ) return n.word; pathLength = n.pathLength; if ( pathLength != 0 ) { int i; path = n.path; for( i = 0; i < pathLength && iterator.hasNext(); i++ ) if ( iterator.nextBoolean() != QuickBitVector.get( path, i ) ) break; // Incompatible with current path. if ( i < pathLength ) return -1; // Completely contained in the current path (note that n.word == -1 if this is not a word). if ( ! iterator.hasNext() ) return n.word; } n = iterator.nextBoolean() ? n.right : n.left; } return -1; } /** Returns an interval given by the smallest and the largest word in the trie starting with the specified word. * * @param word a word. * @return an interval given by the smallest and the largest word in the trie * that start with <code>word</code> (thus, the {@linkplain Intervals#EMPTY_INTERVAL empty inteval} * if no such words exist). * @see #getInterval(BooleanIterator) */ public Interval getInterval( final BitVector word ) { final int length = word.size(); Node n = root; long[] path; int pos = 0; // Current position in word while( n != null ) { // We found the current path: we go searching for left and right delimiters. if ( pos == length ) break; path = n.path; if ( path != null ) { int maxLength = Math.min( length - pos, n.pathLength ); int i; for( i = 0; i < maxLength; i++ ) if ( word.get( pos + i ) != QuickBitVector.get( path, i ) ) break; // Incompatible with current path--we return the empty interval. if ( i < maxLength ) return Intervals.EMPTY_INTERVAL;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -