⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 termcollectionvisitor.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.search.visitor;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import java.io.IOException;import it.unimi.dsi.fastutil.Hash;import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;import it.unimi.dsi.fastutil.objects.Object2IntMap;import it.unimi.dsi.fastutil.objects.Reference2IntLinkedOpenHashMap;import it.unimi.dsi.fastutil.objects.Reference2IntMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.search.DocumentIterator;import org.apache.log4j.Logger;/** A visitor collecting information about terms appearing * in a {@link it.unimi.dsi.mg4j.search.DocumentIterator}. *  * <P>The purpose of this visitor is that of exploring before iteration the structure * of a {@link DocumentIterator} to count how many terms are actually used, and set up some * preliminary access data. More precisely, we count the distinct pairs index/term  * appearing in all leaves of nonzero frequency (the latter * condition is used to skip empty iterators). For this visitor to work, all leaves * of nonzero frequency must return a non-<code>null</code> value on  * a call to {@link it.unimi.dsi.mg4j.index.IndexIterator#term()}. *  * <p>During the visit, we keep track of which index/term pair have been already * seen. Each pair is assigned an distinct <em>offset</em>&mdash;a number between * zero and the overall number of distinct pairs&mdash;which is stored into * each index iterator {@linkplain it.unimi.dsi.mg4j.index.IndexIterator#id() id} * and is used afterwards to access quickly data about the pair. Note that duplicate index/term pairs * get the same offset. The overall number of distinct pairs is returned * by {@link #numberOfPairs()} after a visit.  *  * <p>During the visit, the indices actually appearing in some nonzero-frequency * leaf are gathered; they are accessible as a vector returned  * by {@link #indices()}, and the map from positions in this vector to indices * is inverted by {@link #indexMap()}. *  * <p>The offset assigned to each pair index/term * is returned by {@link #offset(Index, String)}. Should you need to know the terms * associated to each index, they are returned by {@link #terms(Index)}. *  * <p>The after a term collection, usually counters are set  * up by a visit of {@link it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor}. */public class TermCollectionVisitor extends AbstractDocumentIteratorVisitor {	private final static Logger LOGGER = Logger.getLogger( TermCollectionVisitor.class );	private final static boolean DEBUG = false;	/** The map from indices to maps from terms to offsets. The map themselves are linked,	 * so terms are always returned in the same order (the visit order). */	private final Reference2ObjectMap<Index,Object2IntMap<String>> index2termMap;	/** The overall number of pairs index/term. */	private int numberOfPairs;	/** The array of indices involved in this query, returned by {@link #indices()}. */ 	private Index[] index;	/** A map from indices to positions in {@link #index}. */	private Reference2IntMap<Index> indexMap;		/** Creates a new term-collection visitor. */		public TermCollectionVisitor() {		index2termMap = new Reference2ObjectOpenHashMap<Index,Object2IntMap<String>>();		indexMap = new Reference2IntLinkedOpenHashMap<Index>( Hash.DEFAULT_INITIAL_SIZE, .5f );	}		public TermCollectionVisitor prepare() {		index = null;		index2termMap.clear();		indexMap.clear();		numberOfPairs = 0;		return this;	}		public boolean visit( final IndexIterator indexIterator ) throws IOException {		// TODO: the second condition should be checked elsewhere, maybe...		if ( indexIterator.frequency() > 0 && indexIterator.index().hasCounts) { // We skip empty iterators and indices without counts 			final Index index = indexIterator.index();			final String term = indexIterator.term();						if ( term == null ) throw new NullPointerException( "This visitor needs a non-null term for each index iterator of nonzero frequency" );						if ( DEBUG ) LOGGER.debug( "Visiting leaf: index=" + index + ", term=" + term );						final Object2IntMap<String> termMap;			if ( ! indexMap.containsKey( index ) ) {				// This index has never been seen before				indexMap.put( index, indexMap.size() );				// Lazy instantiation of the term map				index2termMap.put( index, termMap = new Object2IntLinkedOpenHashMap<String>( Hash.DEFAULT_INITIAL_SIZE, .5f ) );				termMap.defaultReturnValue( -1 );			}			else termMap = index2termMap.get( index );						int offset = termMap.getInt( term );			if ( offset == -1 ) termMap.put( term, offset = numberOfPairs++ ); // Unknown index/term pair 			indexIterator.id( offset );			if ( DEBUG ) LOGGER.debug( "Offset for index iterator " + indexIterator + ": " + offset );		}		return true; 	}	/** Returns the number of distinct index/term pair corresponding to 	 * nonzero-frequency index iterators in the last visit.	 * 	 * @return the number distinct index/term pair corresponding to 	 * nonzero-frequency index iterators.	 */	public int numberOfPairs() {		return numberOfPairs;	}		/** Returns the indices met during pair collection.	 * 	 * <p>Note that the returned array does not include indices only associated	 * to index iterators of zero frequency.	 * 	 * @return the indices met during term collection.	 */	public Index[] indices() {		if ( index == null ) index = indexMap.keySet().toArray( new Index[ index2termMap.size() ] );		return index;	}		/** Returns a map from indices met during term collection to their position	 * into {@link #indices()}.	 * 	 * <p>Note that the returned array does not include indices only associated	 * to index iterators of zero frequency.	 * 	 * @return a map from indices met during term collection to their position	 * into {@link #indices()}.	 */	public Reference2IntMap<Index> indexMap() {		return indexMap;	}		/** Returns the terms associated to the given index.	 * 	 * @param index an index.	 * @return the terms associated to <code>index</code>, in the same order in which	 * they appeared during the visit, skipping duplicates, if some nonzero-frequency iterator	 * based on <code>index</code> was found; <code>null</code> otherwise.	 */	public String[] terms( final Index index ) {		final Object2IntMap<String> termMap = index2termMap.get( index );		return termMap == null ? null : termMap.keySet().toArray( new String[ termMap.size() ] );	}		/** Returns the offset associated to a given pair index/term.	 * 	 * @param index an index appearing in {@link #indices()}.	 * @param term a term appearing in the array returned by {@link #terms(Index)} with argument <code>index</code>. 	 * @return the offset associated to the pair <code>index</code>/<code>term</code>.	 */	public int offset( final Index index, final String term ) {		return index2termMap.get( index ).getInt( term );	}	public String toString() {		return "[Leaves: " + numberOfPairs + "; " + index2termMap + "]";	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -