📄 termcollectionvisitor.java
字号:
package it.unimi.dsi.mg4j.search.visitor;

/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2007 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import java.io.IOException;

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Reference2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2IntMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.search.DocumentIterator;

import org.apache.log4j.Logger;

/** A visitor collecting information about terms appearing
 * in a {@link it.unimi.dsi.mg4j.search.DocumentIterator}.
 *
 * <P>The purpose of this visitor is that of exploring before iteration the structure
 * of a {@link DocumentIterator} to count how many terms are actually used, and set up some
 * preliminary access data. More precisely, we count the distinct pairs index/term
 * appearing in all leaves of nonzero frequency (the latter
 * condition is used to skip empty iterators). For this visitor to work, all leaves
 * of nonzero frequency must return a non-<code>null</code> value on
 * a call to {@link it.unimi.dsi.mg4j.index.IndexIterator#term()}.
 *
 * <p>During the visit, we keep track of which index/term pairs have been already
 * seen. Each pair is assigned a distinct <em>offset</em>&mdash;a number between
 * zero and the overall number of distinct pairs&mdash;which is stored into
 * each index iterator {@linkplain it.unimi.dsi.mg4j.index.IndexIterator#id() id}
 * and is used afterwards to access quickly data about the pair. Note that duplicate index/term pairs
 * get the same offset. The overall number of distinct pairs is returned
 * by {@link #numberOfPairs()} after a visit.
 *
 * <p>During the visit, the indices actually appearing in some nonzero-frequency
 * leaf are gathered; they are accessible as a vector returned
 * by {@link #indices()}, and the map from positions in this vector to indices
 * is inverted by {@link #indexMap()}.
 *
 * <p>The offset assigned to each pair index/term
 * is returned by {@link #offset(Index, String)}. Should you need to know the terms
 * associated to each index, they are returned by {@link #terms(Index)}.
 *
 * <p>After a term collection, counters are usually set
 * up by a visit of {@link it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor}.
 */
public class TermCollectionVisitor extends AbstractDocumentIteratorVisitor {
	private final static Logger LOGGER = Logger.getLogger( TermCollectionVisitor.class );
	private final static boolean DEBUG = false;

	/** The map from indices to maps from terms to offsets. The maps themselves are linked,
	 * so terms are always returned in the same order (the visit order). */
	private final Reference2ObjectMap<Index,Object2IntMap<String>> index2termMap;
	/** The overall number of pairs index/term. */
	private int numberOfPairs;
	/** The lazily built array of indices involved in this query, returned by {@link #indices()};
	 * <code>null</code> whenever it must be (re)built from {@link #indexMap}. */
	private Index[] index;
	/** A map from indices to positions in {@link #index}. */
	private final Reference2IntMap<Index> indexMap;

	/** Creates a new term-collection visitor. */
	public TermCollectionVisitor() {
		index2termMap = new Reference2ObjectOpenHashMap<Index,Object2IntMap<String>>();
		indexMap = new Reference2IntLinkedOpenHashMap<Index>( Hash.DEFAULT_INITIAL_SIZE, .5f );
	}

	/** Resets this visitor, discarding all data gathered so far.
	 *
	 * @return this visitor.
	 */
	public TermCollectionVisitor prepare() {
		index = null;
		index2termMap.clear();
		indexMap.clear();
		numberOfPairs = 0;
		return this;
	}

	/** Records the index/term pair of a leaf, and stores the offset of the pair
	 * into the leaf {@linkplain IndexIterator#id(int) id}.
	 *
	 * <p>Leaves of zero frequency, and leaves whose index has no counts, are ignored.
	 *
	 * @param indexIterator the leaf being visited.
	 * @return always true.
	 * @throws NullPointerException if a leaf that is not ignored returns a <code>null</code> term.
	 */
	public boolean visit( final IndexIterator indexIterator ) throws IOException {
		// We skip empty iterators...
		if ( indexIterator.frequency() <= 0 ) return true;

		final Index currentIndex = indexIterator.index();
		// ...and indices without counts. TODO: this condition should be checked elsewhere, maybe...
		if ( ! currentIndex.hasCounts ) return true;

		final String term = indexIterator.term();
		if ( term == null ) throw new NullPointerException( "This visitor needs a non-null term for each index iterator of nonzero frequency" );
		if ( DEBUG ) LOGGER.debug( "Visiting leaf: index=" + currentIndex + ", term=" + term );

		final Object2IntMap<String> termMap;
		if ( indexMap.containsKey( currentIndex ) ) termMap = index2termMap.get( currentIndex );
		else {
			// This index has never been seen before: its position is the number of indices seen so far.
			indexMap.put( currentIndex, indexMap.size() );
			// Lazy instantiation of the term map; -1 marks an unknown term.
			termMap = new Object2IntLinkedOpenHashMap<String>( Hash.DEFAULT_INITIAL_SIZE, .5f );
			termMap.defaultReturnValue( -1 );
			index2termMap.put( currentIndex, termMap );
			// An array previously handed out by indices() would miss this index: force a rebuild.
			index = null;
		}

		int offset = termMap.getInt( term );
		if ( offset == -1 ) {
			// Unknown index/term pair: it gets the next free offset.
			offset = numberOfPairs++;
			termMap.put( term, offset );
		}

		indexIterator.id( offset );
		if ( DEBUG ) LOGGER.debug( "Offset for index iterator " + indexIterator + ": " + offset );
		return true;
	}

	/** Returns the number of distinct index/term pairs corresponding to
	 * nonzero-frequency index iterators in the last visit.
	 *
	 * @return the number of distinct index/term pairs corresponding to
	 * nonzero-frequency index iterators.
	 */
	public int numberOfPairs() {
		return numberOfPairs;
	}

	/** Returns the indices met during pair collection.
	 *
	 * <p>Note that the returned array does not include indices only associated
	 * to index iterators of zero frequency. The array is cached: the same instance
	 * is returned until a new index is met or {@link #prepare()} is called.
	 *
	 * @return the indices met during term collection.
	 */
	public Index[] indices() {
		if ( index == null ) index = indexMap.keySet().toArray( new Index[ indexMap.size() ] );
		return index;
	}

	/** Returns a map from indices met during term collection to their position
	 * into {@link #indices()}.
	 *
	 * <p>Note that the returned map does not include indices only associated
	 * to index iterators of zero frequency.
	 *
	 * @return a map from indices met during term collection to their position
	 * into {@link #indices()}.
	 */
	public Reference2IntMap<Index> indexMap() {
		return indexMap;
	}

	/** Returns the terms associated to the given index.
	 *
	 * @param index an index.
	 * @return the terms associated to <code>index</code>, in the same order in which
	 * they appeared during the visit, skipping duplicates, if some nonzero-frequency iterator
	 * based on <code>index</code> was found; <code>null</code> otherwise.
	 */
	public String[] terms( final Index index ) {
		final Object2IntMap<String> termMap = index2termMap.get( index );
		if ( termMap == null ) return null;
		return termMap.keySet().toArray( new String[ termMap.size() ] );
	}

	/** Returns the offset associated to a given pair index/term.
	 *
	 * @param index an index appearing in {@link #indices()}.
	 * @param term a term appearing in the array returned by {@link #terms(Index)} with argument <code>index</code>.
	 * @return the offset associated to the pair <code>index</code>/<code>term</code>, or -1
	 * if <code>term</code> was never met for <code>index</code>.
	 * @throws NullPointerException if <code>index</code> was not met during term collection.
	 */
	public int offset( final Index index, final String term ) {
		return index2termMap.get( index ).getInt( term );
	}

	@Override
	public String toString() {
		return "[Leaves: " + numberOfPairs + "; " + index2termMap + "]";
	}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -