📄 tfidfscorer.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.search.score;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Sebastiano Vigna * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntList;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.search.DocumentIterator;import it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor;import it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor;import it.unimi.dsi.mg4j.search.visitor.TermCollectionVisitor;import java.io.IOException;import java.util.Arrays;import org.apache.log4j.Logger;/** A scorer that implements the TF/IDF ranking formula. *  * <p>There are a number * of incarnations with small variations of the formula itself. Here, the weight * assigned to a term which appears in <var>f</var> documents out of a collection of <var>N</var> documents * w.r.t. to a document of length <var>l</var> in which the term appears <var>c</var> times is * <div style="text-align: center"> * log(<var>N</var> / <var>f</var>) <var>c</var> / <var>l</var>, * </div> *   * <p>This class uses a {@link it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor} * and related classes to take into consideration only terms that are actually involved * in the current document.  *  * @author Sebastiano Vigna */public class TfIdfScorer extends AbstractWeightedScorer implements DelegatingScorer {	private static final Logger LOGGER = Logger.getLogger( TfIdfScorer.class );	private static final boolean DEBUG = false;	/** The counter collection visitor used to estimate counts. */	private final CounterCollectionVisitor counterCollectionVisitor;	/** The counter setup visitor used to estimate counts. */	private final CounterSetupVisitor setupVisitor;	/** The term collection visitor used to estimate counts. */	private final TermCollectionVisitor termVisitor;	/** An array (parallel to {@link #currIndex}) that caches size lists. */	private IntList sizes[];	/** An array (parallel to {@link #currIndex}) used by {@link #score()} to cache the current document sizes. */	private int[] size;	/** An array indexed by offsets that caches the inverse document-frequency part of the formula, multiplied by the index weight. */	private double[] weightedIdfPart;		public TfIdfScorer() {		termVisitor = new TermCollectionVisitor();		setupVisitor = new CounterSetupVisitor( termVisitor );		counterCollectionVisitor = new CounterCollectionVisitor( setupVisitor );	}	public synchronized TfIdfScorer copy() {		final TfIdfScorer scorer = new TfIdfScorer();		scorer.setWeights( index2Weight );		return scorer;	}	public double score() throws IOException {		setupVisitor.clear();		documentIterator.acceptOnTruePaths( counterCollectionVisitor );				final int document = documentIterator.document();		final int[] count = setupVisitor.count;		final int[] indexNumber = setupVisitor.indexNumber;		final double[] weightedIdfPart = this.weightedIdfPart;		final int[] size = this.size;				for( int i = currIndex.length; i-- != 0; ) size[ i ] = sizes[ i ].getInt( document );		int k;		double score = 0;		for ( int i = count.length; i-- != 0; ) {			k = indexNumber[ i ];			score += (double)count[ i ] / size[ k ] * weightedIdfPart[ i ];		}		return score;	}	public double score( final Index index ) {		throw new UnsupportedOperationException();	}	public void wrap( DocumentIterator d ) throws IOException {		documentIterator = d;		termVisitor.prepare();		d.accept( termVisitor );		if ( DEBUG ) LOGGER.debug( "Term Visitor found " + termVisitor.numberOfPairs() + " leaves" );		// Note that we use the index array provided by the visitor, *not* by the iterator.		final Index[] index = termVisitor.indices();		if ( DEBUG ) LOGGER.debug( "Indices: " + Arrays.toString( index ) );		// Some caching of frequently-used values		sizes = new IntList[ index.length ];		for( int i = index.length; i-- != 0; )			if ( ( sizes[ i ] = index[ i ].sizes ) == null ) throw new IllegalStateException( "A BM25 scorer requires document sizes" );					setupVisitor.prepare();		d.accept( setupVisitor );		final int[] frequency = setupVisitor.frequency;		final int[] indexNumber = setupVisitor.indexNumber;				// We do all logs here, and multiply by the weight		weightedIdfPart = new double[ frequency.length ];		for( int i = weightedIdfPart.length; i-- != 0; )			weightedIdfPart[ i ] = Math.log( index[ indexNumber[ i ] ].numberOfDocuments / (double)frequency[ i ] ) * index2Weight.getDouble( index[ indexNumber[ i ] ] );		size = new int[ index.length ];		currIndex = index;	}		public boolean usesIntervals() {		return false;	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -