📄 query.java
字号:
package it.unimi.dsi.mg4j.query;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.Hash;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;import it.unimi.dsi.fastutil.objects.Object2ReferenceLinkedOpenHashMap;import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.Reference2DoubleMap;import it.unimi.dsi.fastutil.objects.Reference2DoubleOpenHashMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceOpenHashMap;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentCollection;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.TermProcessor;import it.unimi.dsi.mg4j.query.nodes.QueryTransformer;import it.unimi.dsi.mg4j.query.parser.QueryParserException;import it.unimi.dsi.mg4j.query.parser.SimpleParser;import it.unimi.dsi.mg4j.search.DocumentIteratorBuilderVisitor;import it.unimi.dsi.mg4j.search.score.BM25Scorer;import it.unimi.dsi.mg4j.search.score.DocumentScoreInfo;import it.unimi.dsi.mg4j.search.score.Scorer;import it.unimi.dsi.mg4j.search.score.VignaScorer;import it.unimi.dsi.mg4j.util.MG4JClassParser;import java.io.BufferedReader;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.PrintStream;import java.io.Reader;import java.lang.reflect.Constructor;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Arrays;import java.util.Comparator;import java.util.List;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** A command-line interpreter to query indices. * * <p>This class can be used to start a {@linkplain it.unimi.dsi.mg4j.query.QueryEngine query engine} * from the command line. Optionally, it can * start a {@linkplain HttpQueryServer web server} that will serve the results in a * search-engine-like environment. Changes * to the query engine made on the command line will reflect on subsequent queries (also on the * web server). The web server access is fully multithreaded. * * <p>This class does not provide command-line history or editing: to get that effect, * we suggest to rely on some operating-system utility such as * <a href="http://utopia.knoware.nl/~hlub/uck/rlwrap/"><samp>rlwrap</samp></a>. * * <p><strong>Warning:</strong> This class is <strong>highly experimental</strong> (it is the * place that we tweak to experiment every kind of new indexing/ranking method). */public class Query { private static final Logger LOGGER = Util.getLogger( Query.class ); /** A formatter for TREC results. */ private static final java.text.NumberFormat FORMATTER = new java.text.DecimalFormat( "0.0000000000" ); public final static int MAX_STEMMING = 1024; public static enum Command { MODE, LIMIT, SELECT, SCORE, MPLEX, EXPAND, DIVERT, WEIGHT, EQUALIZE, QUIT } public static enum OutputType { /** Display just timings. */ TIME, /** Display document pointers, but not intervals. */ SHORT, /** Display document pointers and not intervals (requires an index with positions). */ LONG, /** Display document pointers and snippets (requires an index with positions and a collection). */ SNIPPET, /** Display results in TREC format. */ TREC; } /** The maximum number of items output to the console. */ private int maxOutput = 10; /** Current topic number, for {@link OutputType#TREC} only. */ private int trecTopicNumber; /** Current run tag, for {@link OutputType#TREC} only. */ private String trecRunTag; /** The current display mode. */ private OutputType displayMode = OutputType.SHORT; /** The current output stream, changeable with <samp>$divert</samp>. */ private PrintStream output = System.out; /** The current query engine. */ private final QueryEngine queryEngine; public Query( final QueryEngine queryEngine ) { this.queryEngine = queryEngine; } /** Parses a given array of index URIs/weights, loading the correspoding indices * and writing the result of parsing in the given maps. * * @param basenameWeight an array of index URIs of the form <samp><var>uri</var>[:<var>weight</var>]</samp>, specifying * the URI of an index and the weight for the index (1, if missing). * @param loadSizes forces size loading. * @param documentCollection an optional document collection, or <code>null</code>. * @param name2Index an empty, writable map that will be filled with pairs given by an index basename (or field name, if available) and an {@link Index}. * @param index2Weight an empty, writable map that will be filled with a map from indices to respective weights. */ private static void loadIndicesFromSpec( final String[] basenameWeight, boolean loadSizes, final DocumentCollection documentCollection, final Object2ReferenceMap<String,Index> name2Index, final Reference2DoubleMap<Index> index2Weight ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { for ( int i = 0; i < basenameWeight.length; i++ ) { // We must be careful, as ":" is used by Windows to separate the device from the path. final int split = basenameWeight[ i ].lastIndexOf( ':' ); double weight = 1; if ( split != -1 ) { try { weight = Double.parseDouble( basenameWeight[ i ].substring( split + 1 ) ); } catch( NumberFormatException e ) {} } final Index index; if ( split == -1 || basenameWeight[ i ].startsWith("mg4j://") ) { index = Index.getInstance( basenameWeight[ i ], true, loadSizes ); index2Weight.put( index, 1 ); } else { index = Index.getInstance( basenameWeight[ i ].substring( 0, split ) ); index2Weight.put( index, weight ); } if ( documentCollection != null && index.numberOfDocuments != documentCollection.size() ) LOGGER.warn( "Index " + index + " has " + index.numberOfDocuments + " documents, but the document collection has size " + documentCollection.size() ); name2Index.put( index.field != null ? index.field : basenameWeight[ i ], index ); } } /** Parses a specification of the form <samp>class(<arg>,…)[:weight]</samp> and returns the weight * (1 if missing) as result, assigning the just created object in the given index of the given array. * The arguments are all considered as strings. * * @param spec the specification. * @param array the array where the object is going to be stored. * @param index the offset within the array. * @return the weight (1 if missing). */ @SuppressWarnings("unchecked") private static <S> double loadClassFromSpec( String spec, final S[] array, final int index ) throws IllegalArgumentException, InstantiationException, IllegalAccessException, InvocationTargetException { int pos = spec.indexOf( ':' ); Class<S> type = (Class<S>)array.getClass().getComponentType(); double weightSpec = 1; if ( pos >= 0 ) { try { weightSpec = Double.parseDouble ( spec.substring( pos + 1 ) ); } catch ( NumberFormatException e ) { throw new IllegalArgumentException( "Malformed weight " + spec.substring( 0, pos ) ); } spec = spec.substring( 0, pos ); } int endOfName = spec.indexOf( '(' ); if ( endOfName < 0 ) endOfName = spec.length(); Class<? extends S> scorerClass = null; try { scorerClass = (Class<? extends S>)Class.forName (spec.substring(0, endOfName)); if ( ! type.isAssignableFrom( scorerClass ) ) throw new ClassCastException( "Class " + scorerClass.getSimpleName() + " is not assignable to " + type ); } catch ( ClassNotFoundException e ) { try { scorerClass = (Class<? extends S>)Class.forName( "it.unimi.dsi.mg4j.search.score." + spec.substring( 0, endOfName ) ); if ( ! type.isAssignableFrom( scorerClass ) ) throw new ClassCastException( "Class " + scorerClass.getSimpleName() + " is not assignable to " + type ); } catch ( ClassNotFoundException e1 ) { throw new IllegalArgumentException( "Unknown or improper class " + "[it.unimi.dsi.mg4j.search.score.]" + spec.substring( 0, endOfName ) ); } } String[] args = new String[ 0 ]; if ( endOfName < spec.length() ) { if ( spec.charAt( spec.length() - 1 ) != ')' ) throw new IllegalArgumentException( ") missing at the end of argument list" ); args = spec.substring( endOfName + 1 , spec.length() - 1 ).split( "," ); } Class[] argTypes = new Class[ args.length ]; for ( int i = 0; i < argTypes.length; i++ ) argTypes[ i ] = String.class; Constructor constr; try { constr = scorerClass.getConstructor( argTypes ); } catch ( Exception e ) { throw new IllegalArgumentException( "No constructor with " + argTypes.length + " strings as argument for class " + scorerClass.getName() + ": " + e ); } array[ index ] = (S)constr.newInstance( (Object [])args ); return weightSpec; } /** Interpret the given command, changing the static variables. * See the help printing code for possible commands. * * @param line the command line. * @return false iff we should exit after this command. */ public boolean interpretCommand( final String line ) { String[] part = line.substring( 1 ).split( "[ \t\n\r]+" ); final Command command; int i; if ( part[ 0 ].length() == 0 ) { System.err.println( "$ prints this help." ); System.err.println( "$mode [time|short|long|snippet|trec <topicNo> <runTag>] chooses display mode." ); System.err.println( "$select [<maxIntervals> <maxLength>] [all] installs or removes an interval selector." ); System.err.println( "$limit <max> output at most <max> results per query." ); System.err.println( "$divert [<filename>] diverts output to <filename> or to stdout." ); System.err.println( "$weight {index:weight} set index weights (unspecified weights are set to 1)." ); System.err.println( "$mplex [<on>|<off>] set/unset multiplex mode." ); System.err.println( "$equalize <sample> equalize scores using the given sample size." ); System.err.println( "$score {<scorerClass>(<arg>,...)[:<weight>]} order documents according to <scorerClass>." ); System.err.println( "$expand {<expanderClass>(<arg>,...)} expand terms and prefixes according to <expanderClass>." ); System.err.println( "$quit quits." ); return true; } try { command = Command.valueOf( part[ 0 ].toUpperCase() ); } catch( IllegalArgumentException e ) { System.err.println( "Invalid command \"" + part[ 0 ] + "\"; type $ for help." ); return true; } switch( command ) { case MODE: if ( part.length >= 2 ) { try { final OutputType tempMode = OutputType.valueOf( part[ 1 ].toUpperCase() ); if ( tempMode != OutputType.TREC && part.length > 2 ) System.err.println( "Extra arguments." ); else if ( tempMode == OutputType.TREC && part.length != 4 ) System.err.println( "Missing or extra arguments." ); else { displayMode = tempMode; if ( displayMode == OutputType.TREC ) { trecTopicNumber = Integer.parseInt( part[ 2 ] ); trecRunTag = part[ 3 ]; } } } catch( IllegalArgumentException e ) { System.err.println( "Unknown mode: " + part[ 1 ] ); } } else System.err.println( "Missing mode." ); break; case LIMIT: int out = -1; if ( part.length == 2 ) { try { out = Integer.parseInt( part[ 1 ] ); } catch( NumberFormatException e ) {} if ( out >= 0 ) maxOutput = out; } if ( out < 0 ) System.err.println( "Missing or incorrect limit." ); break; case SELECT: int maxIntervals = -1, maxLength = -1; if ( part.length == 1 ) { queryEngine.intervalSelector = null; System.err.println( "Intervals have been disabled." );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -