📄 query.java
字号:
} else if ( part.length == 2 && "all".equals( part[ 1 ] ) ) { queryEngine.intervalSelector = new IntervalSelector(); // All intervals System.err.println( "Interval selection has been disabled (will compute all intervals)." ); } else { if ( part.length == 3 ) { try { maxIntervals = Integer.parseInt( part[ 1 ] ); maxLength = Integer.parseInt( part[ 2 ] ); queryEngine.intervalSelector = new IntervalSelector( maxIntervals, maxLength ); } catch( NumberFormatException e ) {} } if ( maxIntervals < 0 || maxLength < 0 ) System.err.println( "Missing or incorrect selector parameters." ); } break; case SCORE: final Scorer[] scorer = new Scorer[ part.length - 1 ]; final double[] weight = new double[ part.length - 1 ]; for ( i = 1; i < part.length; i++ ) try { weight[ i - 1 ] = loadClassFromSpec( part[ i ], scorer, i - 1 ); if ( weight[ i - 1 ] < 0 ) throw new IllegalArgumentException( "Weights should be non-negative" ); } catch ( Exception e ) { System.err.print( "Error while parsing specification: " ); e.printStackTrace( System.err ); break; } if ( i == part.length ) queryEngine.score( scorer, weight ); break; case EXPAND: if ( part.length > 2 ) System.err.println( "Wrong argument(s) to command" ); else if ( part.length == 1 ) { queryEngine.transformer( null ); } else { QueryTransformer[] t = new QueryTransformer[ 1 ]; try { loadClassFromSpec( part[ 1 ], t, 0 ); queryEngine.transformer( t[ 0 ] ); } catch ( Exception e ) { System.err.print( "Error while parsing specification: " ); e.printStackTrace( System.err ); break; } } break; case MPLEX: if ( part.length != 2 || ( part.length == 2 && !"on".equals( part[ 1 ] ) && !"off".equals( part[ 1 ] ) ) ) System.err.println( "Wrong argument(s) to command" ); else { if ( part.length > 1 ) queryEngine.multiplex = "on".equals( part[ 1 ] ); System.err.println( "Multiplex: " + part[ 1 ] ); } break; case DIVERT: if ( part.length > 2 ) System.err.println( "Wrong argument(s) to command" ); else { if ( output != System.out ) output.close(); try 
{ output = part.length == 1 ? System.out : new PrintStream( new FastBufferedOutputStream( new FileOutputStream( part[ 1 ] ) ) ); } catch ( FileNotFoundException e ) { System.err.println( "Cannot create file " + part[ 1 ] ); output = System.out; } } break; case WEIGHT: final Reference2DoubleOpenHashMap<Index> newIndex2Weight = new Reference2DoubleOpenHashMap<Index>(); for ( i = 1; i < part.length; i++ ) { final int pos = part[ i ].indexOf( ':' ); if ( pos < 0 ) { System.err.println( "Missing colon: " + part[ i ] ); break; } else if ( ! queryEngine.indexMap.containsKey( part[ i ].substring( 0, pos ) ) ) { System.err.println( "Unknown index: " + part[ i ].substring( 0, pos ) ); break; } try { double newWeight = Double.parseDouble( part[ i ].substring( pos + 1 ) ); newIndex2Weight.put( queryEngine.indexMap.get( part[ i ].substring( 0, pos ) ), newWeight ); } catch ( NumberFormatException e ) { System.err.println( "Wrong weight specification: " + part[ i ].substring( pos + 1 ) ); break; } } if ( i == part.length ) { if ( i > 1 ) queryEngine.setWeights( newIndex2Weight ); for( String key : queryEngine.indexMap.keySet() ) System.err.print( key + ":" + newIndex2Weight.getDouble( queryEngine.indexMap.get( key ) ) + " " ); System.err.println(); } break; case EQUALIZE: try { if ( part.length != 2 ) throw new NumberFormatException( "Illegal number of arguments" ); queryEngine.equalize( Integer.parseInt( part[ 1 ] ) ); System.err.println( "Equalization sample set to " + Integer.parseInt( part[ 1 ] ) ); } catch ( NumberFormatException e ) { System.err.println( e.getMessage() ); } break; case QUIT: return false; } return true; }

	/** Scores the given document iterator and produces score output.
	 *
	 * <p>In TREC mode one line per result is written to the current output stream
	 * (plus a dummy line if there are no results). In all other modes a header line
	 * with the document pointer, score and title (if available) is written for each
	 * result; in long and snippet mode it is followed by the selected intervals,
	 * snippets or payload content of each index for which information is available.
	 *
	 * <p>The document associated with a result is fetched lazily from the collection
	 * and closed exactly once, after all of its fields have been printed.
	 *
	 * @param results an iterator returning instances of {@link DocumentScoreInfo}.
	 * @param documentCollection an optional document collection, or <code>null</code>.
	 * @param titleList an optional list of titles, or <code>null</code>.
	 * @param marker an optional text marker to mark snippets, or <code>null</code>.
	 * @return the number of documents scanned.
	 * @throws IllegalStateException if the display mode is TREC and no title list was supplied.
	 * @throws IOException if accessing or closing a document fails.
	 */
	@SuppressWarnings("boxing")
	public int output( final ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index,SelectedInterval[]>>> results, final DocumentCollection documentCollection, final List<? extends CharSequence> titleList, final Marker marker ) throws IOException {
		int i;
		DocumentScoreInfo<Reference2ObjectMap<Index,SelectedInterval[]>> dsi;

		if ( displayMode == OutputType.TREC ) {
			if ( titleList == null ) throw new IllegalStateException( "You cannot use TREC mode without a title list" );
			for( i = 0; i < results.size(); i++ ) {
				dsi = results.get( i );
				output.println( trecTopicNumber + " Q0 " + titleList.get( dsi.document ) + " " + i + " " + FORMATTER.format( dsi.score ) + " " + trecRunTag );
			}

			// Horrible patch for no-answer queries (a workaround for TREC necessity of at least one result per query)
			if ( results.size() == 0 ) output.println( trecTopicNumber + " Q0 GX000-00-0000000 1 0 " + trecRunTag );
		}
		else {
			for ( i = 0; i < results.size(); i++ ) {
				dsi = results.get( i );
				final int document = dsi.document;

				output.print( "Document #" + document );
				output.printf( " [%.6f]", dsi.score );

				// Filled lazily; it must stay open until every field of this result has
				// been printed, so it is closed only in the finally block below.
				Document d = null;

				try {
					// We try to print a title, preferring the supplied title list if present
					if ( titleList != null ) output.println( " " + titleList.get( document ) );
					else if ( documentCollection != null ) {
						d = documentCollection.document( document );
						output.println( " " + d.title().toString().trim() );
					}
					else output.println();

					if ( ( displayMode == OutputType.LONG || displayMode == OutputType.SNIPPET ) && dsi.info != null && queryEngine.intervalSelector != null ) {
						final Index[] sortedIndex = dsi.info.keySet().toArray( new Index[ 0 ] );
						// With a collection at hand, indices are listed in the order of their fields in the factory
						if ( documentCollection != null ) Arrays.sort( sortedIndex, new Comparator<Index>() {
							public int compare( final Index i0, final Index i1 ) {
								return documentCollection.factory().fieldIndex( i0.field ) - documentCollection.factory().fieldIndex( i1.field );
							}} );

						for( Index index: sortedIndex ) {
							if ( index.hasPositions ) {
								final SelectedInterval[] interval = dsi.info.get( index );
								if ( interval == SelectedInterval.TRUE_ARRAY ) output.println( index + ": TRUE" );
								else if ( interval == SelectedInterval.FALSE_ARRAY ) output.println( index + ": FALSE" );
								else if ( displayMode == OutputType.LONG || documentCollection == null ) output.println( index + ": " + Arrays.toString( interval ) );
								else { // SNIPPET_MODE
									// Only textual fields known to the factory can be turned into snippets
									final int fieldIndex = documentCollection.factory().fieldIndex( index.field );
									if ( fieldIndex == -1 || documentCollection.factory().fieldType( fieldIndex ) != DocumentFactory.FieldType.TEXT ) continue;

									final MarkingMutableString s = new MarkingMutableString( marker );
									s.startField( interval );
									// TODO: this must be in increasing field order
									if ( d == null ) d = documentCollection.document( document );
									final Reader reader = (Reader)d.content( fieldIndex );
									s.appendAndMark( d.wordReader( fieldIndex ).setReader( reader ) );
									s.endField();
									output.println( index.field + ": " + s.toString() );
								}
							}
							// Payload content can be shown only if there is a collection to read it from
							else if ( index.hasPayloads && documentCollection != null && dsi.info.get( index ) == SelectedInterval.TRUE_ARRAY ) {
								final int fieldIndex = documentCollection.factory().fieldIndex( index.field );
								if ( fieldIndex == -1 ) continue;
								if ( d == null ) d = documentCollection.document( document );
								output.println( d.content( fieldIndex ) );
							}
						}
						output.println();
					}
				}
				finally {
					if ( d != null ) d.close();
				}
			}
		}
		return i;
	}

	/** Command-line entry point.
	 *
	 * <p>Parses the command line, loads the optional document collection and title list,
	 * loads the indices given as basename[:weight] specifications, and sets up a query
	 * engine with the default settings reported in the welcome message. If requested,
	 * an HTTP query server is started. Then lines are read from standard input (or from
	 * the file given with the <code>input</code> option) until end of input: lines
	 * starting with <code>$</code> are passed to <code>interpretCommand()</code>, empty
	 * lines are skipped, and every other line is processed as a query whose results are
	 * printed with {@link #output(ObjectArrayList, DocumentCollection, List, Marker)}.
	 *
	 * @param arg the command-line arguments.
	 * @throws Exception if loading the collection, the title list or the indices fails, or on I/O errors.
	 */
	@SuppressWarnings("unchecked")
	public static void main( final String[] arg ) throws Exception {

		SimpleJSAP jsap = new SimpleJSAP( Query.class.getName(), "Loads indices relative to a collection, possibly loads the collection, and answers to queries.",
			new Parameter[] {
				new FlaggedOption( "collection", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "collection", "The collection of documents indexed by the given indices." ),
				new FlaggedOption( "titleList", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 't', "titleList", "A serialized list of titles (only if collection is not specified)." ),
				new FlaggedOption( "input", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'I', "input", "A file containing the input." ),
				new Switch( "noSizes", 'n', "no-sizes", "Disable loading document sizes (they are necessary for BM25 scoring)." ),
				new Switch( "http", 'h', "http", "Starts an HTTP query server." ),
				new Switch( "verbose", 'v', "verbose", "Print full exception stack traces." ),
				new FlaggedOption( "itemClass", MG4JClassParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'i', "item-class", "The class that will handle item display in the HTTP server." ),
				new FlaggedOption( "itemMimeType", JSAP.STRING_PARSER, "text/html", JSAP.NOT_REQUIRED, 'm', "item-mime-type", "A MIME type suggested to the class handling item display in the HTTP server." ),
				new FlaggedOption( "port", JSAP.INTEGER_PARSER, "4242", JSAP.NOT_REQUIRED, 'p', "port", "The port on localhost where the server will appear." ),
				new UnflaggedOption( "basenameWeight", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The indices that the servlet will use. Indices are specified using their basename, optionally followed by a colon and a double representing the weight used to score results from that index. Indices without a specified weight are weighted 1." )
			});

		final JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final DocumentCollection documentCollection = (DocumentCollection)( jsapResult.getString( "collection" ) != null ? BinIO.loadObject( jsapResult.getString( "collection" ) ) : null );
		final List titleList = (List) ( jsapResult.getString( "titleList" ) != null ? BinIO.loadObject( jsapResult.getString( "titleList" ) ) : null );
		final String[] basenameWeight = jsapResult.getStringArray( "basenameWeight" );
		final Object2ReferenceLinkedOpenHashMap<String,Index> indexMap = new Object2ReferenceLinkedOpenHashMap<String,Index>( Hash.DEFAULT_INITIAL_SIZE, .5f );
		final Reference2DoubleOpenHashMap<Index> index2Weight = new Reference2DoubleOpenHashMap<Index>();
		final boolean verbose = jsapResult.getBoolean( "verbose" );
		final boolean loadSizes = ! jsapResult.getBoolean( "noSizes" );
		Query.loadIndicesFromSpec( basenameWeight, loadSizes, documentCollection, indexMap, index2Weight );

		// Each index alias is parsed using the term processor of the corresponding index
		final Object2ObjectOpenHashMap<String,TermProcessor> termProcessors = new Object2ObjectOpenHashMap<String,TermProcessor>( indexMap.size() );
		for( String alias: indexMap.keySet() ) termProcessors.put( alias, indexMap.get( alias ).termProcessor );

		// The first index specified on the command line is the default one
		final SimpleParser simpleParser = new SimpleParser( indexMap.keySet(), indexMap.firstKey(), termProcessors );

		final Reference2ReferenceMap<Index, Object> index2Parser = new Reference2ReferenceOpenHashMap<Index, Object>();
		/*
		// Fetch parsers for payload-based fields.
		for( Index index: indexMap.values() ) if ( index.hasPayloads ) {
			if ( index.payload.getClass() == DatePayload.class ) index2Parser.put( index, DateFormat.getDateInstance( DateFormat.SHORT, Locale.UK ) );
		}
		*/

		final QueryEngine queryEngine = new QueryEngine( simpleParser, new DocumentIteratorBuilderVisitor( indexMap, index2Parser, indexMap.get( indexMap.firstKey() ), MAX_STEMMING ), indexMap );
		queryEngine.setWeights( index2Weight );
		queryEngine.score( new Scorer[] { new BM25Scorer(), new VignaScorer() }, new double[] { 1, 1 } );
		// With a collection we select a few short intervals for snippeting; otherwise all intervals are computed
		queryEngine.intervalSelector = documentCollection != null ? new IntervalSelector( 4, 40 ) : new IntervalSelector();
		queryEngine.multiplex = true;
		queryEngine.equalize( 1000 );

		Query query = new Query( queryEngine );
		query.displayMode = OutputType.SNIPPET;

		String q;

		System.err.println( "Welcome to the MG4J query class (setup with $mode snippet, $score BM25Scorer VignaScorer, $mplex on, $equalize 1000, $select " + ( documentCollection != null ? "4 40" : "all" ) + ")" );
		System.err.println( "Please type $ for help." );

		String prompt = indexMap.keySet().toString() + ">";
		int n;

		HttpQueryServer httpQueryServer = null;
		if ( jsapResult.getBoolean( "http" ) ) httpQueryServer = new HttpQueryServer( queryEngine, documentCollection, jsapResult.getClass( "itemClass" ), jsapResult.getString( "itemMimeType" ), jsapResult.getInt( "port" ), titleList );

		// We remember whether we opened a file ourselves, so that we close the reader only in that case
		final boolean inputFromFile = jsapResult.userSpecified( "input" );
		BufferedReader br = null;

		try {
			br = new BufferedReader( new InputStreamReader( inputFromFile ? new FileInputStream( jsapResult.getString( "input" ) ) : System.in ) );
			final ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index,SelectedInterval[]>>> results = new ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index,SelectedInterval[]>>>();

			for ( ;; ) {
				System.out.print( prompt );
				q = br.readLine();
				if ( q == null ) {
					System.err.println();
					break; // CTRL-D
				}
				if ( q.length() == 0 ) continue;
				if ( q.charAt( 0 ) == '$' ) {
					// A command: interpretCommand() returns false when the user asks to quit
					if ( ! query.interpretCommand( q ) ) break;
					continue;
				}

				// Starting from minus the current time, adding the end time yields the elapsed milliseconds
				long time = -System.currentTimeMillis();

				try {
					n = queryEngine.process( q, 0, query.maxOutput, results );
				}
				catch( QueryParserException e ) {
					// The underlying cause is the interesting part, but it may be missing:
					// in that case we report the exception itself instead of failing.
					final Throwable t = e.getCause() != null ? e.getCause() : e;
					if ( verbose ) t.printStackTrace( System.err );
					else System.err.println( t );
					continue;
				}
				catch( Exception e ) {
					if ( verbose ) e.printStackTrace( System.err );
					else System.err.println( e );
					continue;
				}

				time += System.currentTimeMillis();
				query.output( results, documentCollection, titleList, TextMarker.TEXT_BOLDFACE );
				System.err.println( results.size() + " results; " + n + " documents examined; " + time + " ms; " + Util.format( ( n * 1000.0 ) / time ) + " documents/s" );
			}
		}
		finally {
			if ( httpQueryServer != null ) httpQueryServer.server.stop();
			if ( query.output != System.out ) query.output.close();
			if ( inputFromFile && br != null ) br.close();
		}
	}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -