📄 trecdocumentcollection.java
字号:
}

    /**
     * Opens one of the collection files for reading, adding gzip decompression
     * when this collection was created with <code>useGzip</code> set.
     *
     * @param fileName the name of the file to open.
     * @return a stream over the (possibly decompressed) content of the file.
     * @throws IOException if the file cannot be opened, or if the gzip wrapper cannot be created.
     */
    private final InputStream openFileStream( String fileName ) throws IOException {
        final InputStream s = new FileInputStream( fileName );
        if ( ! useGzip ) return s;
        try {
            return new GZIPInputStream( s );
        }
        catch ( IOException e ) {
            // The gzip wrapper was never created, so nobody else will close the raw file stream.
            try {
                s.close();
            }
            catch ( IOException ignored ) {
                // Ignored on purpose: the original failure is the one worth reporting.
            }
            throw e;
        }
    }

    /** Creates a new TREC collection by parsing the given files.
     *
     * @param file an array of file names containing documents in TREC GOV2 format.
     * @param factory the document factory (usually, a composite one).
     * @param bufferSize the buffer size.
     * @param useGzip true iff the files are gzipped.
     * @throws IOException if a file cannot be opened or parsed.
     */
    public TRECDocumentCollection( String[] file, DocumentFactory factory, int bufferSize, boolean useGzip ) throws IOException {
        this.file = file;
        this.factory = factory;
        this.bufferSize = bufferSize;
        this.descriptors = new ObjectArrayList<TRECDocumentDescriptor>();
        this.useGzip = useGzip;

        final ProgressLogger progressLogger = new ProgressLogger( LOGGER );
        progressLogger.expectedUpdates = file.length;
        progressLogger.itemsName = "files";
        progressLogger.start( "Parsing " + ( useGzip ? "GZip" : "plain" ) + " files" );
        for ( int i = 0; i < file.length; i++ ) {
            // NOTE(review): the stream is handed over to parseContent(); presumably it closes it -- confirm.
            parseContent( i, openFileStream( file[ i ] ) );
            progressLogger.update();
        }
        progressLogger.done();
    }

    /** Returns the number of document descriptors held by this collection.
     *
     * @return the number of documents.
     */
    public int size() {
        return descriptors.size();
    }

    /** Builds the document of given index by passing its stream and metadata to the factory.
     *
     * @param n a document index.
     * @return the document produced by the factory of this collection.
     * @throws IOException if the underlying file cannot be opened or read.
     */
    @SuppressWarnings("unchecked")
    public Document document( int n ) throws IOException {
        Reference2ObjectMap<Enum<?>, Object> metadata = metadata( n );
        return factory.getDocument( stream( n ), metadata );
    }

    /** Returns a stream over the document of given index.
     *
     * <p>The stream returned by the previous call, if any, is closed first: at most
     * one stream obtained from this method is open at any time.
     *
     * @param n a document index.
     * @return a segmented stream containing only the requested document.
     * @throws IOException if the underlying file cannot be opened.
     */
    public InputStream stream( final int n ) throws IOException {
        // Creates a Segmented Input Stream with only one segment in (the requested one).
        ensureDocumentIndex( n );
        if ( lastStream != null ) lastStream.close();
        final TRECDocumentDescriptor descr = descriptors.get( n );
        return lastStream = new SegmentedInputStream( openFileStream( file[ descr.fileIndex ] ), descr.toSegments() );
    }

    /** Returns the metadata of the document of given index.
     *
     * <p>The only entry provided here is {@link MetadataKeys#URI}, set to a
     * synthetic string built from the document index.
     *
     * @param index a document index.
     * @return a newly created metadata map.
     */
    public Reference2ObjectMap<Enum<?>, Object> metadata( final int index ) {
        ensureDocumentIndex( index );
        final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>( 4 );
        metadata.put( MetadataKeys.URI, "Document #" + index );
        return metadata;
    }

    /** Returns the document factory of this collection.
     *
     * @return the factory passed at construction time.
     */
    public DocumentFactory factory() {
        return this.factory;
    }

    /** Closes this collection, closing the last stream handed out by
     * {@link #stream(int)} (if any) and dropping the descriptor list.
     *
     * @throws IOException if closing the superclass or the last stream fails.
     */
    public void close() throws IOException {
        super.close();
        if ( lastStream != null ) lastStream.close();
        descriptors = null;
    }

    /**
     * Merges another collection into this one: the file array of the other
     * collection is appended to the local one, and a clone of each of its
     * descriptors is appended to the local descriptors with the file index
     * shifted accordingly.
     * <p>
     * It is supposed that the passed object contains no duplicates for the
     * local collection.
     *
     * @param other the collection to be appended to this one.
     */
    public void merge( TRECDocumentCollection other ) {
        int oldLength = this.file.length;

        this.file = ObjectArrays.ensureCapacity( this.file, this.file.length + other.file.length );
        System.arraycopy( other.file, 0, this.file, oldLength, other.file.length );

        ObjectIterator<TRECDocumentDescriptor> iter = other.descriptors.iterator();
        while ( iter.hasNext() ) {
            final TRECDocumentDescriptor tdd = (TRECDocumentDescriptor)iter.next().clone();
            // File indices of the other collection are relative to its own array.
            tdd.fileIndex += oldLength;
            this.descriptors.add( tdd );
        }
    }

    /** Returns an iterator over the documents of this collection, in descriptor order.
     *
     * <p>Each file is opened once, and all its documents are read from a single
     * segmented stream.
     *
     * @return a document iterator.
     * @throws IOException declared by the signature; nothing in this method body throws it.
     */
    public DocumentIterator iterator() throws IOException {
        return new AbstractDocumentIterator() {
            /**
             * An iterator returning the descriptors of the documents in the
             * enveloping collection.
             */
            private final ObjectIterator<TRECDocumentDescriptor> descriptorIterator = descriptors.iterator();

            /** The current stream. */
            private SegmentedInputStream siStream;

            /** The current document. */
            private int currentDocument = 0;

            /** The last returned document. */
            private Document last;

            /** The first descriptor of the next file, already fetched from {@link #descriptorIterator}
             * but not yet added to any stream, or <code>null</code> if there is no such descriptor. */
            private TRECDocumentDescriptor firstNextDescriptor;

            /** Opens the next file and registers all of its documents as blocks of a new segmented stream.
             *
             * @return true if a new file has been opened, false if there are no more documents.
             */
            private boolean nextFile() throws FileNotFoundException, IOException {
                if ( size() == 0 ) return false;
                if ( siStream != null ) siStream.close();

                /*
                 * A descriptor left pending by the previous call must be served
                 * before consulting the iterator: when the last file contains a
                 * single document, the iterator is already exhausted but that
                 * document has not been returned yet.
                 */
                TRECDocumentDescriptor currentDescriptor;
                if ( firstNextDescriptor != null ) {
                    currentDescriptor = firstNextDescriptor;
                    firstNextDescriptor = null;
                }
                else if ( descriptorIterator.hasNext() ) currentDescriptor = descriptorIterator.next();
                else return false;

                /*
                 * We assume documents contained in the same gzip file are
                 * contiguous so we collect all of them until we find a different
                 * file index.
                 */
                final int currentFileIndex = currentDescriptor.fileIndex;

                if ( DEBUG ) LOGGER.debug( "Skipping to contents file " + currentFileIndex + " (" + file[ currentFileIndex ] + ")" );

                /*
                 * We create the segmented input stream with all just collected
                 * descriptors
                 */
                siStream = new SegmentedInputStream( openFileStream( file[ currentFileIndex ] ) );

                for ( ;; ) {
                    siStream.addBlock( currentDescriptor.toSegments() );
                    if ( ! descriptorIterator.hasNext() ) break;
                    currentDescriptor = descriptorIterator.next();
                    if ( currentDescriptor.fileIndex != currentFileIndex ) {
                        // Belongs to another file: keep it for the next invocation.
                        firstNextDescriptor = currentDescriptor;
                        break;
                    }
                }

                return true;
            }

            /** Returns the next document, closing the previously returned one.
             *
             * @return the next document, or <code>null</code> if there are no more documents.
             */
            public Document nextDocument() throws IOException {
                /* If necessary, skip to the next segment, else, try skipping to the next gzip file. */
                if ( DEBUG ) LOGGER.debug( "nextDocument() has been called " );

                if ( last != null ) {
                    last.close();
                    if ( ! siStream.hasMoreBlocks() ) {
                        if ( ! nextFile() ) return last = null;
                    }
                    else siStream.nextBlock();
                }
                else if ( ! nextFile() ) return null; // First call

                return last = factory.getDocument( siStream, metadata( currentDocument++ ) );
            }

            /** Closes the last returned document (if any) and the current stream.
             * Nothing happens if no stream is currently open. */
            public void close() throws IOException {
                if ( siStream != null ) {
                    if ( last != null ) last.close();
                    super.close();
                    siStream.close();
                    siStream = null;
                }
            }
        };
    }

    /** Deserialises this collection: after the default fields, reads the number of
     * descriptors followed by four values (int, long, int, int) per descriptor,
     * in the same order in which {@link #writeObject(ObjectOutputStream)} emits them.
     */
    private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
        s.defaultReadObject();

        final int size = s.readInt();
        final ObjectArrayList<TRECDocumentDescriptor> descriptors = new ObjectArrayList<TRECDocumentDescriptor>();
        descriptors.ensureCapacity( size );
        for ( int i = 0; i < size; i++ ) descriptors.add( new TRECDocumentDescriptor( s.readInt(), s.readLong(), s.readInt(), s.readInt() ) );
        this.descriptors = descriptors;
    }

    /** Serialises this collection: after the default fields, writes the number of
     * descriptors followed, for each descriptor, by its file index, start marker,
     * intermediate marker difference and stop marker difference.
     */
    private void writeObject( final ObjectOutputStream s ) throws IOException {
        s.defaultWriteObject();

        s.writeInt( descriptors.size() );
        for ( TRECDocumentDescriptor descriptor : descriptors ) {
            s.writeInt( descriptor.fileIndex );
            s.writeLong( descriptor.startMarker );
            s.writeInt( descriptor.intermediateMarkerDiff );
            s.writeInt( descriptor.stopMarkerDiff );
        }
    }

    /** Command-line entry point: builds a collection from a list of files (given as
     * arguments or, if none is given, read one per line from standard input) and
     * stores it in serialised form.
     *
     * @param arg the command-line arguments, parsed with JSAP.
     */
    public static void main( final String[] arg ) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {

        SimpleJSAP jsap = new SimpleJSAP( TRECDocumentCollection.class.getName(), "Saves a serialised TREC document collection based on a set of file names (which will be sorted lexicographically).",
            new Parameter[] {
                new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ),
                new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),
                new Switch( "gzipped", 'z', "gzipped", "The files are gzipped." ),
                new Switch( "unsorted", 'u', "unsorted", "Keep the file list unsorted." ),
                new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
                new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the serialised collection." ),
                new UnflaggedOption( "file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "A list of files that will be indexed. If missing, a list of files will be read from standard input." )
            }
        );

        JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        final DocumentFactory userFactory = PropertyBasedDocumentFactory.getInstance( jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ) );

        String[] file = jsapResult.getStringArray( "file" );
        if ( file.length == 0 ) {
            // No file on the command line: read the list from standard input, one name per line.
            final ObjectArrayList<String> files = new ObjectArrayList<String>();
            BufferedReader bufferedReader = new BufferedReader( new InputStreamReader( System.in ) );
            String s;
            while ( ( s = bufferedReader.readLine() ) != null ) files.add( s );
            file = files.toArray( new String[ 0 ] );
        }

        // To avoid problems with find and similar utilities, we sort the file names
        if ( !jsapResult.getBoolean( "unsorted" ) ) Arrays.sort( file );

        final DocumentFactory composite = CompositeDocumentFactory.getFactory( new TRECHeaderDocumentFactory(), userFactory );

        if ( file.length == 0 ) System.err.println( "WARNING: empty file set." );
        BinIO.storeObject( new TRECDocumentCollection( file, composite, jsapResult.getInt( "bufferSize" ), jsapResult.getBoolean( "gzipped" ) ), jsapResult.getString( "collection" ) );
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -