📄 trecdocumentcollection.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	}

	/**
	 * Opens the named file for reading, wrapping it in a {@link GZIPInputStream}
	 * when this collection was created with <code>useGzip</code> set.
	 *
	 * @param fileName the name of the file to open.
	 * @return an input stream over the (possibly decompressed) file content.
	 * @throws IOException if the file cannot be opened or the gzip header cannot be read.
	 */
	private final InputStream openFileStream( String fileName ) throws IOException {
		final InputStream s = new FileInputStream( fileName );
		if ( ! useGzip ) return s;
		try {
			return new GZIPInputStream( s );
		}
		catch ( IOException e ) {
			// The GZIPInputStream constructor reads the header and may fail:
			// release the file handle instead of leaking it.
			try {
				s.close();
			}
			catch ( IOException ignored ) {
				// The header failure is the error worth reporting.
			}
			throw e;
		}
	}

	/** Creates a new TREC collection by parsing the given files.
	 *
	 * <p>Each file is opened with {@link #openFileStream(String)} and handed to
	 * <code>parseContent()</code> together with its index in <code>file</code>;
	 * progress is reported once per file.
	 *
	 * @param file an array of file names containing documents in TREC GOV2 format.
	 * @param factory the document factory (usually, a composite one).
	 * @param bufferSize the buffer size.
	 * @param useGzip true iff the files are gzipped.
	 * @throws IOException if a file cannot be opened or parsed.
	 */
	public TRECDocumentCollection( String[] file, DocumentFactory factory, int bufferSize, boolean useGzip ) throws IOException {
		this.file = file;
		this.factory = factory;
		this.bufferSize = bufferSize;
		this.descriptors = new ObjectArrayList<TRECDocumentDescriptor>();
		this.useGzip = useGzip;

		final ProgressLogger progressLogger = new ProgressLogger( LOGGER );
		progressLogger.expectedUpdates = file.length;
		progressLogger.itemsName = "files";

		progressLogger.start( "Parsing " + ( useGzip ? "GZip" : "plain" ) + " files" );

		for ( int i = 0; i < file.length; i++ ) {
			parseContent( i, openFileStream( file[ i ] ) );
			progressLogger.update();
		}

		progressLogger.done();
	}

	/** Returns the number of documents, that is, the number of stored descriptors. */
	public int size() {
		return descriptors.size();
	}

	/**
	 * Returns the <code>n</code>-th document, built by the factory from
	 * {@link #stream(int)} and {@link #metadata(int)}.
	 *
	 * @param n a document index.
	 * @return the document produced by the factory.
	 * @throws IOException if the underlying file cannot be opened.
	 */
	@SuppressWarnings("unchecked")
	public Document document( int n ) throws IOException {
		Reference2ObjectMap<Enum<?>, Object> metadata = metadata( n );
		return factory.getDocument( stream( n ), metadata );
	}

	/**
	 * Returns a stream over the <code>n</code>-th document.
	 *
	 * <p>The stream returned by the previous call, if any, is closed first, so
	 * only the most recently returned stream is usable.
	 *
	 * @param n a document index.
	 * @return a segmented input stream with a single block, the requested document.
	 * @throws IOException if the underlying file cannot be opened.
	 */
	public InputStream stream( final int n ) throws IOException {
		// Creates a Segmented Input Stream with only one segment in (the requested one).
		ensureDocumentIndex( n );
		if ( lastStream != null ) lastStream.close();
		final TRECDocumentDescriptor descr = descriptors.get( n );
		return lastStream = new SegmentedInputStream( openFileStream( file[ descr.fileIndex ] ), descr.toSegments() );
	}

	/**
	 * Returns the metadata map for the given document; it contains only
	 * {@link MetadataKeys#URI}, set to <code>"Document #" + index</code>.
	 *
	 * @param index a document index (passed to <code>ensureDocumentIndex()</code>).
	 * @return a freshly allocated metadata map.
	 */
	public Reference2ObjectMap<Enum<?>, Object> metadata( final int index ) {
		ensureDocumentIndex( index );
		final Reference2ObjectArrayMap<Enum<?>, Object> metadata = new Reference2ObjectArrayMap<Enum<?>, Object>( 4 );
		metadata.put( MetadataKeys.URI, "Document #" + index );
		return metadata;
	}

	/** Returns the document factory passed at construction time. */
	public DocumentFactory factory() {
		return this.factory;
	}

	/**
	 * Closes this collection: invokes the superclass method, closes the last
	 * stream returned by {@link #stream(int)} (if any) and drops the descriptors.
	 *
	 * @throws IOException if closing the last stream fails.
	 */
	public void close() throws IOException {
		super.close();
		if ( lastStream != null ) lastStream.close();
		descriptors = null;
	}

	/**
	 * Merges a new collection in this one, by appending the other collection's
	 * file array to the local one and appending a copy of each of its
	 * descriptors with the file index shifted accordingly.
	 * <p>
	 * It is supposed that the passed object contains no duplicates for the
	 * local collection.
	 *
	 * @param other the collection to be appended; it is not modified (its descriptors are cloned).
	 */
	public void merge( TRECDocumentCollection other ) {
		int oldLength = this.file.length;

		this.file = ObjectArrays.ensureCapacity( this.file, this.file.length + other.file.length );
		System.arraycopy( other.file, 0, this.file, oldLength, other.file.length );

		ObjectIterator<TRECDocumentDescriptor> iter = other.descriptors.iterator();
		while ( iter.hasNext() ) {
			final TRECDocumentDescriptor tdd = (TRECDocumentDescriptor)iter.next().clone();
			// File indices of the other collection now start after our own files.
			tdd.fileIndex += oldLength;
			this.descriptors.add( tdd );
		}
	}

	/**
	 * Returns an iterator over the documents in descriptor order.
	 *
	 * <p>The iterator opens each file once and adds to a single segmented stream
	 * one block per consecutive descriptor referring to that file. Each
	 * returned document is closed when the next one is requested.
	 *
	 * @return a document iterator over this collection.
	 * @throws IOException declared by the interface; no I/O happens before the first call to <code>nextDocument()</code>.
	 */
	public DocumentIterator iterator() throws IOException {
		return new AbstractDocumentIterator() {
			/**
			 * An iterator returning the descriptors of the documents in the
			 * enveloping collection.
			 */
			private final ObjectIterator<TRECDocumentDescriptor> descriptorIterator = descriptors.iterator();

			/** The current stream. */
			private SegmentedInputStream siStream;

			/** The current document. */
			private int currentDocument = 0;

			/** The last returned document. */
			private Document last;

			/**
			 * The first descriptor of the next file, already fetched from
			 * {@link #descriptorIterator} but not yet added to a stream, or
			 * <code>null</code> if no descriptor is pending.
			 */
			private TRECDocumentDescriptor firstNextDescriptor;

			/**
			 * Closes the current stream and opens a stream on the next file,
			 * adding one block for each consecutive descriptor of that file.
			 *
			 * @return true if a new stream was opened, false if there are no more documents.
			 */
			private boolean nextFile() throws FileNotFoundException, IOException {
				if ( size() == 0 ) return false;
				if ( siStream != null ) siStream.close();

				/*
				 * The pending descriptor must be consumed before consulting the
				 * iterator: if the last file contains a single document, its
				 * descriptor has already been fetched and the iterator is
				 * exhausted, but there is still a document to return.
				 */
				TRECDocumentDescriptor currentDescriptor;
				if ( firstNextDescriptor != null ) {
					currentDescriptor = firstNextDescriptor;
					firstNextDescriptor = null;
				}
				else if ( descriptorIterator.hasNext() ) currentDescriptor = descriptorIterator.next();
				else return false;

				/*
				 * We assume documents contained in the same gzip file are
				 * contiguous so we collect all of them until we find a different
				 * file index.
				 */
				final int currentFileIndex = currentDescriptor.fileIndex;

				if ( DEBUG ) LOGGER.debug( "Skipping to contents file " + currentFileIndex + " (" + file[ currentFileIndex ] + ")" );

				/*
				 * We create the segmented input stream with all just collected
				 * descriptors
				 */
				siStream = new SegmentedInputStream( openFileStream( file[ currentFileIndex ] ) );

				for ( ;; ) {
					siStream.addBlock( currentDescriptor.toSegments() );
					if ( ! descriptorIterator.hasNext() ) break;
					currentDescriptor = descriptorIterator.next();
					if ( currentDescriptor.fileIndex != currentFileIndex ) {
						// Belongs to the next file: keep it for the next invocation.
						firstNextDescriptor = currentDescriptor;
						break;
					}
				}

				return true;
			}

			/**
			 * Returns the next document, or <code>null</code> if there are no
			 * more documents. The previously returned document is closed.
			 */
			public Document nextDocument() throws IOException {
				/* If necessary, skip to the next segment, else, try skipping to the next gzip file. */
				if ( DEBUG ) LOGGER.debug( "nextDocument() has been called " );

				if ( last != null ) {
					last.close();
					if ( ! siStream.hasMoreBlocks() ) {
						if ( ! nextFile() ) return last = null;
					}
					else siStream.nextBlock();
				}
				else if ( ! nextFile() ) return null; // First call

				return last = factory.getDocument( siStream, metadata( currentDocument++ ) );
			}

			/** Closes the last returned document and the current stream, if a stream is open. */
			public void close() throws IOException {
				if ( siStream != null ) {
					if ( last != null ) last.close();
					super.close();
					siStream.close();
					siStream = null;
				}
			}
		};
	}

	/**
	 * Deserialises this collection: the default fields, followed by the number
	 * of descriptors and, for each descriptor, the four values written by
	 * {@link #writeObject(ObjectOutputStream)}.
	 */
	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
		s.defaultReadObject();

		final int size = s.readInt();
		final ObjectArrayList<TRECDocumentDescriptor> descriptors = new ObjectArrayList<TRECDocumentDescriptor>();
		descriptors.ensureCapacity( size );
		for ( int i = 0; i < size; i++ )
			descriptors.add( new TRECDocumentDescriptor( s.readInt(), s.readLong(), s.readInt(), s.readInt() ) );
		this.descriptors = descriptors;
	}

	/**
	 * Serialises this collection: the default fields, followed by the number of
	 * descriptors and, for each descriptor, its file index (int), start marker
	 * (long), intermediate marker difference (int) and stop marker difference (int).
	 */
	private void writeObject( final ObjectOutputStream s ) throws IOException {
		s.defaultWriteObject();

		s.writeInt( descriptors.size() );

		for ( TRECDocumentDescriptor descriptor : descriptors ) {
			s.writeInt( descriptor.fileIndex );
			s.writeLong( descriptor.startMarker );
			s.writeInt( descriptor.intermediateMarkerDiff );
			s.writeInt( descriptor.stopMarkerDiff );
		}
	}

	/**
	 * Command-line entry point: builds a collection from the files named on the
	 * command line (or, if none is given, read one per line from standard
	 * input) and stores it in serialised form under the given file name.
	 *
	 * @param arg the command-line arguments, parsed with JSAP.
	 */
	public static void main( final String[] arg ) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {

		SimpleJSAP jsap = new SimpleJSAP(
				TRECDocumentCollection.class.getName(), "Saves a serialised TREC document collection based on a set of file names (which will be sorted lexicographically).",
				new Parameter[] {
						new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ),
						new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),
						new Switch( "gzipped", 'z', "gzipped", "The files are gzipped." ),
						new Switch( "unsorted", 'u', "unsorted", "Keep the file list unsorted." ),
						new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
						new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the serialised collection." ),
						new UnflaggedOption( "file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "A list of files that will be indexed. If missing, a list of files will be read from standard input." )
		} );

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final DocumentFactory userFactory = PropertyBasedDocumentFactory.getInstance( jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ) );

		String[] file = jsapResult.getStringArray( "file" );
		if ( file.length == 0 ) {
			// No file on the command line: read the list from standard input, one name per line.
			final ObjectArrayList<String> files = new ObjectArrayList<String>();
			BufferedReader bufferedReader = new BufferedReader( new InputStreamReader( System.in ) );
			String s;
			while ( ( s = bufferedReader.readLine() ) != null ) files.add( s );
			file = files.toArray( new String[ 0 ] );
		}

		// To avoid problems with find and similar utilities, we sort the file names
		if ( !jsapResult.getBoolean( "unsorted" ) ) Arrays.sort( file );

		final DocumentFactory composite = CompositeDocumentFactory.getFactory( new TRECHeaderDocumentFactory(), userFactory );

		if ( file.length == 0 ) System.err.println( "WARNING: empty file set." );
		BinIO.storeObject( new TRECDocumentCollection( file, composite, jsapResult.getInt( "bufferSize" ), jsapResult.getBoolean( "gzipped" ) ), jsapResult.getString( "collection" ) );
	}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -