📄 diskbasedindex.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
		catch( Exception ignore ) {}		File indexFile = new File( basename + INDEX_EXTENSION );		if ( ! indexFile.exists() ) throw new FileNotFoundException( "Cannot find index file " + indexFile.getName() );				final Map<Component,Coding> flags = CompressionFlags.valueOf( properties.getStringArray( Index.PropertyKeys.CODING ), null );		final int numberOfDocuments = properties.getInt( Index.PropertyKeys.DOCUMENTS ); 		final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS );		final long numberOfPostings= properties.getLong( Index.PropertyKeys.POSTINGS ); 		final long numberOfOccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES );		final int maxCount = properties.getInt( Index.PropertyKeys.MAXCOUNT, -1 );		final String field = properties.getString( Index.PropertyKeys.FIELD );		if ( termMap != null && termMap.size() != numberOfTerms ) throw new IllegalArgumentException( "The size of the term map (" + termMap.size() + ") is not equal to the number of terms (" + numberOfTerms + ")" );		if ( prefixMap != null && prefixMap.size() != numberOfTerms ) throw new IllegalArgumentException( "The size of the prefix map (" + prefixMap.size() + ") is not equal to the number of terms (" + numberOfTerms + ")" );		final Payload payload = (Payload)( properties.containsKey( Index.PropertyKeys.PAYLOADCLASS ) ? Class.forName( properties.getString( Index.PropertyKeys.PAYLOADCLASS ) ).newInstance() : null );		final Coding frequencyCoding = flags.get( Component.FREQUENCIES );		final Coding pointerCoding = flags.get( Component.POINTERS );		final Coding countCoding = flags.get( Component.COUNTS );		final Coding positionCoding = flags.get( Component.POSITIONS );				if ( countCoding == null && positionCoding != null ) throw new IllegalArgumentException( "Index " + basename + " has positions but no counts (this can't happen)" );				// Load document sizes if forced to do so, or if the pointer/position compression methods make it necessary.		IntList sizes = null;		// TODO: quick patch to avoid loading sizes in case of payloads.		if ( payload == null && ( documentSizes || positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE ) ) {			sizes = DiskBasedIndex.readSizes( new InputBitStream( basename + DiskBasedIndex.SIZES_EXTENSION ), numberOfDocuments );			if ( sizes.size() != numberOfDocuments ) throw new IllegalStateException( "The length of the size list (" + sizes.size() + ") is not equal to the number of documents (" + numberOfDocuments + ")" );		}				// Load offsets if forced to do so. Depending on a property, we use the core-memory or the semi-external version.		final LongList offsets;		// TODO: quick patch to avoid loading sizes in case of payloads.		if ( payload == null && randomAccess ) {			int offsetStep = queryProperties != null && queryProperties.get( UriKeys.OFFSETSTEP ) != null ? Integer.parseInt( queryProperties.get( UriKeys.OFFSETSTEP ) ) : DEFAULT_OFFSET_STEP;						if ( offsetStep < 0 ) { // Memory-mapped				offsetStep  = -offsetStep;				final long length = new File( basename + DiskBasedIndex.OFFSETS_EXTENSION ).length();				offsets = LongLists.synchronize( new SemiExternalOffsetList( 						new InputBitStream( new ByteBufferInputStream( new FileInputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ).getChannel().map( MapMode.READ_ONLY, 0, length ) ) ),						offsetStep, numberOfTerms + 1 ) );			}			else {				offsets = offsetStep == 0? 						DiskBasedIndex.readOffsets( new InputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ), numberOfTerms ) :							LongLists.synchronize( new SemiExternalOffsetList( new InputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION, 1024 ), offsetStep, numberOfTerms + 1 ) );			}			if ( offsets.size() != numberOfTerms + 1 ) throw new IllegalStateException( "The length of the offset list (" + offsets.size() + ") is not equal to the number of terms plus one (" + numberOfTerms + " + 1)" );		}		else offsets = null;						final int quantum = properties.getInt( BitStreamIndex.PropertyKeys.SKIPQUANTUM, -1 );		final int height = properties.getInt( BitStreamIndex.PropertyKeys.SKIPHEIGHT, -1 );		final int bufferSize = properties.getInt( BitStreamIndex.PropertyKeys.BUFFERSIZE, BitStreamIndex.DEFAULT_BUFFER_SIZE );		final TermProcessor termProcessor = Index.getTermProcessor( properties );		final boolean highPerformance = indexClass != null && FileHPIndex.class.isAssignableFrom( indexClass );				if ( queryProperties != null && queryProperties.containsKey( UriKeys.INMEMORY ) ) {			/*if ( SqrtSkipIndex.class.isAssignableFrom( indexClass ) )				return new SqrtSkipInMemoryIndex( BinIO.loadBytes( indexFile.toString() ), 						numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 						frequencyCoding, pointerCoding, countCoding, positionCoding,						termProcessor,						field, properties, termMap, prefixMap, sizes, offsets );*/			return highPerformance			? new InMemoryHPIndex( BinIO.loadBytes( indexFile.toString() ), BinIO.loadBytes( basename + POSITIONS_EXTENSION ), 					numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 					payload, frequencyCoding, pointerCoding, countCoding, positionCoding, quantum, height,					termProcessor,					field, properties, termMap, prefixMap, sizes, offsets )			: new InMemoryIndex( BinIO.loadBytes( indexFile.toString() ), 					numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 					payload, frequencyCoding, pointerCoding, countCoding, positionCoding, quantum, height,					termProcessor,					field, properties, termMap, prefixMap, sizes, offsets );		}		else if ( queryProperties != null && queryProperties.containsKey( UriKeys.MAPPED ) ) {			final File positionsFile = new File( basename + POSITIONS_EXTENSION );			final ByteBuffer index = new FileInputStream( indexFile ).getChannel().map( MapMode.READ_ONLY, 0, indexFile.length() );			return highPerformance 					? new MemoryMappedHPIndex( index, new FileInputStream( positionsFile ).getChannel().map( MapMode.READ_ONLY, 0, positionsFile.length() ),					numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 					payload, frequencyCoding, pointerCoding, countCoding, positionCoding, quantum, height,					termProcessor,					field, properties, termMap, prefixMap, sizes, offsets )					: new MemoryMappedIndex( index,							numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 							payload, frequencyCoding, pointerCoding, countCoding, positionCoding, quantum, height,							termProcessor,							field, properties, termMap, prefixMap, sizes, offsets );					}		/*if ( SqrtSkipIndex.class.isAssignableFrom( indexClass ) )			return new SqrtSkipFileIndex( basename.toString(), 				numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 				frequencyCoding, pointerCoding, countCoding, positionCoding,				termProcessor,				field, properties, termMap, prefixMap, sizes, offsets, indexFile );*/				return highPerformance  				? new FileHPIndex( basename.toString(), 						numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 						payload, frequencyCoding, pointerCoding, countCoding, positionCoding, quantum, height, bufferSize,						termProcessor,						field, properties, termMap, prefixMap, sizes, offsets )				: new FileIndex( basename.toString(), 				numberOfDocuments, numberOfTerms, numberOfPostings, numberOfOccurrences, maxCount, 				payload, frequencyCoding, pointerCoding, countCoding, positionCoding, quantum, height, bufferSize,				termProcessor,				field, properties, termMap, prefixMap, sizes, offsets );		 	}	/** Returns a new disk-based index, using preloaded {@link Properties} and possibly guessing reasonable term and prefix maps from the basename.	 * 	 * @param basename the basename of the index.	 * @param properties the properties obtained by stemming <code>basename</code>.	 * @param randomAccess whether the index should be accessible randomly.	 * @param documentSizes if true, document sizes will be loaded.	 * @param maps if true, {@linkplain StringMap term} and {@linkplain PrefixMap prefix} maps will be guessed and loaded.	 * @param queryProperties a map containing associations between {@link Index.UriKeys} and values, or <code>null</code>.	 * @throws IllegalAccessException 	 * @throws InstantiationException 	 * 	 * @see #getInstance(CharSequence, Properties, StringMap, PrefixMap, boolean, boolean, EnumMap)	 */	public static BitStreamIndex getInstance( final CharSequence basename, final Properties properties, final boolean randomAccess, final boolean documentSizes, final boolean maps, final EnumMap<UriKeys,String> queryProperties ) throws ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		StringMap<? extends CharSequence> termMap = null;		PrefixMap<? extends CharSequence> prefixMap = null;		if ( maps ) {			// TODO: check this logic			termMap = DiskBasedIndex.loadStringMap( basename + DiskBasedIndex.TERMMAP_EXTENSION );			if ( termMap != null && termMap instanceof PrefixMap ) return getInstance( basename, properties, termMap, (PrefixMap<?>)termMap, randomAccess, documentSizes, queryProperties );			prefixMap = DiskBasedIndex.loadPrefixMap( basename + DiskBasedIndex.PREFIXMAP_EXTENSION );			if ( termMap != null ) return getInstance( basename, properties, termMap, prefixMap, randomAccess, documentSizes, queryProperties );			if ( prefixMap != null ) return getInstance( basename, properties, prefixMap, prefixMap, randomAccess, documentSizes, queryProperties );		}		return getInstance( basename, properties, null, prefixMap, randomAccess, documentSizes, queryProperties );	}	/** Returns a new disk-based index, possibly guessing reasonable term and prefix maps from the basename.	 * 	 * <p>If there is a term map file (basename stemmed with <samp>.termmap</samp>), it is used as term map and,	 * in case it implements {@link PrefixMap}. Otherwise, we search for a prefix map (basename stemmed with <samp>.prefixmap</samp>)	 * and, if it implements {@link StringMap} and no term map has been found, we use it as prefix map.	 * 	 * @param basename the basename of the index.	 * @param randomAccess whether the index should be accessible randomly (e.g., if it will	 * be possible to call {@link IndexReader#documents(int)} on the index readers returned by the index).	 * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes	 * might be loaded anyway because the compression method for positions requires it).	 * @param maps if true, {@linkplain StringMap term} and {@linkplain PrefixMap prefix} maps will be guessed and loaded (this	 * feature might not be available with some kind of index). 	 * @param queryProperties a map containing associations between {@link Index.UriKeys} and values, or <code>null</code>.	 */	public static BitStreamIndex getInstance( final CharSequence basename, final boolean randomAccess, final boolean documentSizes, final boolean maps, final EnumMap<UriKeys,String> queryProperties ) throws ConfigurationException, ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		return getInstance( basename, new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION ), randomAccess, documentSizes, maps, queryProperties );	}	/** Returns a new disk-based index, using preloaded {@link Properties} and possibly guessing reasonable term and prefix maps from the basename.	 * 	 * <p>If there is a term map file (basename stemmed with <samp>.termmap</samp>), it is used as term map and,	 * in case it implements {@link PrefixMap}. Otherwise, we search for a prefix map (basename stemmed with <samp>.prefixmap</samp>)	 * and, if it implements {@link StringMap} and no term map has been found, we use it as prefix map.	 * 	 * @param basename the basename of the index.	 * @param randomAccess whether the index should be accessible randomly (e.g., if it will	 * be possible to call {@link IndexReader#documents(int)} on the index readers returned by the index).	 * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes	 * might be loaded anyway because the compression method for positions requires it).	 * @param maps if true, {@linkplain StringMap term} and {@linkplain PrefixMap prefix} maps will be guessed and loaded (this	 * feature might not be available with some kind of index).	 * @see #getInstance(CharSequence, boolean, boolean, boolean, EnumMap) 	 */	public static BitStreamIndex getInstance( final CharSequence basename, final boolean randomAccess, final boolean documentSizes, final boolean maps ) throws ConfigurationException, ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		return getInstance( basename, new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION ), randomAccess, documentSizes, maps, null );	}		/** Returns a new disk-based index, guessing reasonable term and prefix maps from the basename.	 * 	 * @param basename the basename of the index.	 * @param randomAccess whether the index should be accessible randomly (e.g., if it will	 * be possible to call {@link IndexReader#documents(int)} on the index readers returned by the index).	 * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes	 * might be loaded anyway because the compression method for positions requires it).	 */	public static BitStreamIndex getInstance( final CharSequence basename, final boolean randomAccess, final boolean documentSizes ) throws ConfigurationException, ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		return getInstance( basename, randomAccess, documentSizes, true );	}	/** Returns a new local index, trying to guess reasonable term and prefix maps from the basename,	 * and loading document sizes only if it is necessary.	 * 	 * @param basename the basename of the index.	 * @param randomAccess whether the index should be accessible randomly (e.g., if it will	 * be possible to call {@link IndexReader#documents(int)} on the index readers returned by the index).	 */	public static BitStreamIndex getInstance( final CharSequence basename, final boolean randomAccess ) throws ConfigurationException, ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		return getInstance( basename, randomAccess, false );	}	/** Returns a new local index, trying to guess reasonable term and prefix maps from the basename,	 *  loading offsets but loading document sizes only if it is necessary.	 * 	 * @param basename the basename of the index.	 */	public static BitStreamIndex getInstance( final CharSequence basename ) throws ConfigurationException, ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		return getInstance( basename, true );	}}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -