📄 concatenate.java
字号:
package it.unimi.dsi.mg4j.tool;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.io.OutputBitStream;import java.io.Closeable;import java.io.IOException;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import com.martiansoftware.jsap.JSAPException;/** Concatenates several indices. * * <p>This implementation of {@link it.unimi.dsi.mg4j.tool.Combine} concatenates * the involved indices: document 0 of the first index is document 0 of the * final collection, but document 0 of the second index is numbered after * the number of documents in the first index, and so on. The resulting * index is exactly what you would obtain by concatenating the document * sequences at the origin of each index. * * <p>Note that this class can be used also with a single index, making it possible to recompress easily * an index using different compression flags. * * @author Sebastiano Vigna * @since 1.0 * */final public class Concatenate extends Combine { public Concatenate( final String outputBasename, final String[] inputBasename, final boolean metadataOnly, final int bufferSize, final Map<Component,Coding> writerFlags, final boolean interleaved, final boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { super( outputBasename, inputBasename, metadataOnly, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ); } protected int combineNumberOfDocuments() { int n = 0; for( int i = 0; i < numIndices; i++ ) n += index[ i ].numberOfDocuments; return n; } protected int combineSizes() throws IOException { int currDoc = 0, maxDocSize = 0; for( int i = 0; i < numIndices; i++ ) { final IntIterator sizes = sizes( i ); int s = 0; int j = index[ i ].numberOfDocuments; while( j-- != 0 ) { s = ( size[ currDoc++ ] += sizes.nextInt() ); if ( s > maxDocSize ) maxDocSize = s; } if ( sizes instanceof Closeable ) ((Closeable)sizes).close(); } return maxDocSize; } protected int combine( final int numUsedIndices ) throws IOException { int currIndex, numPrevDocs = 0, currDoc, count, totalFrequency; OutputBitStream obs; Index i; IndexIterator ii; // We gather the frequencies from the subindices and just add up. totalFrequency = 0; for( int k = numUsedIndices; k-- != 0; ) totalFrequency += ( frequency[ usedIndex[ k ] ] = indexIterator[ usedIndex[ k ] ].frequency() ); indexWriter.newInvertedList(); indexWriter.writeFrequency( totalFrequency ); for( int k = currIndex = 0; k < numUsedIndices; k++ ) { // We can just concatenated posting lists. // We must update the number of previously seen documents, possibly adding those in skipped indices. while( currIndex < usedIndex[ k ] ) numPrevDocs += index[ currIndex++ ].numberOfDocuments; i = index[ currIndex ]; ii = indexIterator[ currIndex ]; for( int j = frequency[ currIndex ]; j-- != 0; ) { obs = indexWriter.newDocumentRecord(); currDoc = ii.nextDocument() + numPrevDocs; indexWriter.writeDocumentPointer( obs, currDoc ); if ( i.hasPayloads ) indexWriter.writePayload( obs, ii.payload() ); if ( i.hasCounts ) { count = ii.count(); if ( hasCounts ) indexWriter.writePositionCount( obs, count ); if ( i.hasPositions && hasPositions ) indexWriter.writeDocumentPositions( obs, ii.positionArray(), 0, count, size != null ? size[ currDoc ] : -1 ); } } } return totalFrequency; } public static void main( String arg[] ) throws ConfigurationException, SecurityException, JSAPException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { Combine.main( arg, Concatenate.class ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -