📄 indextest.java
字号:
package test.it.unimi.dsi.mg4j.tool;import static it.unimi.dsi.logging.ProgressLogger.DEFAULT_LOG_INTERVAL;import static it.unimi.dsi.mg4j.index.CompressionFlags.DEFAULT_PAYLOAD_INDEX;import static it.unimi.dsi.mg4j.index.CompressionFlags.DEFAULT_STANDARD_INDEX;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.FREQUENCIES_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.GLOBCOUNTS_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.INDEX_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.OFFSETS_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.PROPERTIES_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.SIZES_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.TERMMAP_EXTENSION;import static it.unimi.dsi.mg4j.index.DiskBasedIndex.TERMS_EXTENSION;import it.unimi.dsi.Util;import it.unimi.dsi.bits.Utf16TransformationStrategy;import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;import it.unimi.dsi.fastutil.ints.Int2ObjectMap;import it.unimi.dsi.fastutil.ints.IntArrayList;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.ObjectRBTreeSet;import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.io.FileLinesCollection.FileLinesIterator;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.mg4j.document.CompositeDocumentSequence;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.document.DocumentIterator;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.DowncaseTermProcessor;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.IndexReader;import it.unimi.dsi.mg4j.index.TermProcessor;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.cluster.LexicalPartitioningStrategy;import it.unimi.dsi.mg4j.index.cluster.LexicalStrategies;import it.unimi.dsi.mg4j.tool.Concatenate;import it.unimi.dsi.mg4j.tool.IndexBuilder;import it.unimi.dsi.mg4j.tool.Merge;import it.unimi.dsi.mg4j.tool.PartitionDocumentally;import it.unimi.dsi.mg4j.tool.PartitionLexically;import it.unimi.dsi.mg4j.tool.Scan;import it.unimi.dsi.mg4j.tool.VirtualDocumentResolver;import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;import it.unimi.dsi.sux4j.mph.MWHCFunction;import it.unimi.dsi.sux4j.util.ShiftAddXorSignedStringMap;import it.unimi.dsi.util.Properties;import it.unimi.dsi.util.StringMap;import java.io.DataInputStream;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.Reader;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Collections;import java.util.Comparator;import java.util.Date;import junit.framework.TestCase;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.io.FileUtils;import org.apache.commons.io.IOUtils;import org.apache.commons.io.filefilter.FileFilterUtils;import org.apache.log4j.Level;import test.it.unimi.dsi.mg4j.document.DateArrayDocumentCollection;import test.it.unimi.dsi.mg4j.document.IntArrayDocumentCollection;import test.it.unimi.dsi.mg4j.document.MapVirtualDocumentCollection;import test.it.unimi.dsi.mg4j.document.StringArrayDocumentCollection;import cern.colt.GenericSorting;import cern.colt.Swapper;import cern.colt.function.IntComparator;public class IndexTest extends TestCase { static { Util.ensureLog4JIsConfigured( Level.INFO ); } private static StringMap<? extends CharSequence> createMap( String basename ) { FileLinesCollection flc = new FileLinesCollection( basename, "UTF-8" ); return new ShiftAddXorSignedStringMap( flc.iterator(), new MWHCFunction<CharSequence>( flc, new Utf16TransformationStrategy() ) ); } private String basename; private final int NUMBER_OF_DOCUMENTS = 100; private final int[] INTEGER_DOCUMENT = new int[ NUMBER_OF_DOCUMENTS ]; private final Date[] DATE_DOCUMENT = new Date[ NUMBER_OF_DOCUMENTS ]; @SuppressWarnings("unchecked") private final Int2ObjectMap<String>[] VIRTUAL_DOCUMENT = new Int2ObjectMap[ NUMBER_OF_DOCUMENTS ]; { for ( int i = INTEGER_DOCUMENT.length; i-- != 0; ) INTEGER_DOCUMENT[ i ] = i; for ( int i = DATE_DOCUMENT.length; i-- != 0; ) DATE_DOCUMENT[ i ] = new Date( i * 86400000L ); for ( int i = VIRTUAL_DOCUMENT.length; i-- != 0; ) { VIRTUAL_DOCUMENT[ i ] = new Int2ObjectArrayMap<String>(); VIRTUAL_DOCUMENT[ i ].put( i - 1, "link to previous document" ); VIRTUAL_DOCUMENT[ i ].put( i, "link to this document" ); VIRTUAL_DOCUMENT[ i ].put( i + 1, "link to next document" ); } } private final VirtualDocumentResolver RESOLVER = new MapVirtualDocumentCollection.TrivialVirtualDocumentResolver( NUMBER_OF_DOCUMENTS ); public final static TermProcessor KILL_A_PROCESSOR = KillATermProcessor.getInstance(); public final static class KillATermProcessor implements TermProcessor { private static final long serialVersionUID = 1L; private static final KillATermProcessor INSTANCE = new KillATermProcessor(); public TermProcessor copy() { return this; } public static TermProcessor getInstance() { return INSTANCE; } public boolean processPrefix( MutableString prefix ) { return true; } public boolean processTerm( MutableString term ) { return term.indexOf( 'a' ) == -1; } }; final static int[] INDEXED_FIELD = { 0, 1, 2 }; /** * Checks that the two provided indices are byte-by-byte the same, and that property files * coincide except for the provided property keys. * * @param basename0 the basename of an index. * @param basename1 the basename of an index. * @param excludedProperty a list of property keys that will not be considered when evaluating * the equality of property fiels. */ private void sameIndex( final String basename0, final String basename1, final String... excludedProperty ) throws IOException, ConfigurationException { // The two indices must be byte-by-byte identical in all components for ( String ext : new String[] { INDEX_EXTENSION, OFFSETS_EXTENSION, TERMS_EXTENSION, SIZES_EXTENSION, FREQUENCIES_EXTENSION, GLOBCOUNTS_EXTENSION } ) { File f0 = new File( basename0 + ext ); File f1 = new File( basename1 + ext ); assertEquals( ext, f0.exists(), f1.exists() ); if ( f0.exists() ) assertTrue( ext, IOUtils.contentEquals( new FileInputStream( f0 ), new FileInputStream( f1 ) ) ); } Properties properties0 = new Properties( basename0 + PROPERTIES_EXTENSION ); Properties properties1 = new Properties( basename1 + PROPERTIES_EXTENSION ); for ( String p : excludedProperty ) { properties0.setProperty( p, null ); properties1.setProperty( p, null ); } assertEquals( properties0, properties1 ); } public void sameContent( CharSequence basename0, CharSequence basename1, FileLinesIterator terms ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { sameContent( it.unimi.dsi.mg4j.index.Index.getInstance( basename0 ), it.unimi.dsi.mg4j.index.Index.getInstance( basename1 ), terms ); } public void sameContent( CharSequence basename0, CharSequence basename1 ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { sameContent( basename0, basename1, null ); } public void sameContent( it.unimi.dsi.mg4j.index.Index index0, it.unimi.dsi.mg4j.index.Index index1 ) throws IOException { sameContent( index0, index1, null ); } public void sameContent( it.unimi.dsi.mg4j.index.Index index0, it.unimi.dsi.mg4j.index.Index index1, final FileLinesIterator terms ) throws IOException { assertEquals( index0.hasCounts, index1.hasCounts ); assertEquals( index0.hasPositions, index1.hasPositions ); assertEquals( index0.hasPayloads, index1.hasPayloads ); assertEquals( index0.numberOfTerms, index1.numberOfTerms ); assertEquals( index0.numberOfDocuments, index1.numberOfDocuments ); final int numTerms = index0.numberOfTerms; int document; int[] p0 = IntArrays.EMPTY_ARRAY, p1 = IntArrays.EMPTY_ARRAY; boolean hasCounts = index0.hasCounts, hasPositions = index0.hasPositions; final IndexReader reader0 = index0.getReader(), reader1 = index1.getReader(); IndexIterator i0, i1; for ( int i = 0; i < numTerms; i++ ) { if ( terms != null ) { final CharSequence term = terms.next(); i0 = reader0.documents( term ); i1 = reader1.documents( term ); } else { i0 = reader0.documents( i ); i1 = reader1.documents( i ); } while ( i0.hasNext() && i1.hasNext() ) { assertEquals( "term " + i, document = i0.nextDocument(), i1.nextDocument() ); if ( hasCounts ) { assertEquals( "term " + i + ", document " + document, i0.count(), i1.count() ); if ( i0.count() > p0.length ) p0 = new int[ i0.count() ]; if ( i1.count() > p1.length ) p1 = new int[ i1.count() ]; if ( hasPositions ) for ( int p = i0.count(); p-- != 0; ) assertEquals( "term " + i + ", document " + document + ", position " + p, p0[ p ], p1[ p ] ); } } assertEquals( "term " + i, i0.hasNext(), i1.hasNext() ); } reader0.close(); reader1.close(); } public int processDocument( WordReader wordReader, int documentIndex, int startPos, Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>> termMap, TermProcessor termProcessor ) throws IOException { assertTrue( documentIndex >= 0 ); Object2ObjectOpenHashMap<MutableString, IntArrayList> terms = new Object2ObjectOpenHashMap<MutableString, IntArrayList>(); MutableString word = new MutableString(), nonWord = new MutableString(); int pos = startPos; while ( wordReader.next( word, nonWord ) ) { if ( word.length() == 0 ) continue; if ( !termProcessor.processTerm( word ) ) { pos++; continue; } IntArrayList positions = terms.get( word ); if ( positions == null ) terms.put( word.copy(), positions = new IntArrayList() ); positions.add( pos++ ); } for ( MutableString term : terms.keySet() ) { ObjectArrayList<int[]> list = termMap.get( term ); IntArrayList positions = terms.get( term ); if ( list == null ) termMap.put( term, list = new ObjectArrayList<int[]>() ); int[] t = new int[ positions.size() + 1 ]; t[ 0 ] = documentIndex; System.arraycopy( positions.elements(), 0, t, 1, positions.size() ); list.add( t ); } return pos; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -