📄 wikipediadocumentcollection.java
字号:
package it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2008 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.bytes.ByteArrays;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.fastutil.io.FastByteArrayInputStream;import it.unimi.dsi.fastutil.longs.LongArrayList;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.io.MultipleInputStream;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;import java.io.BufferedReader;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.net.URLEncoder;import java.util.Arrays;import java.util.Collection;import java.util.zip.GZIPInputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} corresponding to * a given set of files in the Yahoo! Wikipedia format. * * <P>This class provides a main method with a flexible syntax that serialises * into a document collection a list of (possibly gzip'd) files given on the command line or * piped into standard input. The position of each record is stored using * an {@link EliasFanoMonotoneLongBigList} per file. */public class WikipediaDocumentCollection extends AbstractDocumentCollection implements Serializable { private final static Logger LOGGER = Util.getLogger( WikipediaDocumentCollection.class ); private static final long serialVersionUID = 1L; private static final byte[] META_MARKER = "%%#".getBytes(); private static final byte[] DOC_MARKER = "%%#DOC".getBytes(); private static final byte[] PAGE_MARKER = "%%#PAGE".getBytes(); private static final byte[] SENTENCE_MARKER = "%%#SEN".getBytes(); private final static int NUM_FIELDS = 10; private final static String[] FIELD_NAME = { "token", "POS", "lemma", "CONL", "WNSS", "WSJ", "ana", "head", "deplabel", "link" }; /** The files in this collection. */ private final String[] file; /** The files in {@link #file} are gzip'd. */ private boolean gzipped; /** The factory to be used by this collection. */ private final DocumentFactory factory; /** A list of lists of pointers parallel to {@link #file}. Each list contains the * starting pointer of each document (within its file), plus a final pointer at the end of the file. */ private final ObjectArrayList<EliasFanoMonotoneLongBigList> pointers; /** The number of documents in this collection. */ private final int size; /** Whether this index contains phrases (as opposed to documents). */ private final boolean phrase; /** An array parallel to {@link #file} containing the index of the first * document within each file, plus a final entry equal to {@link #size}. */ private final int[] firstDocument; /** Byte array buffers used to reconstruct each field for random access. */ private transient byte[][] buffer; /** Line buffer. */ private transient byte[] lineBuffer; /** An array parallel to {@link #buffer} specifying the number of valid bytes. */ private transient int[] bufferSize; /** The metadata of the last document. */ private transient Reference2ObjectMap<Enum<?>, Object> metadata; /** The last document read, or -1 if no document has been read. */ private transient int lastDocument; private final void initBuffers() { bufferSize = new int[ NUM_FIELDS ]; buffer = new byte[ NUM_FIELDS ][]; lineBuffer = ByteArrays.EMPTY_ARRAY; lastDocument = -1; metadata = new Reference2ObjectArrayMap<Enum<?>, Object>(); for( int i = NUM_FIELDS; i-- != 0; ) buffer[ i ] = ByteArrays.EMPTY_ARRAY; } /** Builds a document collection corresponding to a given set of Wikipedia files specified as an array. * * <p><strong>Beware.</strong> This class is not guaranteed to work if files are * deleted or modified after creation! * * @param file an array containing the files that will be contained in the collection. * @param factory the factory that will be used to create documents. * @param phrase whether phrases should be indexed instead of documents. */ public WikipediaDocumentCollection( final String[] file, final DocumentFactory factory, final boolean phrase ) throws IOException { this( file, factory, phrase, false ); } /** Builds a document collection corresponding to a given set of (possibly gzip'd) Wikipedia files specified as an array. * * <p><strong>Beware.</strong> This class is not guaranteed to work if files are * deleted or modified after creation! * * @param file an array containing the files that will be contained in the collection. * @param factory the factory that will be used to create documents. * @param phrase whether phrases should be indexed instead of documents. * @param gzipped the files in <code>file</code> are gzip'd. */ public WikipediaDocumentCollection( final String[] file, final DocumentFactory factory, final boolean phrase, final boolean gzipped ) throws IOException { this.file = file; this.factory = factory; this.gzipped = gzipped; this.phrase = phrase; initBuffers(); LongArrayList p = new LongArrayList(); pointers = new ObjectArrayList<EliasFanoMonotoneLongBigList>( file.length ); firstDocument = new int[ file.length + 1 ]; int count = 0; final ProgressLogger pl = new ProgressLogger( LOGGER ); pl.expectedUpdates = file.length; pl.itemsName = "files"; pl.start( "Scanning files..." ); // Scan files and retrieve page pointers for( String f : file ) { p.clear(); final FastBufferedInputStream fbis = gzipped ? new FastBufferedInputStream( new GZIPInputStream( new FileInputStream( f ) ) ) : new FastBufferedInputStream( new FileInputStream( f ) ); long position; for(;;) { position = fbis.position(); if ( readLine( fbis ) == -1 ) break; if ( startsWith( lineBuffer, DOC_MARKER ) ) p.add( position ); if ( phrase && startsWith( lineBuffer, SENTENCE_MARKER ) ) p.add( position ); } count += p.size(); p.add( fbis.position() ); fbis.close(); pointers.add( new EliasFanoMonotoneLongBigList( p ) ); firstDocument[ pointers.size() ] = count; pl.update(); } pl.done(); size = count; } private final int readLine( final FastBufferedInputStream fbis ) throws IOException { int start = 0, len; while( ( len = fbis.readLine( lineBuffer, start, lineBuffer.length - start, FastBufferedInputStream.ALL_TERMINATORS ) ) == lineBuffer.length - start ) { start += len; lineBuffer = ByteArrays.grow( lineBuffer, lineBuffer.length + 1 ); } if ( len != -1 ) start += len;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -