📄 wikipediadocumentcollection.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.document;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2008 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.bytes.ByteArrays;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.fastutil.io.FastByteArrayInputStream;import it.unimi.dsi.fastutil.longs.LongArrayList;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.io.MultipleInputStream;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;import java.io.BufferedReader;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.net.URLEncoder;import java.util.Arrays;import java.util.Collection;import java.util.zip.GZIPInputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} corresponding to *  a given set of files in the Yahoo! Wikipedia format. *  * <P>This class provides a main method with a flexible syntax that serialises * into a document collection a list of (possibly gzip'd) files given on the command line or * piped into standard input. The position of each record is stored using * an {@link EliasFanoMonotoneLongBigList} per file. */public class WikipediaDocumentCollection extends AbstractDocumentCollection implements Serializable {	private final static Logger LOGGER = Util.getLogger( WikipediaDocumentCollection.class );		private static final long serialVersionUID = 1L;		private static final byte[] META_MARKER = "%%#".getBytes();	private static final byte[] DOC_MARKER = "%%#DOC".getBytes();	private static final byte[] PAGE_MARKER = "%%#PAGE".getBytes();	private static final byte[] SENTENCE_MARKER = "%%#SEN".getBytes();		private final static int NUM_FIELDS = 10;	private final static String[] FIELD_NAME = { "token", "POS", "lemma", "CONL", "WNSS", "WSJ", "ana", "head", "deplabel", "link" }; 	/** The files in this collection. */	private final String[] file;	/** The files in {@link #file} are gzip'd. */	private boolean gzipped;	/** The factory to be used by this collection. */	private final DocumentFactory factory;	/** A list of lists of pointers parallel to {@link #file}. Each list contains the	 * starting pointer of each document (within its file), plus a final pointer at the end of the file. */	private final ObjectArrayList<EliasFanoMonotoneLongBigList> pointers;	/** The number of documents in this collection. */	private final int size;	/** Whether this index contains phrases (as opposed to documents). */	private final boolean phrase;	/** An array parallel to {@link #file} containing the index of the first	 * document within each file, plus a final entry equal to {@link #size}. */	private final int[] firstDocument;	/** Byte array buffers used to reconstruct each field for random access. */	private transient byte[][] buffer;	/** Line buffer. */	private transient byte[] lineBuffer;	/** An array parallel to {@link #buffer} specifying the number of valid bytes. */	private transient int[] bufferSize;	/** The metadata of the last document. */	private transient Reference2ObjectMap<Enum<?>, Object> metadata;	/** The last document read, or -1 if no document has been read. */	private transient int lastDocument;		private final void initBuffers() {		bufferSize = new int[ NUM_FIELDS ];		buffer = new byte[ NUM_FIELDS ][];		lineBuffer = ByteArrays.EMPTY_ARRAY;		lastDocument = -1;		metadata = new Reference2ObjectArrayMap<Enum<?>, Object>();		for( int i = NUM_FIELDS; i-- != 0; ) buffer[ i ] = ByteArrays.EMPTY_ARRAY;	}		/** Builds a document collection corresponding to a given set of Wikipedia files specified as an array.	 * 	 *  <p><strong>Beware.</strong> This class is not guaranteed to work if files are	 *  deleted or modified after creation!	 * 	 * @param file an array containing the files that will be contained in the collection.	 * @param factory the factory that will be used to create documents.	 * @param phrase whether phrases should be indexed instead of documents.	 */	public WikipediaDocumentCollection( final String[] file, final DocumentFactory factory, final boolean phrase ) throws IOException {		this( file, factory, phrase, false );	}		/** Builds a document collection corresponding to a given set of (possibly gzip'd) Wikipedia files specified as an array.	 * 	 *  <p><strong>Beware.</strong> This class is not guaranteed to work if files are	 *  deleted or modified after creation!	 * 	 * @param file an array containing the files that will be contained in the collection.	 * @param factory the factory that will be used to create documents.	 * @param phrase whether phrases should be indexed instead of documents.	 * @param gzipped the files in <code>file</code> are gzip'd.	 */	public WikipediaDocumentCollection( final String[] file, final DocumentFactory factory, final boolean phrase, final boolean gzipped ) throws IOException {		this.file = file;		this.factory = factory;		this.gzipped = gzipped;		this.phrase = phrase;				initBuffers();		LongArrayList p = new LongArrayList();		pointers = new ObjectArrayList<EliasFanoMonotoneLongBigList>( file.length );		firstDocument = new int[ file.length + 1 ];		int count = 0;				final ProgressLogger pl = new ProgressLogger( LOGGER );		pl.expectedUpdates = file.length;		pl.itemsName = "files";		pl.start( "Scanning files..." );				// Scan files and retrieve page pointers		for( String f : file ) {			p.clear();			final FastBufferedInputStream fbis = gzipped ? new FastBufferedInputStream( new GZIPInputStream( new FileInputStream( f ) ) ) : new FastBufferedInputStream( new FileInputStream( f ) );			long position;			for(;;) {				position = fbis.position();				if ( readLine( fbis ) == -1 ) break;				if ( startsWith( lineBuffer, DOC_MARKER ) ) p.add( position );				if ( phrase && startsWith( lineBuffer, SENTENCE_MARKER ) ) p.add( position );			}						count += p.size();			p.add( fbis.position() );			fbis.close();						pointers.add( new EliasFanoMonotoneLongBigList( p ) );			firstDocument[ pointers.size() ] = count;						pl.update();		}				pl.done();				size = count;	}	private final int readLine( final FastBufferedInputStream fbis ) throws IOException {		int start = 0, len;		while( ( len = fbis.readLine( lineBuffer, start, lineBuffer.length - start, FastBufferedInputStream.ALL_TERMINATORS ) ) == lineBuffer.length - start ) {			start += len;			lineBuffer = ByteArrays.grow( lineBuffer, lineBuffer.length + 1 );		}				if ( len != -1 ) start += len;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -