📄 documentcollectiontest.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package test.it.unimi.dsi.mg4j.document;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentCollection;import it.unimi.dsi.mg4j.document.DocumentIterator;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.FileSetDocumentCollection;import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;import it.unimi.dsi.mg4j.document.ZipDocumentCollection;import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;import it.unimi.dsi.util.Properties;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStreamWriter;import java.io.Reader;import java.io.Writer;import java.util.StringTokenizer;import junit.framework.TestCase;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.io.FileUtils;import cern.colt.GenericSorting;import cern.colt.Swapper;import cern.colt.function.IntComparator;public class DocumentCollectionTest extends TestCase {	/** We consider documents abstractly described by two fields each. */	private final static String[][] document = new String[][] {			//              0   1   2   3      0   1   2   3   4   5   6   7   8   9   10  11  12  13  14			new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },		/*	new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },			new String[] { "aaa uuu aaa"    , "aaa uuu aaa xxx xxx xxx aaa xxx" },			// This tests that zipped collections handle properly initial spaces			new String[] { " aaa uuu aaa"    , " aaa uuu aaa xxx xxx xxx aaa xxx" },*/	};	private final static Properties DEFAULT_PROPERTIES = new Properties();	static {		DEFAULT_PROPERTIES.setProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "ASCII" );	}		/** The number of documents. */	private final static int ndoc = document.length;	/** The temporary directory where all tests are run. */	private File tempDir;	/** The set of files in the HTML directory. */	private String[] htmlFileSet;		/** Given a two-field document, produce an HTML document with the first field as title and	 *  the second field as body.	 *  	 *  @param document the document.	 *  @return the HTML version of the document.	 */	private String getHTMLDocument( String[] document ) {		MutableString res = new MutableString();		res.append( "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n" );		res.append( "<HTML>\n<HEAD>\n<TITLE>" + document[ 0 ] + "</TITLE>\n" );		// Do NOT append the first part of the body		res.append( "<BODY>\n" + document[ 1 ].substring( document[ 0 ].length() ) );		res.append( "\n</BODY>\n" );		res.append( "</HTML>" );		return res.toString();	}		/** Given a two-field document, produce a mbox document with the first field as subject and	 *  the second field as body.	 *  	 *  @param document the document.	 *  @return the HTML version of the document.	 */	private String getMboxDocument( String[] document ) {		MutableString res = new MutableString();		res.append( "From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n" );		res.append( "Date: 15 Apr 2005 16:22:32 +0200\n" );		res.append( "From: Mail System Internal Data <MAILER-DAEMON@sliver.usr.dsi.unimi.it>\n" );		res.append( "Subject: " + document[ 0 ] + "\n" );		res.append( "Message-ID: <1113574952@sliver.usr.dsi.unimi.it>\n" );		res.append( "X-IMAP: 1102967122 0000138458\n" );		res.append( "Return-Path: <matteo.xxx@unimi.it>\n" );		res.append( "Received: from localhost (localhost.localdomain [127.0.0.1])\n" );        res.append( "\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n");        res.append( "\tfor <vigna@localhost>; Wed, 1 Dec 2004 00:55:36 +0100\n" );        res.append( "Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n" );        res.append( "\tby localhost with IMAP (fetchmail-6.2.5)\n" );        res.append( "\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n" );        res.append( "To: vigna@dsi.unimi.it\n" );        res.append( "Message-id: <Pine.WNT.4.33.0412010051240.-209505@p233-mmx>\n" );        res.append( "Content-type: TEXT/PLAIN; charset=iso-8859-15\n" );        res.append( "X-Warning: UNAuthenticated Sender\n" );        res.append( "Content-Transfer-Encoding: 8bit\n" );        res.append( "Content-Length: " + document[ 1 ].length() + "\n" );		res.append( "\n" );		res.append( document[ 1 ] + "\n" );		return res.toString();	}		/** Checks that the tokenizer and the word reader return exactly the same sequence of words. 	 * 	 * @param wordReader the word reader.	 * @param tok the tokenizer.	 * @throws IOException	 */	private void checkSameWords( WordReader wordReader, StringTokenizer tok ) throws IOException {		MutableString word = new MutableString();		MutableString nonWord = new MutableString();		boolean aWordInDocum, aWordInDocument;		boolean firstTime = true;		for (;;) {			aWordInDocum = wordReader.next( word, nonWord );			if ( firstTime ) {				firstTime = false;				if ( word.equals( "" ) ) continue;			}			assertFalse( aWordInDocum && word.equals( "" ) );			aWordInDocument = tok.hasMoreElements();			assertEquals( aWordInDocum, aWordInDocument );			if ( !aWordInDocum ) break;			assertEquals( tok.nextElement(), word.toString() );		}	}		/** Checks that the documents in the collection have the same sequence of words as in	 *  document: the names of the fields to be checked are specified in the array.	 *  	 * @param coll the collection.	 * @param fieldName the field names.	 * @param document documents to be checked against.	 * @throws IOException	 */	private void checkAllDocuments( final DocumentCollection coll, final String[] fieldName, final String[][] document ) throws IOException {		final int nfields = fieldName.length;		final int[] fieldNumber = new int[ nfields ];		final int[] arrayIndex = new int[ nfields ];		// Look for field indices		for ( int i = 0; i < nfields; i++ ) {			arrayIndex[ i ] = i;			int j;			for ( j = 0; j < coll.factory().numberOfFields(); j++ )				if ( coll.factory().fieldName( j ).equals( fieldName[ i ] ) ) {					fieldNumber[ i ] = j;					break;				}			assert j < coll.factory().numberOfFields();		}		// Sort fields to guarantee that they are correctly numbered		GenericSorting.quickSort( 0, nfields, new IntComparator() {			public int compare( int x, int y ) {				return fieldNumber[ x ] - fieldNumber[ y ];			}}, new Swapper() {				public void swap( int x, int y ) {					int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t;					t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t;					String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q;				}} );		// Start checking
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -