📄 documentcollectiontest.java
字号:
package test.it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentCollection;import it.unimi.dsi.mg4j.document.DocumentIterator;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.FileSetDocumentCollection;import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;import it.unimi.dsi.mg4j.document.ZipDocumentCollection;import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;import it.unimi.dsi.util.Properties;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStreamWriter;import java.io.Reader;import java.io.Writer;import java.util.StringTokenizer;import junit.framework.TestCase;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.io.FileUtils;import cern.colt.GenericSorting;import cern.colt.Swapper;import cern.colt.function.IntComparator;public class DocumentCollectionTest extends TestCase { /** We consider documents abstractly described by two fields each. */ private final static String[][] document = new String[][] { // 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" }, /* new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" }, new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" }, // This tests that zipped collections handle properly initial spaces new String[] { " aaa uuu aaa" , " aaa uuu aaa xxx xxx xxx aaa xxx" },*/ }; private final static Properties DEFAULT_PROPERTIES = new Properties(); static { DEFAULT_PROPERTIES.setProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "ASCII" ); } /** The number of documents. */ private final static int ndoc = document.length; /** The temporary directory where all tests are run. */ private File tempDir; /** The set of files in the HTML directory. */ private String[] htmlFileSet; /** Given a two-field document, produce an HTML document with the first field as title and * the second field as body. * * @param document the document. * @return the HTML version of the document. */ private String getHTMLDocument( String[] document ) { MutableString res = new MutableString(); res.append( "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n" ); res.append( "<HTML>\n<HEAD>\n<TITLE>" + document[ 0 ] + "</TITLE>\n" ); // Do NOT append the first part of the body res.append( "<BODY>\n" + document[ 1 ].substring( document[ 0 ].length() ) ); res.append( "\n</BODY>\n" ); res.append( "</HTML>" ); return res.toString(); } /** Given a two-field document, produce a mbox document with the first field as subject and * the second field as body. * * @param document the document. * @return the HTML version of the document. */ private String getMboxDocument( String[] document ) { MutableString res = new MutableString(); res.append( "From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n" ); res.append( "Date: 15 Apr 2005 16:22:32 +0200\n" ); res.append( "From: Mail System Internal Data <MAILER-DAEMON@sliver.usr.dsi.unimi.it>\n" ); res.append( "Subject: " + document[ 0 ] + "\n" ); res.append( "Message-ID: <1113574952@sliver.usr.dsi.unimi.it>\n" ); res.append( "X-IMAP: 1102967122 0000138458\n" ); res.append( "Return-Path: <matteo.xxx@unimi.it>\n" ); res.append( "Received: from localhost (localhost.localdomain [127.0.0.1])\n" ); res.append( "\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n"); res.append( "\tfor <vigna@localhost>; Wed, 1 Dec 2004 00:55:36 +0100\n" ); res.append( "Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n" ); res.append( "\tby localhost with IMAP (fetchmail-6.2.5)\n" ); res.append( "\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n" ); res.append( "To: vigna@dsi.unimi.it\n" ); res.append( "Message-id: <Pine.WNT.4.33.0412010051240.-209505@p233-mmx>\n" ); res.append( "Content-type: TEXT/PLAIN; charset=iso-8859-15\n" ); res.append( "X-Warning: UNAuthenticated Sender\n" ); res.append( "Content-Transfer-Encoding: 8bit\n" ); res.append( "Content-Length: " + document[ 1 ].length() + "\n" ); res.append( "\n" ); res.append( document[ 1 ] + "\n" ); return res.toString(); } /** Checks that the tokenizer and the word reader return exactly the same sequence of words. * * @param wordReader the word reader. * @param tok the tokenizer. * @throws IOException */ private void checkSameWords( WordReader wordReader, StringTokenizer tok ) throws IOException { MutableString word = new MutableString(); MutableString nonWord = new MutableString(); boolean aWordInDocum, aWordInDocument; boolean firstTime = true; for (;;) { aWordInDocum = wordReader.next( word, nonWord ); if ( firstTime ) { firstTime = false; if ( word.equals( "" ) ) continue; } assertFalse( aWordInDocum && word.equals( "" ) ); aWordInDocument = tok.hasMoreElements(); assertEquals( aWordInDocum, aWordInDocument ); if ( !aWordInDocum ) break; assertEquals( tok.nextElement(), word.toString() ); } } /** Checks that the documents in the collection have the same sequence of words as in * document: the names of the fields to be checked are specified in the array. * * @param coll the collection. * @param fieldName the field names. * @param document documents to be checked against. * @throws IOException */ private void checkAllDocuments( final DocumentCollection coll, final String[] fieldName, final String[][] document ) throws IOException { final int nfields = fieldName.length; final int[] fieldNumber = new int[ nfields ]; final int[] arrayIndex = new int[ nfields ]; // Look for field indices for ( int i = 0; i < nfields; i++ ) { arrayIndex[ i ] = i; int j; for ( j = 0; j < coll.factory().numberOfFields(); j++ ) if ( coll.factory().fieldName( j ).equals( fieldName[ i ] ) ) { fieldNumber[ i ] = j; break; } assert j < coll.factory().numberOfFields(); } // Sort fields to guarantee that they are correctly numbered GenericSorting.quickSort( 0, nfields, new IntComparator() { public int compare( int x, int y ) { return fieldNumber[ x ] - fieldNumber[ y ]; }}, new Swapper() { public void swap( int x, int y ) { int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t; t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t; String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q; }} ); // Start checking
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -