📄 documentcollectiontest.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
		for ( int doc = 0; doc < coll.size(); doc++ ) {			Document docum = coll.document( doc );			for ( int i = 0; i < nfields; i++ ) {				int field = fieldNumber[ i ];				Reader content = (Reader)docum.content( field );				WordReader wordReader = docum.wordReader( field );				wordReader.setReader( content );				StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );				System.err.println( "Checking document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );				checkSameWords( wordReader, tok );			}			docum.close();		}	}	/** Checks that the documents in the sequence have the same sequence of words as in	 *  <code>document</code>: the names of the fields to be checked are specified in the array.	 *  	 * @param seq the sequence.	 * @param fieldName the field names.	 * @param document documents to be checked against.	 * @throws IOException	 */	private void checkAllDocumentsSeq( final DocumentSequence seq, final String[] fieldName, final String[][] document ) throws IOException {		final int nfields = fieldName.length;		final int[] fieldNumber = new int[ nfields ];		final int[] arrayIndex = new int[ nfields ];		// Look for field indices		for ( int i = 0; i < nfields; i++ ) {			arrayIndex[ i ] = i;			int j;			for ( j = 0; j < seq.factory().numberOfFields(); j++ )				if ( seq.factory().fieldName( j ).equals( fieldName[ i ] ) ) {					fieldNumber[ i ] = j;					break;				}			assert j < seq.factory().numberOfFields();		}		// Sort fields to guarantee that they are correctly numbered		GenericSorting.quickSort( 0, nfields, new IntComparator() {			public int compare( int x, int y ) {				return fieldNumber[ x ] - fieldNumber[ y ];			}}, new Swapper() {				public void swap( int x, int y ) {					int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t;					t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t;					String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q;				}} );		// Start checking		DocumentIterator iterator = seq.iterator();		Document docum;		int doc = 0;		while ( ( docum = iterator.nextDocument() ) != null ) {			for ( int i = 0; i < nfields; i++ ) {				int field = fieldNumber[ i ];				Reader content = (Reader)docum.content( field );				WordReader wordReader = docum.wordReader( field );				wordReader.setReader( content );				StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );				System.err.println( "Checking sequentially document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );				checkSameWords( wordReader, tok );			}			docum.close();			doc++;		}		iterator.close();	}	protected void setUp() throws IOException, ClassNotFoundException, ConfigurationException {		// Create a new directory under /tmp		tempDir = File.createTempFile( "mg4jtest", null );		tempDir.delete();		tempDir.mkdir();		// Now create the hierarchy for HTML files		File htmlDir = new File( tempDir, "html" );		htmlDir.mkdir();		System.err.println( "Temporary directory: " + tempDir );		htmlFileSet = new String[ ndoc ];		for ( int i = 0; i < ndoc; i++ ) {			String docFile = new File( htmlDir, "doc" + i + ".html" ).toString();			htmlFileSet[ i ] = docFile;			Writer docWriter = new OutputStreamWriter( new FileOutputStream( docFile ), "ISO-8859-1" );			docWriter.write( getHTMLDocument( document[ i ] ) );			docWriter.close();		}		// Now create the mbox file		Writer mboxWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "mbox" ) ), "ISO-8859-1" );		for ( int i = 0; i < ndoc; i++ ) 			mboxWriter.write( getMboxDocument( document[ i ] ) );		mboxWriter.close();		// Now create the zip collections		FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );		ZipDocumentCollectionBuilder collBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "zip" ).toString(), 				fileSetDocumentCollection.factory(), true,				new ProgressLogger() );		ZipDocumentCollection zipDocumentCollection = collBuilder.build( fileSetDocumentCollection );		BinIO.storeObject( zipDocumentCollection, new File( tempDir, "zip.collection" ).toString() );		zipDocumentCollection.close();				ZipDocumentCollectionBuilder apprCollBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "azip" ).toString(), 				fileSetDocumentCollection.factory(), false,				new ProgressLogger() );		zipDocumentCollection = apprCollBuilder.build( fileSetDocumentCollection );		BinIO.storeObject( zipDocumentCollection, new File( tempDir, "azip.collection" ).toString() );		zipDocumentCollection.close();		fileSetDocumentCollection.close();	}	public void testFileSetDocumentCollection() throws IOException, ConfigurationException {		System.err.println( "Checking fileset collection" );		FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );		assertEquals( coll.size(), ndoc );		checkAllDocuments( coll, new String[] { "title", "text" }, document );		coll.close();	}	public void testFileSetDocumentCollectionSeq() throws IOException, ConfigurationException {		System.err.println( "Checking fileset collection sequentially" );		FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );		coll.close();	}	/*	public void testMboxDocumentCollection() throws IOException, ConfigurationException, MessagingException {		System.err.println( "Checking mbox collection" );		JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );		checkAllDocuments( coll, new String[] { "subject", "body" }, document );		coll.close();	}	public void testMboxDocumentCollectionSeq() throws IOException, ConfigurationException, MessagingException {		System.err.println( "Checking mbox collection sequentially" );		JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );		checkAllDocumentsSeq( coll, new String[] { "subject", "body" }, document );		coll.close();	}*/	public void testZipDocumentCollection() throws IOException, ClassNotFoundException {		System.err.println( "Checking zipped collection" );		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() );		checkAllDocuments( coll, new String[] { "title", "text" }, document );		coll.close();	}	public void testZipDocumentCollectionSeq() throws IOException, ClassNotFoundException {		System.err.println( "Checking zipped collection sequentially" );		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() );		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );		coll.close();	}	public void testZipDocumentCollectionAppr() throws IOException, ClassNotFoundException {		System.err.println( "Checking approximated zipped collection" );		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() );		checkAllDocuments( coll, new String[] { "title", "text" }, document );		coll.close();	}	public void testZipDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {		System.err.println( "Checking approximated zipped collection sequentially" );		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() );		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );		coll.close();	}	public void testInputStreamSequence() throws IOException, ConfigurationException {		System.err.println( "Checking input stream (text field only)" );		// Extract only field number 1, and write it out with separator '\u0000'		MutableString res = new MutableString();		String[][] justSecondField = new String[ ndoc ][ 1 ];		for ( int i = 0; i < ndoc; i++ ) {			res.append( document[ i ][ 1 ] + "\u0000" );			justSecondField[ i ][ 0 ] = document[ i ][ 1 ];		}		String resString = res.toString();		// Write the sequence on a file (in UTF-8)		Writer resWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "stream" ) ), "UTF-8" );		resWriter.write( resString );		resWriter.close();		// Read it as a input stream document sequence		InputStream is = new FileInputStream( new File( tempDir, "stream" ) );		DocumentSequence seq = new InputStreamDocumentSequence( is, '\u0000', new IdentityDocumentFactory( DEFAULT_PROPERTIES ) );		checkAllDocumentsSeq( seq, new String[] { "text" }, justSecondField );		seq.close();	}		protected void tearDown() throws IOException {		FileUtils.forceDeleteOnExit( tempDir );	}}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -