⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trecdocumentcollectiontest.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package test.it.unimi.dsi.mg4j.document;import it.unimi.dsi.mg4j.document.CompositeDocumentFactory;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.document.DocumentIterator;import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;import it.unimi.dsi.mg4j.document.TRECDocumentCollection;import it.unimi.dsi.mg4j.document.TRECHeaderDocumentFactory;import java.io.File;import java.io.FileOutputStream;import java.io.OutputStream;import java.io.Reader;import junit.framework.TestCase;import org.apache.commons.io.IOUtils;public class TRECDocumentCollectionTest extends TestCase {	public void testChar255() throws Exception {		File temp = File.createTempFile(TRECDocumentCollectionTest.class				.getName(), ".testChar255");		temp.deleteOnExit();		OutputStream outputStream = new FileOutputStream(temp);		IOUtils.copy(this.getClass().getResourceAsStream("testChar255.data"),				outputStream);		outputStream.close();		TRECDocumentCollection collection = new TRECDocumentCollection(				new String[] { temp.toString() },				CompositeDocumentFactory						.getFactory(new DocumentFactory[] {								new TRECHeaderDocumentFactory(),								new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),				4, // Very small, to induce fragmentation				false);		try {			DocumentIterator iter = collection.iterator();			Document d;			while ((d = iter.nextDocument()) != null)				d.title();		} catch (IllegalStateException e) {			assertTrue(false);		}	}	public void testContents() throws Exception {		File temp = File.createTempFile( TRECDocumentCollectionTest.class.getName(), ".testContents" );		File tempAgain = File.createTempFile( TRECDocumentCollectionTest.class.getName(), ".testContentsAgain" );		temp.deleteOnExit();		tempAgain.deleteOnExit();		OutputStream outputStream = new FileOutputStream( temp );		OutputStream outputStreamAgain = new FileOutputStream( tempAgain );		IOUtils.copy( this.getClass().getResourceAsStream( "testContents.data" ), outputStream );		outputStream.close();		IOUtils.copy( this.getClass().getResourceAsStream( "testContentsAgain.data" ), outputStreamAgain );		outputStreamAgain.close();		TRECDocumentCollection collection = new TRECDocumentCollection(				new String[] { temp.toString(), tempAgain.toString() },				CompositeDocumentFactory						.getFactory(new DocumentFactory[] {								new TRECHeaderDocumentFactory(),								new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),				4, // Very small, to induce fragmentation				false);		DocumentIterator iter = collection.iterator();		Document d = null;		d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0001/", d.uri());		assertEquals("GX001", d.title());		final int textIndex = collection.factory().fieldIndex( "text" );				assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );				d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0002/", d.uri());		assertEquals("GX002", d.title());		assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0003/", d.uri());		assertEquals("GX003", d.title());		assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0004/", d.uri());		assertEquals("GX004", d.title());		assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );				d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0005/", d.uri());		assertEquals("GX005", d.title());		assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0006/", d.uri());		assertEquals("GX006", d.title());		assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d = iter.nextDocument();		assertNotNull(d);		assertEquals("http://gx0007/", d.uri());		assertEquals("GX007", d.title());		assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d = iter.nextDocument();		assertNull(d);		iter.close();				d = collection.document( 0 );		assertNotNull(d);		assertEquals("http://gx0001/", d.uri());		assertEquals("GX001", d.title());		assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();				d = collection.document( 1 );		assertNotNull(d);		assertEquals("http://gx0002/", d.uri());		assertEquals("GX002", d.title());		assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();		d = collection.document( 2 );		assertNotNull(d);		assertEquals("http://gx0003/", d.uri());		assertEquals("GX003", d.title());		assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();		d = collection.document( 3 );		assertNotNull(d);		assertEquals("http://gx0004/", d.uri());		assertEquals("GX004", d.title());		assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();				d = collection.document( 4 );		assertNotNull(d);		assertEquals("http://gx0005/", d.uri());		assertEquals("GX005", d.title());		assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();		d = collection.document( 5 );		assertNotNull(d);		assertEquals("http://gx0006/", d.uri());		assertEquals("GX006", d.title());		assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();		d = collection.document( 6 );		assertNotNull(d);		assertEquals("http://gx0007/", d.uri());		assertEquals("GX007", d.title());		assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );		d.close();	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -