📄 trecdocumentcollectiontest.java
字号:
package test.it.unimi.dsi.mg4j.document;

import it.unimi.dsi.mg4j.document.CompositeDocumentFactory;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.mg4j.document.DocumentFactory;
import it.unimi.dsi.mg4j.document.DocumentIterator;
import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;
import it.unimi.dsi.mg4j.document.TRECDocumentCollection;
import it.unimi.dsi.mg4j.document.TRECHeaderDocumentFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;

import junit.framework.TestCase;

import org.apache.commons.io.IOUtils;

/**
 * Tests for {@link TRECDocumentCollection}.
 *
 * <p>Each test copies one or more classpath resources (resolved relative to this
 * class) into temporary files, builds a collection over those files and checks
 * what the collection returns, both through sequential iteration and through
 * random access by document index.
 */
public class TRECDocumentCollectionTest extends TestCase {

	/**
	 * Expected documents for {@link #testContents()}, in collection order.
	 * Each row is <code>{ uri, title, content of the "text" field }</code>.
	 */
	private static final String[][] EXPECTED = {
		{ "http://gx0001/", "GX001", "Line 1\n The line 2!\n Mamma\n" },
		{ "http://gx0002/", "GX002", "Contents of this file reside on one line only\n" },
		{ "http://gx0003/", "GX003", "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n" },
		{ "http://gx0004/", "GX004", "New content 0\n" },
		{ "http://gx0005/", "GX005", "New content 1\n" },
		{ "http://gx0006/", "GX006", "New content 2\n" },
		{ "http://gx0007/", "GX007", "" },
	};

	/**
	 * Copies a classpath resource into a fresh temporary file that is deleted on JVM exit.
	 *
	 * <p>Both streams are closed even if the copy fails, and a missing resource is
	 * reported as a test failure naming the resource instead of a
	 * <code>NullPointerException</code> raised from inside the copy.
	 *
	 * @param resourceName name of the resource, resolved relative to this class.
	 * @param suffix suffix of the temporary file name.
	 * @return the temporary file holding a copy of the resource.
	 * @throws IOException if the temporary file cannot be created or written.
	 */
	private File copyResourceToTempFile( final String resourceName, final String suffix ) throws IOException {
		final File temp = File.createTempFile( TRECDocumentCollectionTest.class.getName(), suffix );
		temp.deleteOnExit();

		final InputStream in = this.getClass().getResourceAsStream( resourceName );
		assertNotNull( "Test resource not found: " + resourceName, in );
		try {
			final OutputStream out = new FileOutputStream( temp );
			try {
				IOUtils.copy( in, out );
			}
			finally {
				out.close();
			}
		}
		finally {
			in.close();
		}
		return temp;
	}

	/**
	 * Builds the collection under test over the given files, using a composite of a
	 * TREC header factory and an ISO-8859-1 HTML factory.
	 *
	 * @param files names of the files making up the collection.
	 * @return a new collection over <code>files</code>.
	 * @throws Exception if the collection cannot be built.
	 */
	private static TRECDocumentCollection newCollection( final String[] files ) throws Exception {
		final DocumentFactory factory = CompositeDocumentFactory.getFactory( new DocumentFactory[] {
			new TRECHeaderDocumentFactory(),
			new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } )
		} );
		return new TRECDocumentCollection(
			files,
			factory,
			4, // Very small, to induce fragmentation
			false ); // NOTE(review): meaning of this flag is not visible from here; confirm against the constructor.
	}

	/**
	 * Checks a document against row <code>index</code> of {@link #EXPECTED}.
	 * The document is not closed; that is left to the caller.
	 *
	 * @param d the document to check; a <code>null</code> document is a failure.
	 * @param index row of {@link #EXPECTED} to compare against.
	 * @param textIndex index of the "text" field in the collection factory.
	 * @throws Exception if the document content cannot be read.
	 */
	private static void assertDocument( final Document d, final int index, final int textIndex ) throws Exception {
		final String where = "document " + index;
		assertNotNull( where + " is missing", d );
		assertEquals( where + " uri", EXPECTED[ index ][ 0 ], d.uri() );
		assertEquals( where + " title", EXPECTED[ index ][ 1 ], d.title() );
		assertEquals( where + " text", EXPECTED[ index ][ 2 ], IOUtils.toString( (Reader)d.content( textIndex ) ) );
	}

	/**
	 * Iterates over a collection built from <code>testChar255.data</code> and asks every
	 * document for its title; the test fails if doing so raises an
	 * {@link IllegalStateException}.
	 */
	public void testChar255() throws Exception {
		final File temp = copyResourceToTempFile( "testChar255.data", ".testChar255" );
		final TRECDocumentCollection collection = newCollection( new String[] { temp.toString() } );

		DocumentIterator iter = null;
		try {
			iter = collection.iterator();
			Document d;
			while ( ( d = iter.nextDocument() ) != null ) d.title();
		}
		catch ( IllegalStateException e ) {
			// Keep this a test failure, but say what actually went wrong.
			fail( "Unexpected IllegalStateException while iterating: " + e );
		}
		finally {
			if ( iter != null ) iter.close();
		}
	}

	/**
	 * Builds a collection over two files (<code>testContents.data</code> followed by
	 * <code>testContentsAgain.data</code>) and checks uri, title and "text" content of
	 * every document, first by sequential iteration and then by random access.
	 */
	public void testContents() throws Exception {
		final File temp = copyResourceToTempFile( "testContents.data", ".testContents" );
		final File tempAgain = copyResourceToTempFile( "testContentsAgain.data", ".testContentsAgain" );

		final TRECDocumentCollection collection =
			newCollection( new String[] { temp.toString(), tempAgain.toString() } );
		final int textIndex = collection.factory().fieldIndex( "text" );

		// Sequential access: exactly the expected documents, in order, then null.
		final DocumentIterator iter = collection.iterator();
		try {
			for ( int i = 0; i < EXPECTED.length; i++ ) assertDocument( iter.nextDocument(), i, textIndex );
			assertNull( "iterator returned more documents than expected", iter.nextDocument() );
		}
		finally {
			iter.close();
		}

		// Random access: the same documents, each closed after being checked.
		for ( int i = 0; i < EXPECTED.length; i++ ) {
			final Document d = collection.document( i );
			try {
				assertDocument( d, i, textIndex );
			}
			finally {
				if ( d != null ) d.close();
			}
		}
	}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -