📄 documentcollectiontest.java
字号:
for ( int doc = 0; doc < coll.size(); doc++ ) { Document docum = coll.document( doc ); for ( int i = 0; i < nfields; i++ ) { int field = fieldNumber[ i ]; Reader content = (Reader)docum.content( field ); WordReader wordReader = docum.wordReader( field ); wordReader.setReader( content ); StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] ); System.err.println( "Checking document " + doc + " field " + fieldName[ i ] + " (" + field + ")" ); checkSameWords( wordReader, tok ); } docum.close(); } } /** Checks that the documents in the sequence have the same sequence of words as in * <code>document</code>: the names of the fields to be checked are specified in the array. * * @param seq the sequence. * @param fieldName the field names. * @param document documents to be checked against. * @throws IOException */ private void checkAllDocumentsSeq( final DocumentSequence seq, final String[] fieldName, final String[][] document ) throws IOException { final int nfields = fieldName.length; final int[] fieldNumber = new int[ nfields ]; final int[] arrayIndex = new int[ nfields ]; // Look for field indices for ( int i = 0; i < nfields; i++ ) { arrayIndex[ i ] = i; int j; for ( j = 0; j < seq.factory().numberOfFields(); j++ ) if ( seq.factory().fieldName( j ).equals( fieldName[ i ] ) ) { fieldNumber[ i ] = j; break; } assert j < seq.factory().numberOfFields(); } // Sort fields to guarantee that they are correctly numbered GenericSorting.quickSort( 0, nfields, new IntComparator() { public int compare( int x, int y ) { return fieldNumber[ x ] - fieldNumber[ y ]; }}, new Swapper() { public void swap( int x, int y ) { int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t; t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t; String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q; }} ); // Start checking DocumentIterator iterator = seq.iterator(); Document docum; int doc = 0; while ( ( docum = iterator.nextDocument() ) != null ) { for ( int i = 0; i < nfields; i++ ) { int field = fieldNumber[ i ]; Reader content = (Reader)docum.content( field ); WordReader wordReader = docum.wordReader( field ); wordReader.setReader( content ); StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] ); System.err.println( "Checking sequentially document " + doc + " field " + fieldName[ i ] + " (" + field + ")" ); checkSameWords( wordReader, tok ); } docum.close(); doc++; } iterator.close(); } protected void setUp() throws IOException, ClassNotFoundException, ConfigurationException { // Create a new directory under /tmp tempDir = File.createTempFile( "mg4jtest", null ); tempDir.delete(); tempDir.mkdir(); // Now create the hierarchy for HTML files File htmlDir = new File( tempDir, "html" ); htmlDir.mkdir(); System.err.println( "Temporary directory: " + tempDir ); htmlFileSet = new String[ ndoc ]; for ( int i = 0; i < ndoc; i++ ) { String docFile = new File( htmlDir, "doc" + i + ".html" ).toString(); htmlFileSet[ i ] = docFile; Writer docWriter = new OutputStreamWriter( new FileOutputStream( docFile ), "ISO-8859-1" ); docWriter.write( getHTMLDocument( document[ i ] ) ); docWriter.close(); } // Now create the mbox file Writer mboxWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "mbox" ) ), "ISO-8859-1" ); for ( int i = 0; i < ndoc; i++ ) mboxWriter.write( getMboxDocument( document[ i ] ) ); mboxWriter.close(); // Now create the zip collections FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) ); ZipDocumentCollectionBuilder collBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "zip" ).toString(), fileSetDocumentCollection.factory(), true, new ProgressLogger() ); ZipDocumentCollection zipDocumentCollection = collBuilder.build( fileSetDocumentCollection ); BinIO.storeObject( zipDocumentCollection, new File( tempDir, "zip.collection" ).toString() ); zipDocumentCollection.close(); ZipDocumentCollectionBuilder apprCollBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "azip" ).toString(), fileSetDocumentCollection.factory(), false, new ProgressLogger() ); zipDocumentCollection = apprCollBuilder.build( fileSetDocumentCollection ); BinIO.storeObject( zipDocumentCollection, new File( tempDir, "azip.collection" ).toString() ); zipDocumentCollection.close(); fileSetDocumentCollection.close(); } public void testFileSetDocumentCollection() throws IOException, ConfigurationException { System.err.println( "Checking fileset collection" ); FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) ); assertEquals( coll.size(), ndoc ); checkAllDocuments( coll, new String[] { "title", "text" }, document ); coll.close(); } public void testFileSetDocumentCollectionSeq() throws IOException, ConfigurationException { System.err.println( "Checking fileset collection sequentially" ); FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) ); checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document ); coll.close(); } /* public void testMboxDocumentCollection() throws IOException, ConfigurationException, MessagingException { System.err.println( "Checking mbox collection" ); JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES ); checkAllDocuments( coll, new String[] { "subject", "body" }, document ); coll.close(); } public void testMboxDocumentCollectionSeq() throws IOException, ConfigurationException, MessagingException { System.err.println( "Checking mbox collection sequentially" ); JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES ); checkAllDocumentsSeq( coll, new String[] { "subject", "body" }, document ); coll.close(); }*/ public void testZipDocumentCollection() throws IOException, ClassNotFoundException { System.err.println( "Checking zipped collection" ); ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() ); checkAllDocuments( coll, new String[] { "title", "text" }, document ); coll.close(); } public void testZipDocumentCollectionSeq() throws IOException, ClassNotFoundException { System.err.println( "Checking zipped collection sequentially" ); ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() ); checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document ); coll.close(); } public void testZipDocumentCollectionAppr() throws IOException, ClassNotFoundException { System.err.println( "Checking approximated zipped collection" ); ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() ); checkAllDocuments( coll, new String[] { "title", "text" }, document ); coll.close(); } public void testZipDocumentCollectionApprSeq() throws IOException, ClassNotFoundException { System.err.println( "Checking approximated zipped collection sequentially" ); ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() ); checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document ); coll.close(); } public void testInputStreamSequence() throws IOException, ConfigurationException { System.err.println( "Checking input stream (text field only)" ); // Extract only field number 1, and write it out with separator '\u0000' MutableString res = new MutableString(); String[][] justSecondField = new String[ ndoc ][ 1 ]; for ( int i = 0; i < ndoc; i++ ) { res.append( document[ i ][ 1 ] + "\u0000" ); justSecondField[ i ][ 0 ] = document[ i ][ 1 ]; } String resString = res.toString(); // Write the sequence on a file (in UTF-8) Writer resWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "stream" ) ), "UTF-8" ); resWriter.write( resString ); resWriter.close(); // Read it as a input stream document sequence InputStream is = new FileInputStream( new File( tempDir, "stream" ) ); DocumentSequence seq = new InputStreamDocumentSequence( is, '\u0000', new IdentityDocumentFactory( DEFAULT_PROPERTIES ) ); checkAllDocumentsSeq( seq, new String[] { "text" }, justSecondField ); seq.close(); } protected void tearDown() throws IOException { FileUtils.forceDeleteOnExit( tempDir ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -