📄 WikipediaDocumentCollection.java
字号:
// Tail of a method whose start is before this chunk (presumably the raw
// line-reading helper): yields -1 at end of stream, otherwise `start`.
// NOTE(review): cannot see the enclosing signature here — confirm against
// the full source.
return len == -1 ? -1 : start; }

/**
 * A {@link FastBufferedReader} whose words are maximal runs of
 * non-whitespace characters: every character that is not whitespace is a
 * word constituent.
 */
public static class WhitespaceWordReader extends FastBufferedReader {
	private static final long serialVersionUID = 1L;

	@Override
	protected boolean isWordConstituent( final char c ) {
		// Any non-whitespace character belongs to a word.
		return ! Character.isWhitespace( c );
	}
}

/**
 * Copy/deserialisation constructor: stores the collection state and
 * (re)initialises the transient per-field buffers.
 *
 * @param file the data files backing this collection, one per segment.
 * @param factory the factory producing documents from the raw streams.
 * @param pointers per-file monotone lists of byte offsets delimiting documents.
 * @param size the overall number of documents.
 * @param firstDocument for each file, the index of its first document
 * (presumably with a final sentinel entry — {@code iterator()} reads
 * {@code firstDocument[ f + 1 ]}; confirm in the full source).
 * @param phrase whether phrases, rather than whole documents, are indexed.
 * @param gzipped whether the files are gzipped.
 */
protected WikipediaDocumentCollection( String[] file, DocumentFactory factory, ObjectArrayList<EliasFanoMonotoneLongBigList> pointers, int size, int[] firstDocument, boolean phrase, boolean gzipped ) {
	this.file = file;
	this.factory = factory;
	this.pointers = pointers;
	this.size = size;
	this.firstDocument = firstDocument;
	this.gzipped = gzipped;
	this.phrase = phrase;
	// Buffers are transient; they must be rebuilt on every construction.
	initBuffers();
}

/**
 * Returns true if {@code array} starts with {@code pattern}, comparing
 * raw bytes (scanned back to front).
 */
private static boolean startsWith( byte[] array, byte[] pattern ) {
	int length = pattern.length;
	if ( array.length < length ) return false;
	while( length-- != 0 ) if ( array[ length ] != pattern[ length ] ) return false;
	return true;
}

public DocumentFactory factory() { return factory; }

public int size() { return size; }

/**
 * Returns the metadata of the given document, loading the document as a
 * side effect (which fills the shared {@code metadata} map); if the
 * document carries no title, a synthetic "Sentence #&lt;n&gt;" title
 * (1-based) is supplied.
 */
public Reference2ObjectMap<Enum<?>,Object> metadata( final int index ) throws IOException {
	// Populates the shared `metadata` field as a side effect.
	readDocument( index, -1, null );
	if ( ! metadata.containsKey( MetadataKeys.TITLE ) ) metadata.put( MetadataKeys.TITLE, "Sentence #" + ( index + 1 ) );
	return metadata;
}

public Document document( final int index ) throws IOException {
	return factory.getDocument( stream( index ), metadata( index ) );
}

/**
 * Returns the raw content of the given document as one input stream per
 * field, concatenated via {@link MultipleInputStream}.
 */
public InputStream stream( final int index ) throws IOException {
	// Fill the per-field buffers for this document.
	readDocument( index, -1, null );
	FastByteArrayInputStream[] is = new FastByteArrayInputStream[ NUM_FIELDS ];
	for( int i = 0; i < NUM_FIELDS; i++ ) is[ i ] = new FastByteArrayInputStream( buffer[ i ], 0, bufferSize[ i ] );
	return MultipleInputStream.getStream( is );
}

/**
 * Sequential iterator over the whole collection: keeps a single stream
 * open on the current file and advances to the next file when the next
 * document index crosses a file boundary.
 */
@Override
public DocumentIterator iterator() throws IOException {
	return new AbstractDocumentIterator() {
		private int index = 0;
		// Index of the file currently being read.
		private int f = 0;
		private FastBufferedInputStream fbis = new FastBufferedInputStream( new FileInputStream( file[ 0 ] ) );

		public void close() throws IOException {
			super.close();
			if( fbis != null ) {
				fbis.close();
				// Null out so a second close() is a no-op.
				fbis = null;
			}
		}

		public Document nextDocument() throws IOException {
			if ( index == size ) return null;
			// Crossed into the next file: reopen the stream on it.
			// (Relies on firstDocument[ f + 1 ] existing — presumably a
			// sentinel entry; confirm against the field's initialisation.)
			if ( index == firstDocument[ f + 1 ] ) {
				fbis.close();
				fbis = new FastBufferedInputStream( new FileInputStream( file[ ++f ] ) );
			}
			readDocument( index, f, fbis );
			return document( index++ );
		}
	};
}

/**
 * Reads document {@code index} into the per-field buffers and the shared
 * {@code metadata} map.
 *
 * @param index the document to read.
 * @param f the file containing the document, or irrelevant if
 * {@code fbis} is {@code null} (it is then found by binary search).
 * @param fbis an open stream on file {@code f}, or {@code null}, in which
 * case a stream is opened and closed locally.
 *
 * NOTE(review): the {@code gzipped} flag is stored by the constructor but
 * this method always opens a plain {@link FileInputStream} and seeks it —
 * gzip handling is not visible in this chunk; confirm elsewhere.
 */
private void readDocument( final int index, int f, FastBufferedInputStream fbis ) throws IOException {
	ensureDocumentIndex( index );
	// Already buffered — nothing to do. (`lastDocument` is presumably
	// updated by a helper outside this chunk; it is never written here.)
	if ( index == lastDocument ) return;
	final boolean openStream = fbis == null;
	if ( openStream ){
		// Locate the file holding this document: binarySearch returns
		// the match or (-(insertion point) - 1), so -f - 2 recovers the
		// index of the greatest firstDocument entry <= index.
		f = Arrays.binarySearch( firstDocument, index );
		if ( f < 0 ) f = -f - 2;
		fbis = new FastBufferedInputStream( new FileInputStream( file[ f ] ) );
	}
	// Byte range [start, end) of this document inside file f.
	long start = pointers.get( f ).getLong( index - firstDocument[ f ] );
	fbis.position( start );
	final long end = pointers.get( f ).getLong( index - firstDocument[ f ] + 1 );
	// Reset all field buffers and metadata before refilling.
	IntArrays.fill( bufferSize, 0 );
	metadata.clear();
	int l, field;
	boolean startOfPage, startOfSentence;
	String title;
	while( fbis.position() < end ) {
		l = readLine( fbis );
		if ( startsWith( lineBuffer, META_MARKER ) ) {
			// Metadata line: classify it by its more specific marker.
			startOfPage = startOfSentence = false;
			// In phrase mode a document marker terminates the document.
			if ( startsWith( lineBuffer, DOC_MARKER ) && phrase ) return;
			if ( startsWith( lineBuffer, PAGE_MARKER ) ) startOfPage = true;
			else if ( startsWith( lineBuffer, SENTENCE_MARKER ) ) startOfSentence = true;
			if ( startOfPage ) {
				// Page title follows the marker (min/max guard against a
				// line shorter than the marker itself).
				title = new String( lineBuffer, Math.min( PAGE_MARKER.length + 1, l ), Math.max( l - PAGE_MARKER.length - 1, 0 ), "UTF-8" ).trim();
				metadata.put( MetadataKeys.TITLE, title );
				metadata.put( MetadataKeys.URI, "http://en.wikipedia.org/wiki/" + URLEncoder.encode( title, "UTF-8" ) );
			}
			if ( ( startOfPage || startOfSentence ) && ! phrase ) {
				for( int i = 0; i < NUM_FIELDS; i++ ) {
					// Add a paragraph symbol (UTF-8: 0xC2 0xB6) followed by a
					// newline to every field as a page/sentence separator.
					buffer[ i ] = ByteArrays.grow( buffer[ i ], bufferSize[ i ] + 3 );
					buffer[ i ][ bufferSize[ i ]++ ] = (byte)0xc2;
					buffer[ i ][ bufferSize[ i ]++ ] = (byte)0xb6;
					buffer[ i ][ bufferSize[ i ]++ ] = '\n';
				}
			}
		}
		// Content line: tab-separated fields; copy each byte to its
		// field's buffer and append a space at each field boundary
		// (and at end of line).
		else for( int i = field = 0; i < l; i++ ) {
			if ( lineBuffer[ i ] == '\t' ) {
				field++;
			}
			else {
				buffer[ field ] = ByteArrays.grow( buffer[ field ], bufferSize[ field ] + 2 );
				buffer[ field ][ bufferSize[ field ]++ ] = lineBuffer[ i ];
				if ( i == l - 1 || lineBuffer[ i + 1 ] == '\t' ) buffer[ field ][ bufferSize[ field ]++ ] = ' ';
			}
		}
	}
	// Close only streams we opened ourselves; callers keep theirs open.
	if ( openStream ) fbis.close();
}

/** Returns a copy sharing the immutable state but with a fresh factory. */
public WikipediaDocumentCollection copy() {
	return new WikipediaDocumentCollection( file, factory.copy(), pointers, size, firstDocument, phrase, gzipped );
}

// Custom deserialisation: restore fields, then rebuild transient buffers.
private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
	s.defaultReadObject();
	initBuffers();
}

/**
 * Command-line entry point: parses options via JSAP, builds a collection
 * over the given files (or files listed on standard input) and serialises
 * it with {@link BinIO#storeObject(Object, CharSequence)}.
 */
public static void main( final String[] arg ) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
	SimpleJSAP jsap = new SimpleJSAP( WikipediaDocumentCollection.class.getName(), "Saves a serialised document collection based on a set of files.", new Parameter[] {
		new FlaggedOption( "uris", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uris", "A file containing a list of URIs in ASCII encoding, one per line, that will be associated to each file" ),
		new Switch( "phrase", 'p', "phrase", "Index phrases rather than documents." ),
		new Switch( "gzipped", 'z', "gzipped", "The files are gzipped." ),
		new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the serialised collection." ),
		new UnflaggedOption( "file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "A list of files that will be indexed. If missing, a list of files will be read from standard input." )
	} );
	JSAPResult jsapResult = jsap.parse( arg );
	if ( jsap.messagePrinted() ) return;
	String uri[] = null;
	if ( jsapResult.getString( "uris" ) != null ) {
		// Optional per-file URI list, one URI per line.
		Collection<MutableString> lines = new FileLinesCollection( jsapResult.getString( "uris" ), "ASCII" ).allLines();
		uri = new String[ lines.size() ];
		int i = 0;
		for( Object l: lines ) uri[ i++ ] = l.toString();
	}
	// Identity factory configured for UTF-8 input and whitespace-delimited words.
	final DocumentFactory factory = new IdentityDocumentFactory( new Reference2ObjectOpenHashMap<Enum<?>, Object>(
		new Enum[] { PropertyBasedDocumentFactory.MetadataKeys.ENCODING, PropertyBasedDocumentFactory.MetadataKeys.WORDREADER },
		new Object[] { "UTF-8", WhitespaceWordReader.class.getName() } ) );
	String[] file = (String[])jsapResult.getObjectArray( "file", new String[ 0 ] );
	if ( file.length == 0 ) {
		// No files on the command line: read the list from standard input.
		final ObjectArrayList<String> files = new ObjectArrayList<String>();
		BufferedReader bufferedReader = new BufferedReader( new InputStreamReader( System.in ) );
		String s;
		while( ( s = bufferedReader.readLine() ) != null ) files.add( s );
		file = files.toArray( new String[ 0 ] );
	}
	if ( file.length == 0 ) System.err.println( "WARNING: empty file set." );
	if ( uri != null && file.length != uri.length ) throw new IllegalArgumentException( "The number of files (" + file.length + ") and the number of URIs (" + uri.length + ") differ" );
	// One replicated factory instance per field, then serialise the collection.
	BinIO.storeObject( new WikipediaDocumentCollection( file, ReplicatedDocumentFactory.getFactory( factory, NUM_FIELDS, FIELD_NAME ), jsapResult.getBoolean( "phrase"), jsapResult.getBoolean( "gzipped" ) ), jsapResult.getString( "collection" ) );
}}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -