📄 htmldocumentfactory.java
字号:
package it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.chars.CharArrays;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;import it.unimi.dsi.parser.BulletParser;import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;import it.unimi.dsi.parser.callback.TextExtractor;import it.unimi.dsi.util.Properties;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.Reader;import java.nio.charset.Charset;import org.apache.commons.configuration.ConfigurationException;/** A factory that provides fields for body and title of HTML documents. * It uses internally a {@link BulletParser}. * A default encoding can be provided * using the property {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}. */public class HtmlDocumentFactory extends PropertyBasedDocumentFactory { private static final long serialVersionUID = 1L; public static enum MetadataKeys { /** The maximum number of characters before an anchor. */ MAXPREANCHOR, /** The maximum number of characters in an anchor. */ MAXANCHOR, /** The maximum number of characters after an anchor. */ MAXPOSTANCHOR, }; private static final int DEFAULT_BUFFER_SIZE = 16 * 1024; /** A parser that will be used to extract text from HTML documents. */ private transient BulletParser parser; /** The callback recording text. */ private transient TextExtractor textExtractor; /** The callback for anchors. */ private transient AnchorExtractor anchorExtractor; /** The word reader used for all documents. */ private transient WordReader wordReader; /** The maximum number of characters before an anchor. */ private int maxPreAnchor; /** The maximum number of characters in an anchor. */ private int maxAnchor; /** The maximum number of characters after an anchor. */ private int maxPostAnchor; private transient char[] text; protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException { if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, key ) ) { metadata.put( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, ensureJustOne( key, values ) ); return true; } else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, key ) ) { metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, Charset.forName( ensureJustOne( key, values ) ).toString() ); return true; } else if ( sameKey( MetadataKeys.MAXPREANCHOR, key ) ) { metadata.put( MetadataKeys.MAXPREANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) ); return true; } else if ( sameKey( MetadataKeys.MAXANCHOR, key ) ) { metadata.put( MetadataKeys.MAXANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) ); return true; } else if ( sameKey( MetadataKeys.MAXPOSTANCHOR, key ) ) { metadata.put( MetadataKeys.MAXPOSTANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) ); return true; } return super.parseProperty( key, values, metadata ); } private void init() { this.parser = new BulletParser(); ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder(); composedBuilder.add( this.textExtractor = new TextExtractor() ); composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, maxPostAnchor ) ); parser.setCallback( composedBuilder.compose() ); this.wordReader = new FastBufferedReader(); text = new char[ DEFAULT_BUFFER_SIZE ]; } @SuppressWarnings("boxing") private void initVars() { maxPreAnchor = (Integer)resolve( MetadataKeys.MAXPREANCHOR, defaultMetadata, 32 ); maxAnchor = (Integer)resolve( MetadataKeys.MAXANCHOR, defaultMetadata, 256 ); maxPostAnchor = (Integer)resolve( MetadataKeys.MAXPOSTANCHOR, defaultMetadata, 32 ); } /** Returns a copy of this document factory. A new parser is allocated for the copy. */ public HtmlDocumentFactory copy() { return new HtmlDocumentFactory( defaultMetadata ); } public HtmlDocumentFactory( final Properties properties ) throws ConfigurationException { super( properties ); initVars(); init(); } public HtmlDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) { super( defaultMetadata ); initVars(); init(); } public HtmlDocumentFactory( final String[] property ) throws ConfigurationException { super( property ); initVars(); init(); } public HtmlDocumentFactory() { super(); initVars(); init(); } public int numberOfFields() { return 3; } public String fieldName( final int field ) { ensureFieldIndex( field ); switch( field ) { case 0: return "text"; case 1: return "title"; case 2: return "anchor"; default: throw new IllegalArgumentException(); } } public int fieldIndex( final String fieldName ) { for ( int i = 0; i < numberOfFields(); i++ ) if ( fieldName( i ).equals( fieldName ) ) return i; return -1; } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); switch( field ) { case 0: return FieldType.TEXT; case 1: return FieldType.TEXT; case 2: return FieldType.VIRTUAL; default: throw new IllegalArgumentException(); } } private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException { s.defaultReadObject(); init(); } /** An HTML document. If a <samp>TITLE</samp> element is available, it will be used for {@link #title()} * instead of the default value. * * <p>We delay the actual parsing until it is actually necessary, so operations like * getting the document URI will not require parsing. */ protected class HtmlDocument extends AbstractDocument { private final Reference2ObjectMap<Enum<?>,Object> metadata; /** Whether we already parsed the document. */ private boolean parsed; /** The cached raw content. */ private final InputStream rawContent; private void ensureParsed() throws IOException { if ( parsed ) return; int offset = 0, l; Reader r = new InputStreamReader( rawContent, (String)resolveNotNull( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, metadata ) ); while( ( l = r.read( text, offset, text.length - offset ) ) > 0 ) { offset += l; text = CharArrays.grow( text, offset + 1 ); } parser.parse( text, 0, offset ); textExtractor.title.trim(); parsed = true; } protected HtmlDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) { this.metadata = metadata; this.rawContent = rawContent; } public CharSequence title() { try { ensureParsed(); } catch ( IOException e ) { throw new RuntimeException( e ); } return (CharSequence)( textExtractor.title.length() == 0 ? resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata ): textExtractor.title ); } public String toString() { return title().toString(); } public CharSequence uri() { return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata ); } public Object content( final int field ) throws IOException { ensureFieldIndex( field ); ensureParsed(); switch( field ) { case 0: return new FastBufferedReader( textExtractor.text ); case 1: return new FastBufferedReader( textExtractor.title ); case 2: return anchorExtractor.anchors; default: throw new IllegalArgumentException(); } } public WordReader wordReader( final int field ) { ensureFieldIndex( field ); return wordReader; } } public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException { return new HtmlDocument( rawContent, metadata ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -