📄 htmldocumentfactory.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.document;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.chars.CharArrays;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;import it.unimi.dsi.parser.BulletParser;import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;import it.unimi.dsi.parser.callback.TextExtractor;import it.unimi.dsi.util.Properties;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.Reader;import java.nio.charset.Charset;import org.apache.commons.configuration.ConfigurationException;/** A factory that provides fields for body and title of HTML documents.  * It uses internally a {@link BulletParser}.  * A default encoding can be provided * using the property {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}. */public class HtmlDocumentFactory extends PropertyBasedDocumentFactory {	private static final long serialVersionUID = 1L;	public static enum MetadataKeys {		/** The maximum number of characters before an anchor. */		MAXPREANCHOR,		/** The maximum number of characters in an anchor. */		MAXANCHOR,		/** The maximum number of characters after an anchor. */		MAXPOSTANCHOR,	};	private static final int DEFAULT_BUFFER_SIZE = 16 * 1024;	/** A parser that will be used to extract text from HTML documents. */	private transient BulletParser parser;	/** The callback recording text. */	private transient TextExtractor textExtractor;	/** The callback for anchors. */	private transient AnchorExtractor anchorExtractor;	/** The word reader used for all documents. */	private transient WordReader wordReader;	/** The maximum number of characters before an anchor. */	private int maxPreAnchor;	/** The maximum number of characters in an anchor. */	private int maxAnchor;	/** The maximum number of characters after an anchor. */	private int maxPostAnchor;			private transient char[] text;	protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {		if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, key ) ) {			metadata.put( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, ensureJustOne( key, values ) );			return true;		}		else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, key ) ) {			metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, Charset.forName( ensureJustOne( key, values ) ).toString() );			return true;		}		else if ( sameKey( MetadataKeys.MAXPREANCHOR, key ) ) {			metadata.put( MetadataKeys.MAXPREANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );			return true;		}		else if ( sameKey( MetadataKeys.MAXANCHOR, key ) ) {			metadata.put( MetadataKeys.MAXANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );			return true;		}		else if ( sameKey( MetadataKeys.MAXPOSTANCHOR, key ) ) {			metadata.put( MetadataKeys.MAXPOSTANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );			return true;		}				return super.parseProperty( key, values, metadata );	}	private void init() {		this.parser = new BulletParser();				ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();		composedBuilder.add( this.textExtractor = new TextExtractor() );		composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, maxPostAnchor ) ); 		parser.setCallback( composedBuilder.compose() );		this.wordReader = new FastBufferedReader();		text = new char[ DEFAULT_BUFFER_SIZE ];	}	@SuppressWarnings("boxing")	private void initVars() {		maxPreAnchor = (Integer)resolve( MetadataKeys.MAXPREANCHOR, defaultMetadata, 32 );		maxAnchor = (Integer)resolve( MetadataKeys.MAXANCHOR, defaultMetadata, 256 );		maxPostAnchor = (Integer)resolve( MetadataKeys.MAXPOSTANCHOR, defaultMetadata, 32 );	}		/** Returns a copy of this document factory. A new parser is allocated for the copy. */	public HtmlDocumentFactory copy() {		return new HtmlDocumentFactory( defaultMetadata );	}		public HtmlDocumentFactory( final Properties properties ) throws ConfigurationException {		super( properties );		initVars();		init();	}	public HtmlDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) {		super( defaultMetadata );		initVars();		init();	}	public HtmlDocumentFactory( final String[] property ) throws ConfigurationException {		super( property );		initVars();		init();	}	public HtmlDocumentFactory() {		super();		initVars();		init();	}		public int numberOfFields() {		return 3;	}	public String fieldName( final int field ) {		ensureFieldIndex( field );		switch( field ) {			case 0: return "text";			case 1: return "title";			case 2: return "anchor";			default: throw new IllegalArgumentException();		}	}		public int fieldIndex( final String fieldName ) {		for ( int i = 0; i < numberOfFields(); i++ )			if ( fieldName( i ).equals( fieldName ) ) return i;		return -1;	}		public FieldType fieldType( final int field ) {		ensureFieldIndex( field );		switch( field ) {			case 0: return FieldType.TEXT;			case 1: return FieldType.TEXT;			case 2: return FieldType.VIRTUAL;			default: throw new IllegalArgumentException();		}	}	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {		s.defaultReadObject();		init();	}	/** An HTML document. If a <samp>TITLE</samp> element is available, it will be used for {@link #title()}	 * 	instead of the default value. 	 * 	 * <p>We delay the actual parsing until it is actually necessary, so operations like	 * getting the document URI will not require parsing. */		protected class HtmlDocument extends AbstractDocument {		private final Reference2ObjectMap<Enum<?>,Object> metadata;		/** Whether we already parsed the document. */		private boolean parsed;		/** The cached raw content. */		private final InputStream rawContent;		private void ensureParsed() throws IOException {			if ( parsed ) return;			int offset = 0, l;			Reader r = new InputStreamReader( rawContent, (String)resolveNotNull( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, metadata ) );			while( ( l = r.read( text, offset, text.length - offset ) ) > 0 ) {				offset += l;				text = CharArrays.grow( text, offset + 1 );			}			parser.parse( text, 0, offset );			textExtractor.title.trim();			parsed = true;		}				protected HtmlDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {			this.metadata = metadata;			this.rawContent = rawContent;		}		public CharSequence title() {			try {				ensureParsed();			}			catch ( IOException e ) {				throw new RuntimeException( e );			}			return (CharSequence)( textExtractor.title.length() == 0 ? resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata ): textExtractor.title );		}		public String toString() {			return title().toString();		}		public CharSequence uri() {			return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata );		}		public Object content( final int field ) throws IOException {			ensureFieldIndex( field );			ensureParsed();			switch( field ) {				case 0: return new FastBufferedReader( textExtractor.text );				case 1: return new FastBufferedReader( textExtractor.title );				case 2: return anchorExtractor.anchors;				default: throw new IllegalArgumentException();			}		}		public WordReader wordReader( final int field ) {			ensureFieldIndex( field );			return wordReader; 		}	}	public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException {		return new HtmlDocument( rawContent, metadata );	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -