📄 anchorextractor.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.util.parser.callback;

/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2007 Paolo Boldi
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.callback.DefaultCallback;
import it.unimi.dsi.util.CircularCharArrayBuffer;

import java.util.Map;

import org.apache.log4j.Logger;

/** A callback extracting anchor text. When instantiating the extractor, you can specify the number of characters to
 * be considered before the anchor, after the anchor or during the anchor (just the first characters are taken into
 * consideration in the last two cases, and just the last ones in the first case).
 *
 * <p>At the end of parsing, the result (the list of anchors) is available in {@link #anchors}, whose
 * elements provide the content of the <samp>href</samp> attribute and
 * the text of the anchor and around the anchor; text is however modified so that fragments of words at the beginning
 * of the pre-anchor context, or at the end of the post-anchor context, are cut away.
 *
 * <p>A part (pre-anchor context, anchor text, post-anchor context) is trimmed only when it has been filled up to
 * its maximum length, as only in that case it may have been truncated in the middle of a word. This is
 * a heuristic: a part that is <em>exactly</em> as long as its limit is trimmed, too.
 *
 * <p>For example, a fragment like:
 *
 * <pre>
 *    ...foo fOO FOO FOO &lt;a href="xxx"&gt;ANCHOR TEXT&lt;/a&gt; BAR BAR BAr bar...
 * </pre>
 *
 * (where the uppercase part represents the pre- and post-anchor context) generates the element
 *
 * <pre>
 *    Anchor("xxx", "FOO FOO ANCHOR TEXT BAR BAR")
 * </pre>
 */
public class AnchorExtractor extends DefaultCallback {

	/** A class representing an anchor. It is used to return the results of parsing. */
	public final static class Anchor implements VirtualDocumentFragment {
		private static final long serialVersionUID = 1L;
		/** The content of the <samp>href</samp> attribute for this anchor. */
		private final MutableString href;
		/** The text of this anchor together with its surrounding context. */
		private final MutableString anchorText;

		/** Creates a new anchor.
		 *
		 * @param href the content of the <samp>href</samp> attribute (stored by reference).
		 * @param anchorText the text of the anchor and around the anchor (stored by reference).
		 */
		public Anchor( final MutableString href, final MutableString anchorText ) {
			this.href = href;
			this.anchorText = anchorText;
		}

		/** Returns the content of the <samp>href</samp> attribute.
		 *
		 * @return the content of the <samp>href</samp> attribute.
		 */
		public MutableString documentSpecifier() {
			return href;
		}

		/** Returns the text of the anchor and around the anchor.
		 *
		 * @return the text of the anchor and around the anchor.
		 */
		public MutableString text() {
			return anchorText;
		}

		@Override
		public String toString() {
			return "<" + href + ", \"" + anchorText + "\">";
		}
	}

	public static final Logger LOGGER = Logger.getLogger( AnchorExtractor.class );
	public static final boolean DEBUG = false;

	/** The resulting list of {@linkplain Anchor anchors}. */
	public final ObjectList<Anchor> anchors = new ObjectArrayList<Anchor>();
	/** The circular buffer for pre-anchor context. */
	private final CircularCharArrayBuffer preAnchor;
	/** The buffer for the anchor text. */
	private final MutableString anchor;
	/** The maximum number of characters before the anchor. */
	private final int maxBefore;
	/** The maximum number of characters in the anchor. */
	private final int maxAnchor;
	/** The maximum number of characters after anchor. */
	private final int maxAfter;
	/** The post-anchor. */
	private final MutableString postAnchor;
	/** The URL of the anchor that is waiting to be emitted, or <code>null</code> if there is none. */
	private MutableString url;
	/** The resulting string (pre+anchor+post); it is reused and copied at each emission. */
	private final MutableString result;
	/** When an anchor opens, the pre-anchor buffer is copied in this array. */
	private char[] preAnchorArray;

	/** The position of the parser with respect to the anchor currently under examination. */
	private enum State {
		BEFORE_ANCHOR, IN_ANCHOR, AFTER_ANCHOR
	}

	private State state;

	/** Creates a new anchor extractor.
	 *
	 * @param maxBefore maximum number of characters to be considered before the anchor.
	 * @param maxAnchor maximum number of characters to be considered in the anchor.
	 * @param maxAfter maximum number of characters to be considered after the anchor.
	 */
	public AnchorExtractor( int maxBefore, int maxAnchor, int maxAfter ) {
		preAnchor = new CircularCharArrayBuffer( maxBefore );
		anchor = new MutableString( maxAnchor );
		postAnchor = new MutableString( maxAfter );
		result = new MutableString( maxBefore + maxAnchor + maxAfter );
		this.maxBefore = maxBefore;
		this.maxAfter = maxAfter;
		this.maxAnchor = maxAnchor;
		state = State.BEFORE_ANCHOR;
	}

	/** Asks the parser for tags, attributes (just <samp>href</samp>) and text. */
	public void configure( final BulletParser parser ) {
		parser.parseTags( true );
		parser.parseAttributes( true );
		parser.parseText( true );
		parser.parseAttribute( Attribute.HREF );
	}

	/** Resets this extractor, discarding the anchors collected so far. */
	public void startDocument() {
		state = State.BEFORE_ANCHOR;
		anchors.clear();
		preAnchor.clear();
		anchor.setLength( 0 );
		postAnchor.setLength( 0 );
		url = null;
	}

	/** Emits the pending anchor, if any. */
	public void endDocument() {
		if ( url != null ) {
			emit();
		}
		url = null;
	}

	/** Handles the opening of an <samp>a</samp> element with an <samp>href</samp> attribute; every other
	 * element is ignored, and so is an anchor opening inside another anchor. */
	public boolean startElement( final Element element, final Map<Attribute,MutableString> attrMap ) {
		if ( element != Element.A || attrMap == null || ! attrMap.containsKey( Attribute.HREF ) ) {
			return true;
		}

		if ( state == State.AFTER_ANCHOR ) {
			// The previous anchor did not fill its post-anchor context: we emit it with what we have.
			emit();
			state = State.BEFORE_ANCHOR;
		}

		if ( state == State.BEFORE_ANCHOR ) {
			preAnchorArray = preAnchor.toCharArray();
			preAnchor.clear();
			if ( DEBUG ) System.out.println( "Freezing now pre: <" + new String( preAnchorArray ) + ">" );
			state = State.IN_ANCHOR;
			url = attrMap.get( Attribute.HREF );
			anchor.setLength( 0 );
			postAnchor.setLength( 0 );
		}
		return true;
	}

	/** Handles the closing of the anchor currently open, if any. */
	public boolean endElement( final Element element ) {
		if ( element == Element.A && state == State.IN_ANCHOR ) {
			state = State.AFTER_ANCHOR;
		}
		return true;
	}

	/** Routes text to the buffer(s) associated with the current state, and emits the pending anchor
	 * as soon as its post-anchor context is full. */
	public boolean characters( final char[] characters, final int offset, final int length, final boolean flowBroken ) {
		switch ( state ) {
			case BEFORE_ANCHOR:
				preAnchor.add( characters, offset, length );
				break;
			case IN_ANCHOR:
				// Just the first maxAnchor characters are kept.
				anchor.append( characters, offset, Math.min( length, maxAnchor - anchor.length() ) );
				break;
			case AFTER_ANCHOR:
				// Text following an anchor is also pre-anchor context for the next one.
				preAnchor.add( characters, offset, length );
				postAnchor.append( characters, offset, Math.min( length, maxAfter - postAnchor.length() ) );
				break;
		}

		if ( state == State.AFTER_ANCHOR && postAnchor.length() == maxAfter && url != null ) {
			emit();
			state = State.BEFORE_ANCHOR;
		}
		return true;
	}

	/** Builds the text for the pending anchor, adds the resulting {@link Anchor} to {@link #anchors}
	 * and clears {@link #url}.
	 *
	 * <p>Each part is trimmed of a possible word fragment (a maximal run of letters or digits at the beginning of the
	 * pre-anchor context, or at the end of the anchor text and of the post-anchor context) only if the part
	 * has reached its maximum length: shorter parts cannot have been truncated, so their
	 * first/last word is complete and must be kept.
	 */
	private void emit() {
		// Pre-anchor context: skip the leading word fragment, if the buffer was full.
		int preStart = 0;
		if ( preAnchorArray.length == maxBefore ) {
			while ( preStart < preAnchorArray.length && Character.isLetterOrDigit( preAnchorArray[ preStart ] ) ) {
				preStart++;
			}
		}

		// Post-anchor context: drop the trailing word fragment, if the buffer was full.
		final char[] postAnchorArray = postAnchor.array();
		int postEnd = postAnchor.length();
		if ( postEnd == maxAfter ) {
			while ( postEnd > 0 && Character.isLetterOrDigit( postAnchorArray[ postEnd - 1 ] ) ) {
				postEnd--;
			}
		}

		// Anchor text: drop the trailing word fragment, if the buffer was full.
		final char[] anchorArray = anchor.array();
		int anchorEnd = anchor.length();
		if ( anchorEnd == maxAnchor ) {
			while ( anchorEnd > 0 && Character.isLetterOrDigit( anchorArray[ anchorEnd - 1 ] ) ) {
				anchorEnd--;
			}
		}

		result.setLength( 0 );
		result.append( preAnchorArray, preStart, preAnchorArray.length - preStart )
			.append( anchorArray, 0, anchorEnd )
			.append( postAnchorArray, 0, postEnd );
		// The result buffer is reused, so the anchor gets its own copy.
		anchors.add( new Anchor( url, result.copy() ) );
		url = null;
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -