📄 regexfilter.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2003 Derrick Oswald//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/filters/RegexFilter.java,v $// $Author: derrickoswald $// $Date: 2005/05/15 11:49:04 $// $Revision: 1.4 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.filters;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Text;/** * This filter accepts all string nodes matching a regular expression. * Because this searches {@link org.htmlparser.Text Text} nodes. it is * only useful for finding small fragments of text, where it is * unlikely to be broken up by a tag. To find large fragments of text * you should convert the page to plain text with something like the * {@link org.htmlparser.beans.StringBean StringBean} and then apply * the regular expression. * <p> * For example, to look for dates use: * <pre> *   (19|20)\d\d([- \\/.](0[1-9]|1[012])[- \\/.](0[1-9]|[12][0-9]|3[01]))? * </pre> * as in: * <pre> * Parser parser = new Parser ("http://cbc.ca"); * RegexFilter filter = new RegexFilter ("(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?"); * NodeIterator iterator = parser.extractAllNodesThatMatch (filter).elements (); * </pre> * which matches a date in yyyy-mm-dd format between 1900-01-01 and 2099-12-31, * with a choice of five separators, either a dash, a space, either kind of * slash or a period. * The year is matched by (19|20)\d\d which uses alternation to allow the * either 19 or 20 as the first two digits. The round brackets are mandatory. * The month is matched by 0[1-9]|1[012], again enclosed by round brackets * to keep the two options together. By using character classes, the first * option matches a number between 01 and 09, and the second * matches 10, 11 or 12. * The last part of the regex consists of three options. The first matches * the numbers 01 through 09, the second 10 through 29, and the third matches 30 or 31. * The day and month are optional, but must occur together because of the ()? * bracketing after the year. */public class RegexFilter implements NodeFilter{    /**     * Use match() matching strategy.     */    public static final int MATCH = 1;    /**     * Use lookingAt() match strategy.     */    public static final int LOOKINGAT = 2;    /**     * Use find() match strategy.     */    public static final int FIND = 3;    /**     * The regular expression to search for.     */    protected String mPatternString;    /**     * The compiled regular expression to search for.     */    protected Pattern mPattern;    /**     * The match strategy.     * @see #RegexFilter(String, int)     */    protected int mStrategy;    /**     * Creates a new instance of RegexFilter that accepts string nodes matching     * the regular expression ".*" using the FIND strategy.     */    public RegexFilter ()    {        this (".*", FIND);    }    /**     * Creates a new instance of RegexFilter that accepts string nodes matching     * a regular expression using the FIND strategy.     * @param pattern The pattern to search for.     */    public RegexFilter (String pattern)    {        this (pattern, FIND);    }    /**     * Creates a new instance of RegexFilter that accepts string nodes matching     * a regular expression.     * @param pattern The pattern to search for.     * @param strategy The type of match:     * <ol>     * <li>{@link #MATCH} use matches() method: attempts to match     * the entire input sequence against the pattern</li>     * <li>{@link #LOOKINGAT} use lookingAt() method: attempts to match     * the input sequence, starting at the beginning, against the pattern</li>     * <li>{@link #FIND} use find() method: scans the input sequence looking     * for the next subsequence that matches the pattern</li>     * </ol>     */    public RegexFilter (String pattern, int strategy)    {        setPattern (pattern);        setStrategy (strategy);    }    /**     * Get the search pattern.     * @return Returns the pattern.     */    public String getPattern ()    {        return (mPatternString);    }    /**     * Set the search pattern.     * @param pattern The pattern to set.     */    public void setPattern (String pattern)    {        mPatternString = pattern;        mPattern = Pattern.compile (pattern);    }    /**     * Get the search strategy.     * @return Returns the strategy.     */    public int getStrategy ()    {        return (mStrategy);    }    /**     * Set the search pattern.     * @param strategy The strategy to use. One of MATCH, LOOKINGAT or FIND.     */    public void setStrategy (int strategy)    {        if ((strategy != MATCH) && (strategy != LOOKINGAT)            && (strategy != FIND))            throw new IllegalArgumentException ("illegal strategy ("                + strategy + ")");        mStrategy = strategy;    }    /**     * Accept string nodes that match the regular expression.     * @param node The node to check.     * @return <code>true</code> if the regular expression matches the     * text of the node, <code>false</code> otherwise.     */    public boolean accept (Node node)    {        String string;        Matcher matcher;        boolean ret;        ret = false;        if (node instanceof Text)        {            string = ((Text)node).getText ();            matcher = mPattern.matcher (string);            switch (mStrategy)            {                case MATCH:                    ret = matcher.matches ();                    break;                case LOOKINGAT:                    ret = matcher.lookingAt ();                    break;                case FIND:                default:                    ret = matcher.find ();                    break;            }        }        return (ret);    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -