📄 stringextractor.java

📁 本程序用于对页面信息进行提取并分析
💻 JAVA
字号:
// HTMLParser Library $Name: v1_6_20060319 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/StringExtractor.java,v $// $Author: derrickoswald $// $Date: 2005/04/12 11:27:42 $// $Revision: 1.48 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.parserapplications;import org.htmlparser.beans.StringBean;import org.htmlparser.util.ParserException;/** * Extract plaintext strings from a web page. * Illustrative program to gather the textual contents of a web page. * Uses a {@link org.htmlparser.beans.StringBean StringBean} to accumulate * the user visible text (what a browser would display) into a single string. */public class StringExtractor{    private String resource;    /**     * Construct a StringExtractor to read from the given resource.     * @param resource Either a URL or a file name.     */    public StringExtractor (String resource)    {        this.resource = resource;    }    /**     * Extract the text from a page.     * @return The textual contents of the page.     * @param links if <code>true</code> include hyperlinks in output.     * @exception ParserException If a parse error occurs.     */    public String extractStrings (boolean links)        throws            ParserException    {        StringBean sb;        sb = new StringBean ();        sb.setLinks (links);        sb.setURL (resource);        return (sb.getStrings ());    }    /**     * Mainline.     * @param args The command line arguments.     */    public static void main (String[] args)    {        boolean links;        String url;        StringExtractor se;        links = false;        url = null;        for (int i = 0; i < args.length; i++)            if (args[i].equalsIgnoreCase ("-links"))                links = true;            else                url = args[i];        if (null != url)        {            se = new StringExtractor (url);            try            {                System.out.println (se.extractStrings (links));            }            catch (ParserException e)            {                e.printStackTrace ();            }        }        else            System.out.println ("Usage: java -classpath htmlparser.jar org.htmlparser.parserapplications.StringExtractor [-links] url");    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -