📄 htmlparser.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
/*
 * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
 *
 * Copyright (c) 2001 Brian Pitcher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// $Header: /CVSRepository/spider/cn/yicha/subject/spider/extractor/HTMLParser.java,v 1.2 2005/12/01 02:10:21 zhangdi Exp $

package cn.yicha.subject.spider.extractor;

import org.apache.log4j.Category;

import cn.yicha.common.util.Log4j;
import cn.yicha.common.util.StringUtil;
import cn.yicha.subject.spider.SpiderConfig;
import cn.yicha.subject.spider.URLObject;
import cn.yicha.subject.spider.URLToDownload;
import cn.yicha.subject.spider.store.ExtractAnchor;

import java.io.ByteArrayInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.Runtime;
import java.lang.System;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;


/**
 * Extracts the outgoing links of a downloaded page.
 *
 * <p>The actual tag scanning is delegated to {@link ExtractAnchor}; this class
 * resolves every extracted value against the page's end URL, drops duplicates,
 * and keeps only the links that {@link URLToDownload#isValidAnchor} accepts for
 * the extension lists held by the {@link SpiderConfig}.
 */
public class HTMLParser
{
    // NOTE(review): the category is keyed on URLObject.class rather than
    // HTMLParser.class, which looks like a copy/paste slip. It is left unchanged
    // because existing log4j configuration may route on this category name --
    // confirm before changing.
    private final static Category _logClass = Category.getInstance(URLObject.class);

    /** Configuration supplying the extension lists and the mailto log file. */
    private final SpiderConfig config;

    static
    {
        Log4j.init();
    }

    /**
     * Creates a parser bound to the given spider configuration.
     *
     * @param config configuration consulted when validating links
     */
    public HTMLParser(SpiderConfig config)
    {
        this.config = config;
    }

    /**
     * Extracts all acceptable links found in a document.
     *
     * @param sourceURL   the page the content was downloaded from
     * @param textContent the page content
     * @return a list of {@link URLToDownload}, without duplicates, in extraction order
     */
    public List parseLinksInDocument(URLToDownload sourceURL, String textContent)
    {
        return parseAsHTML(sourceURL, textContent);
    }

    /**
     * Parses the page and extracts all links: anchors first, then go hrefs,
     * then redirects, then option values.
     */
    private List parseAsHTML(URLToDownload sourceURL, String textContent)
    {
        List newURLs = new ArrayList();
        // External forms of the URLs already accepted, used for de-duplication.
        Set newURLSet = new HashSet();

        extractAnchorsFromTags(sourceURL, newURLs, newURLSet, textContent);
        extractGoHrefs(sourceURL, newURLs, newURLSet, textContent);
        extractRedirectsFromTags(sourceURL, newURLs, newURLSet, textContent);
        extractOptionAnchors(sourceURL, newURLs, newURLSet, textContent);

        _logClass.info("Returning " + newURLs.size() + " urls extracted from page " + sourceURL.getURL().toExternalForm());

        // Only pay for building the per-URL messages when debug output is wanted.
        if (_logClass.isDebugEnabled())
        {
            for (int i = 0; i < newURLs.size(); i++)
            {
                URLToDownload u = (URLToDownload) newURLs.get(i);
                _logClass.debug("extract url --> " + u.getURL().toExternalForm());
            }
        }

        return newURLs;
    }

    /**
     * Extracts the attribute values of the link (anchor) tags.
     */
    private void extractAnchorsFromTags(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        addDecodedLinks(sourceURL, newURLs, newURLSet, ExtractAnchor.extractAnchorsFromTag(input), null);
    }

    /**
     * Extracts the attribute values of the redirect tags.
     */
    private void extractRedirectsFromTags(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        addDecodedLinks(sourceURL, newURLs, newURLSet, ExtractAnchor.extractRedirectsFromTag(input),
                "extract redirect tag --> ");
    }

    /**
     * Extracts the link attributes found in selection controls (option tags).
     */
    private void extractOptionAnchors(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        addDecodedLinks(sourceURL, newURLs, newURLSet, ExtractAnchor.extractOptionsFromTag(input),
                "extract option tag --> ");
    }

    /**
     * Extracts all links inside go tags, followed by links of the form
     * {@code <go.../>}.
     *
     * <p>NOTE(review): the original comment stated that "&amp;amp;" does not need
     * replacing here because the URL is assembled by the program, yet the code has
     * always performed the replacement. The replacement is kept so behavior is
     * unchanged -- confirm which one is intended.
     */
    private void extractGoHrefs(URLToDownload sourceURL, List newURLs, Set newURLSet, String input)
    {
        // All go links; these were not logged individually in the original either.
        addDecodedLinks(sourceURL, newURLs, newURLSet, ExtractAnchor.fetchGoHrefs(input), null);

        // All links of the form <go.../>.
        addDecodedLinks(sourceURL, newURLs, newURLSet, ExtractAnchor.fetchTagGoHrefs(input),
                "extract pure go href --> ");
    }

    /**
     * Replaces "&amp;amp;" with "&amp;" in every raw value and offers the result
     * to {@link #addAnchorLink}.
     *
     * @param rawValues values returned by an {@link ExtractAnchor} method; a
     *                  {@code null} array is treated as empty
     * @param logPrefix prefix for an info-level log line per value, or
     *                  {@code null} to add the values without logging
     */
    private void addDecodedLinks(URLToDownload sourceURL, List newURLs, Set newURLSet,
                                 String[] rawValues, String logPrefix)
    {
        if (rawValues == null)
        {
            return;
        }

        for (int i = 0; i < rawValues.length; i++)
        {
            String link = StringUtil.replace(rawValues[i], "&amp;", "&");
            if (logPrefix != null)
            {
                _logClass.info(logPrefix + link);
            }
            addAnchorLink(sourceURL, newURLs, newURLSet, link);
        }
    }

    /**
     * Resolves a link against the page's end URL and records it if it is new and valid.
     *
     * <p>Links that cannot be turned into a URL are skipped (best effort); the
     * failure is reported at debug level instead of being dropped silently.
     */
    private void addAnchorLink(URLToDownload sourceURL, List newURLs, Set newURLSet, String link)
    {
        try
        {
            // Build the absolute URL, using the source page as the context.
            URL u = new URL(sourceURL.getEndURL(), link);
            String externalForm = u.toExternalForm();
            if (newURLSet.contains(externalForm))
            {
                return;
            }

            // Decide from the new link whether the URL is wanted at all.
            URLToDownload newUrl = new URLToDownload(u, 0, sourceURL.getServiceID());
            if (newUrl.isValidAnchor(config.getRingExtensions(), config.getGameExtensions(), config.getInvalidExtensions()))
            {
                newURLs.add(newUrl);
                newURLSet.add(externalForm);
            }
        }
        catch (MalformedURLException murle)
        {
            _logClass.debug("Skipping malformed link --> " + link, murle);
        }
    }

    /**
     * Appends a mailto URL as one line to the configured mailto log file.
     *
     * <p>Failures are logged as warnings and otherwise ignored; the writer is
     * always closed, even when writing fails.
     */
    private void logMailURL(String url)
    {
        PrintWriter pW = null;
        try
        {
            // Second argument 'true' opens the file in append mode.
            FileWriter appendedFile = new FileWriter(config.getMailtoLogFile().toString(), true);
            pW = new PrintWriter(appendedFile);
            pW.println(url);
            pW.flush();
        }
        catch (IOException ioe)
        {
            _logClass.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
        }
        finally
        {
            if (pW != null)
            {
                // Closing the PrintWriter also closes the wrapped FileWriter.
                pW.close();
            }
        }
    }

    /**
     * Check if a particular URL looks like it's a mailto: style link.
     *
     * <p>The comparison is case-insensitive and uses a fixed locale, so the
     * result does not depend on the JVM's default locale (e.g. the Turkish
     * dotted/dotless i).
     *
     * @return {@code true} if the URL contains "mailto:" in any letter case;
     *         {@code false} otherwise, including for {@code null}
     */
    private boolean isMailTo(String url)
    {
        if (url == null)
        {
            return false;
        }

        return url.toUpperCase(Locale.ENGLISH).indexOf("MAILTO:") != -1;
    }

}
💿 文件大小 53 K
👤 上传用户 god_dog
📂 所属分类 Java编程
🏷️ 相关标签

#sipder #10000 #java #压力
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -