⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 extractanchor.java

📁 是个java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
package cn.yicha.subject.spider.store;

import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.lang.Character;

import org.apache.regexp.*;

import cn.yicha.common.util.*;


/**
* Static helpers that pull links out of page markup: HTML/WML anchors,
* WML go-tags (with their postfield parameters), option "onpick" targets and
* redirect attributes. Extracted anchors can also be appended to a per-domain
* log file.
*/
public class ExtractAnchor
{
	// URLs of every anchor already returned by fetchMatchedPatterns; shared by
	// all callers and guarded by synchronizing on the set itself.
	private static final HashSet urlsFetchedAnchors = new HashSet();

	// Text that opens a go-tag; matched case-insensitively.
	private static final String GO_TAG_PREFIX = "<go";

	// Regex fragment for a quoted attribute value, captured as one group.
	// The quote class is ["'] only: a '|' inside a character class is a
	// literal pipe, which used to cut values such as href="a|b" short.
	private static final String QUOTED_VALUE = "[\"'](.*?)[\"']";

	/**
	* Builds the regex that captures the quoted value of one attribute.
	* @param attr attribute name, inserted into the regex unescaped
	*/
	private static String attributePattern(String attr)
	{
		return attr + "\\s*=\\s*" + QUOTED_VALUE;
	}

	/**
	* Case-insensitive indexOf that works on the original string, so returned
	* indices are always valid for it (a lower-cased copy can differ in length
	* for some characters and costs a full copy per call).
	* @return index of the first match at or after fromIndex, or -1
	*/
	private static int indexOfIgnoreCase(String text, String needle, int fromIndex)
	{
		int needleLen = needle.length();
		int last = text.length() - needleLen;
		for (int i = Math.max(fromIndex, 0); i <= last; i++) {
			if (text.regionMatches(true, i, needle, 0, needleLen)) {
				return i;
			}
		}
		return -1;
	}

	/**
	* Extracts every anchor of the form &lt;a href="..."&gt;title&lt;/a&gt; from a page.
	* @param content page markup
	* @param url URL of the page, stored on each anchor as its absolute URL
	*/
	private static AnchorProperty[] extractAnchors(String content, String url)
	{
		System.out.println("Enter extractAnchors");
		
		// At least one whitespace is required after "<a" so "<ahref" is not an anchor.
		String pattern = "<a\\s+" + attributePattern("href") + ">(.*?)</a>";
		return fetchMatchedPatterns(content, pattern, url);
	}

	/**
	* Collects an AnchorProperty for every match of a two-group pattern, skipping
	* anchors whose resolved URL was already returned by an earlier call.
	* @param content source text to scan
	* @param pattern regex whose group 1 is the link and group 2 the title
	* @param absoluteUrl URL of the page the content came from
	* @return the anchors not seen before, in match order
	*/
	private static AnchorProperty[] fetchMatchedPatterns(String content, String pattern, String absoluteUrl)
	{
		ArrayList anchorList = new ArrayList();

		RE patt = StringParser.getPatternObj(pattern);
		CharacterIterator in = new ReaderCharacterIterator(new StringReader(content));

		int end = 0;
		while (patt.match(in, end))
		{
			int hrefStart = patt.getParenStart(1);
			int hrefEnd = patt.getParenEnd(1);
			int titleStart = patt.getParenStart(2);
			int titleEnd = patt.getParenEnd(2);
			if (hrefStart < 0 || hrefEnd < 0 || titleStart < 0 || titleEnd < 0) {
				break;
			}

			AnchorProperty ap = new AnchorProperty();
			ap.setAbsoluteUrl(absoluteUrl);
			ap.setRelativeUrl(in.substring(hrefStart, hrefEnd));
			ap.setTitle(in.substring(titleStart, titleEnd));

			String url = ap.getUrl();
			System.out.println(ap.getTitle() + "\n" + url);

			// add() reports whether the URL was new, so test-and-insert is one step.
			synchronized (urlsFetchedAnchors)
			{
				if (urlsFetchedAnchors.add(url)) {
					anchorList.add(ap);
				}
			}

			// The next search resumes at the end of the title group.
			end = titleEnd;
		}

		return (AnchorProperty[]) anchorList.toArray(new AnchorProperty[0]);
	}	

	/**
	* Returns the directory where anchor log files are stored.
	* @param basePath base directory
	* @param serviceID sub-directory name, used only when downloadMonternet is true
	* @param downloadMonternet whether to append "/" + serviceID to basePath
	*/
	public static String getSaveDiskPath(String basePath, String serviceID, boolean downloadMonternet)
	{
		return downloadMonternet ? basePath + "/" + serviceID : basePath;
	}
	
	/**
	* Appends one "title,url,depth" line per anchor to a local file, creating
	* parent directories as needed. Synchronized on the class so concurrent
	* appends do not interleave.
	* @param anchorArray anchors to write
	* @param fileName file to append to
	* @param depth depth value written on every line
	*/
	private synchronized static void saveDisk(AnchorProperty[] anchorArray, String fileName, int depth)
	{
		// getParentFile() is null for a bare file name.
		File parentDir = new File(fileName).getParentFile();
		if (parentDir != null) {
			parentDir.mkdirs();
		}

		PrintWriter pw = null;
		try {
			// NOTE(review): text is written in the platform default charset, as before.
			pw = new PrintWriter(
				new OutputStreamWriter( new FileOutputStream(fileName, true) ) );
			for (int i=0; i < anchorArray.length; i++) {
				pw.println(anchorArray[i].getTitle() + "," + anchorArray[i].getUrl() + "," + depth);
			}

			// PrintWriter never throws on write failure; checkError() flushes and reports it.
			if (pw.checkError()) {
				System.err.println("Failed writing anchors to " + fileName);
			}
		}
		catch (java.io.FileNotFoundException e) {
			e.printStackTrace();
		}
		finally {
			// Close even when an anchor getter throws mid-loop.
			if (pw != null) {
				pw.close();
			}
		}
	}

	/**
	* Extracts the lower-cased host part of a URL: a leading "http://" or
	* "https://" is dropped and the rest is cut at the first '/', '?' or '#'.
	* A port, if present, is kept.
	*/
	private static String extractDomain(String url)
	{
		final String[] schemePrefixes = { "http://", "https://" };

		// Fixed locale so the result does not depend on the JVM default
		// (e.g. Turkish dotless i).
		String rest = url.toLowerCase(java.util.Locale.ENGLISH);
		for (int i=0; i < schemePrefixes.length; i++) {
			if (rest.startsWith(schemePrefixes[i])) {
				rest = rest.substring(schemePrefixes[i].length());
				break;
			}
		}

		// The host ends at the first path, query or fragment delimiter.
		int cut = rest.length();
		final char[] delimiters = { '/', '?', '#' };
		for (int i=0; i < delimiters.length; i++) {
			int pos = rest.indexOf(delimiters[i]);
			if (pos >= 0 && pos < cut) {
				cut = pos;
			}
		}
		return rest.substring(0, cut);
	}

	/**
	* Extracts the anchors (text and link) of a page and appends them to the
	* file savePath + "/" + domain-of-url.
	* @param content page markup
	* @param url URL of the page
	* @param savePath directory holding the per-domain anchor files
	* @param depth depth value recorded with every anchor
	*/
	public static void fetchAnchors(String content, String url, String savePath, int depth)
	{
		AnchorProperty[] anchorArray = extractAnchors(content, url);

		String fileName = savePath + "/" + extractDomain(url);
		saveDisk(anchorArray, fileName, depth);				
	}

	/**
	* Extracts the target of every paired &lt;go ...&gt;...&lt;/go&gt; tag, with the
	* tag's postfield values appended as query parameters. Self-closing go-tags
	* are removed from the content before matching.
	*/
	public static String[] fetchGoHrefs(String content)
	{	
		String pattern = "<go.*?" + attributePattern("href") + ".*?>(.*?)</go>";
		return extractMatchedGoHrefs(delPureGo(content), pattern);
	}

	/**
	* Returns the full text of every self-closing go-tag (&lt;go ... /&gt;).
	* A tag with no closing '>' ends the scan.
	*/
	public static String[] getPureGo(String content)
	{
		ArrayList results = new ArrayList();

		// pos >= 0: a tag at the very start of the content must be seen too.
		int pos = indexOfIgnoreCase(content, GO_TAG_PREFIX, 0);
		while (pos >= 0)
		{
			int close = content.indexOf('>', pos);
			if (close < 0) {
				// Truncated tag: nothing complete is left.
				break;
			}

			// Self-closing when the character before '>' is '/'.
			if (content.charAt(close - 1) == '/') {
				results.add(content.substring(pos, close + 1));
			}

			pos = indexOfIgnoreCase(content, GO_TAG_PREFIX, close);
		}

		return (String[]) results.toArray(new String[0]);
	}

	/**
	* Returns the content with every self-closing go-tag removed; all other
	* text, including paired go-tags, is kept unchanged.
	*/
	private static String delPureGo(String content)
	{
		StringBuffer results = new StringBuffer();

		// beginPos marks the start of text not yet copied to the result.
		int beginPos = 0;
		int pos = indexOfIgnoreCase(content, GO_TAG_PREFIX, 0);
		while (pos >= 0)
		{
			// Copy everything before this tag.
			results.append(content.substring(beginPos, pos));
			beginPos = pos;

			int close = content.indexOf('>', pos);
			if (close < 0) {
				// Truncated tag: keep the remainder as it is.
				break;
			}

			// Skip a self-closing tag; a paired tag stays pending and is
			// copied with the following text.
			if (content.charAt(close - 1) == '/') {
				beginPos = close + 1;
			}

			pos = indexOfIgnoreCase(content, GO_TAG_PREFIX, close);
		}
		results.append(content.substring(beginPos));

		return results.toString();
	}

	/**
	* Extracts the href value of every self-closing go-tag (&lt;go ... /&gt;).
	* Each element is whatever StringParser.getMatchedElement returns for that tag.
	*/
	public static String[] fetchTagGoHrefs(String content)
	{
		ArrayList results = new ArrayList();
		String pattern = attributePattern("href");

		String[] goTags = getPureGo(content);
		for (int i=0; i < goTags.length; i++)
		{
			results.add(StringParser.getMatchedElement(goTags[i], pattern));
		}

		return (String[]) results.toArray(new String[0]);
	}

	/**
	* Builds "name=value&amp;name=value" from the postfield tags inside a go-tag body.
	* @param content text between &lt;go ...&gt; and &lt;/go&gt;
	* @return the parameter string (empty when there are no postfields), or null
	*         when any value contains a "$" variable reference
	*/
	private static String parseGoField(String content)
	{
		String patternPostField1 = "<postfield(.*?)>";
		String patternPostField2 = "<postfield(.*?)/>";
		String patternFieldName = attributePattern("name");
		String patternFieldValue = attributePattern("value");

		String[] postFieldList = StringParser.fetchMatchedPatterns(content, patternPostField1);
		if (postFieldList.length <= 0) {
			postFieldList = StringParser.fetchMatchedPatterns(content, patternPostField2);
		}
		
		StringBuffer urlPara = new StringBuffer();
		for (int i=0; i < postFieldList.length; i++)
		{
			// NOTE(review): assumes getMatchedElement never returns null - confirm in StringParser.
			String fieldName = StringParser.getMatchedElement(postFieldList[i], patternFieldName);
			String fieldValue = StringParser.getMatchedElement(postFieldList[i], patternFieldValue);

			// Links whose values depend on variables are not assembled.
			if (fieldValue.indexOf("$") >= 0) {
				return null;
			}
			
			if (urlPara.length() > 0) {
				urlPara.append("&");
			}
			urlPara.append(fieldName).append("=").append(fieldValue);
		}
		
		return urlPara.toString();
	}
	
	/**
	* Collects the target of every go-tag matched by the pattern, appending its
	* postfield parameters with "?" or "&amp;" as appropriate. Tags whose
	* postfields use variables are skipped.
	* @param content markup to scan
	* @param pattern regex whose group 1 is the href and group 2 the tag body
	*/
	private static String[] extractMatchedGoHrefs(String content, String pattern)
	{
		ArrayList hrefList = new ArrayList();
		
		RE patt = StringParser.getPatternObj(pattern);		
		CharacterIterator in = new ReaderCharacterIterator(new StringReader(content));

		int end = 0;
		while (patt.match(in, end))
		{
			int hrefStart = patt.getParenStart(1);
			int hrefEnd = patt.getParenEnd(1);
			int bodyStart = patt.getParenStart(2);
			int bodyEnd = patt.getParenEnd(2);
			if (hrefStart < 0 || hrefEnd < 0 || bodyStart < 0 || bodyEnd < 0) {
				break;
			}

			String goHref = in.substring(hrefStart, hrefEnd);
			String goField = parseGoField(in.substring(bodyStart, bodyEnd));
			if (goField != null) {
				if (goField.length() == 0) {
					// No postfields: do not leave a dangling "?" or "&".
					hrefList.add(goHref);
				}
				else if (goHref.indexOf("?") >= 0) {
					hrefList.add(goHref + "&" + goField);
				}
				else {
					hrefList.add(goHref + "?" + goField);
				}
			}

			// The next search resumes at the end of the body group.
			end = bodyEnd;
		}

		return (String[]) hrefList.toArray(new String[0]);
	}

	/**
	* Extracts every value of one attribute of one tag from a page.
	* @param tag tag name, inserted into the regex unescaped
	* @param attr attribute name, inserted into the regex unescaped
	* @param source page markup
	*/
	public static String[] extractAttributesFromTag(String tag, String attr, String source)
	{
		String pattern = "<" + tag + ".*?" + attributePattern(attr);
		
		return StringParser.fetchMatchedPatterns(source, pattern);		
	}

	/**
	* Extracts the href value of every "a" start tag in a page.
	* Each element is whatever StringParser.getMatchedElement returns for that tag.
	*/
	public static String[] extractAnchorsFromTag(String source)
	{
		ArrayList results = new ArrayList();
		String pattern = attributePattern("href");

		String[] aTags = getStartTagContents(source, "a");
		for (int i=0; i < aTags.length; i++)
		{
			results.add(StringParser.getMatchedElement(aTags[i], pattern));
		}

		return (String[]) results.toArray(new String[0]);
	}

	/**
	* Extracts every "onpick" attribute value (links of selection controls).
	*/
	public static String[] extractOptionsFromTag(String source)
	{
		return StringParser.fetchMatchedPatterns(source, attributePattern("onpick"));		
	}
	
	/**
	* Finds the next start tag whose name is exactly tag, ignoring case. The
	* name must be followed by whitespace, so looking for "a" does not match
	* &lt;anchor&gt;.
	* @return index of the '<', or -1 when there is no further such tag
	*/
	private static int getTagPos(String content, String tag, int startPos)
	{
		final String tagPrefix = "<" + tag;

		int pos = indexOfIgnoreCase(content, tagPrefix, startPos);
		while (pos >= 0)
		{
			int afterName = pos + tagPrefix.length();
			if (afterName >= content.length()) {
				// The prefix is the last thing in the content: no complete tag.
				return -1;
			}

			// isWhitespace covers tab and line breaks, which isSpaceChar alone
			// rejects; isSpaceChar is kept so non-breaking spaces still count.
			char ch = content.charAt(afterName);
			if (Character.isWhitespace(ch) || Character.isSpaceChar(ch)) {
				return pos;
			}

			pos = indexOfIgnoreCase(content, tagPrefix, afterName);
		}

		return -1;
	}

	/**
	* Returns the text of every start tag named tag, from its '<' through its '>'.
	* A tag with no closing '>' ends the scan.
	*/
	public static String[] getStartTagContents(String content, String tag)
	{
		ArrayList results = new ArrayList();

		// pos >= 0: a tag at the very start of the content must be seen too.
		int pos = getTagPos(content, tag, 0);
		while (pos >= 0)
		{
			int close = content.indexOf('>', pos);
			if (close < 0) {
				break;
			}

			results.add(content.substring(pos, close + 1));

			pos = getTagPos(content, tag, close);
		}

		return (String[]) results.toArray(new String[0]);
	}
	
	/**
	* Extracts href values from "a" tags whose first attribute is href
	* (the older, purely regex-based extraction).
	*/
	public static String[] extractOldAnchorsFromTag(String source)
	{
		// At least one whitespace is required after "<a" so "<ahref" is not an anchor.
		String pattern = "<a\\s+" + attributePattern("href");
		
		return StringParser.fetchMatchedPatterns(source, pattern);		
	}
	
	/**
	* Extracts every value of the "ontimer", "onenterforward" and
	* "onenterbackward" attributes, grouped in that attribute order.
	*/
	public static String[] extractRedirectsFromTag(String source)
	{
		String[] tagArray = {"ontimer", "onenterforward", "onenterbackward"};

		ArrayList resultArray = new ArrayList();
		for (int i=0; i < tagArray.length; i++) 
		{
			String[] matchPatterns = StringParser.fetchMatchedPatterns(source, getPatternFromRedirectTag(tagArray[i]));
			for (int j=0; j < matchPatterns.length; j++) {
				resultArray.add(matchPatterns[j]);
			}
		}

		return (String[]) resultArray.toArray(new String[0]);
	}

	/**
	* Returns the regex that captures the quoted value of a redirect attribute.
	*/
	private static String getPatternFromRedirectTag(String tag)
	{
		return attributePattern(tag);
	}
	
	/**
	* Test helper: downloads a page and prints every "a" href value found in it.
	* A download failure is printed and the page is then treated as empty.
	*/
	public static void fetchAttributes(String url)
	{
		String urlContent = "";
		try {
			urlContent = URLReader.readUrlContent(url);
		}
		catch (Exception e) {
			e.printStackTrace();
		}

		String[] attrList = extractAttributesFromTag("a", "href", urlContent);
		for (int i=0; i < attrList.length; i++) {
			System.out.println(attrList[i]);
		}
	}
	
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -