📄 relative.java

📁 一个主题相关的网络爬虫,实现与某一主题相关的网页的爬取
💻 JAVA
字号:
package com.relative;

import com.parser.*;
import com.segment.*;
import java.io.*;
import java.util.*;
import javax.swing.text.html.HTMLEditorKit;

//求相关度的类
/**
 * Computes how relevant a web page is to a configured topic.
 *
 * <p>The topic is a list of terms read from a text file. A page's title and
 * paragraph text are segmented into words, each text is turned into a weight
 * vector with one component per topic term, and the two vectors are compared
 * against the topic weight vector with a cosine-style similarity.
 */
public class Relative {
	/** Path of the topic term file read by {@link #urlRelative(String)}. */
	private static final String TOPIC_FILE = "./topic/topic.txt";

	/** Topic terms containing one of these keywords get the full weight. */
	private static final String[] CORE_KEYWORDS = { "奥运", "火炬" };

	private static final double CORE_TERM_WEIGHT = 1;
	private static final double OTHER_TERM_WEIGHT = 0.5;

	/** Multipliers applied to the title and body similarities before summing. */
	private static final double TITLE_FACTOR = 4;
	private static final double BODY_FACTOR = 1;

	/** Minimum combined score for a page to be reported as relevant. */
	private static final double RELEVANCE_THRESHOLD = 2;

	/** Word segmenter applied to the page title and paragraph text. */
	public Segment segment = new Segment();

	/**
	 * Reads the topic terms from the file at {@code path}.
	 *
	 * <p>Every line is split on single spaces and the non-empty pieces of all
	 * lines are returned in file order. Empty pieces (blank lines, repeated
	 * spaces) are dropped, because an empty term would match every word via
	 * {@code indexOf("")}.
	 *
	 * @param path location of the topic file
	 * @return the topic terms; empty if the file holds no terms
	 * @throws IOException if the file cannot be opened or read
	 */
	public String[] getTopic(String path) throws IOException {
		List<String> terms = new ArrayList<String>();
		BufferedReader in = new BufferedReader(new FileReader(path));
		try {
			String line;
			while ((line = in.readLine()) != null) {
				for (String term : line.split(" ")) {
					if (term.length() > 0) {
						terms.add(term);
					}
				}
			}
		} finally {
			// Always release the file handle, even when reading fails.
			in.close();
		}
		return terms.toArray(new String[terms.size()]);
	}

	/**
	 * Assigns a weight to every topic term.
	 *
	 * @param topic the topic terms
	 * @return an array parallel to {@code topic}: 1 for terms containing one of
	 *         the core keywords, 0.5 for all others
	 */
	public double[] getTopicWeight(String[] topic) {
		double[] topicWeight = new double[topic.length];
		for (int i = 0; i < topic.length; i++) {
			topicWeight[i] = containsCoreKeyword(topic[i]) ? CORE_TERM_WEIGHT
					: OTHER_TERM_WEIGHT;
		}
		return topicWeight;
	}

	/** Tells whether {@code term} contains any of the core keywords. */
	private static boolean containsCoreKeyword(String term) {
		for (String keyword : CORE_KEYWORDS) {
			if (term.indexOf(keyword) != -1) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Returns the distinct values of {@code count} in ascending order.
	 *
	 * @param count the values to process; not modified
	 * @return a new sorted array without duplicates
	 */
	public int[] sortAndDelRepeat(int[] count) {
		// A TreeSet removes duplicates and keeps the values sorted in one pass.
		Set<Integer> distinct = new TreeSet<Integer>();
		for (int value : count) {
			distinct.add(value);
		}
		int[] array = new int[distinct.size()];
		int index = 0;
		for (int value : distinct) {
			array[index++] = value;
		}
		return array;
	}

	/**
	 * Removes duplicate words, keeping the first occurrence of each word in its
	 * original position.
	 *
	 * @param segment the segmented words; not modified
	 * @return a new array holding each distinct word once
	 */
	public String[] delRepeat(String[] segment) {
		Set<String> distinct = new LinkedHashSet<String>(Arrays.asList(segment));
		return distinct.toArray(new String[distinct.size()]);
	}

	/**
	 * Removes duplicate elements from a list of URLs, keeping the first
	 * occurrence of each element in its original position.
	 *
	 * @param url the elements to process; not modified
	 * @return a new vector holding each distinct element once
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	public Vector delRepeat(Vector url) {
		// The element type is not constrained here, so duplicates are detected
		// with equals() only (via contains) rather than with a hash-based set.
		Vector v = new Vector();
		for (int i = 0; i < url.size(); i++) {
			Object element = url.get(i);
			if (!v.contains(element)) {
				v.add(element);
			}
		}
		return v;
	}

	/**
	 * Downloads the page at {@code url} and feeds it through the HTML parser.
	 *
	 * @param url address of the page
	 * @return the populated parser callback, or {@code null} when no page body
	 *         was obtained
	 * @throws IOException if downloading or parsing fails
	 */
	public Parser getParser(String url) throws IOException {
		HTMLEditorKit.Parser parser = new HtmlParser().getParser();
		Parser p = new Parser(url);
		HTTP http = new HTTP();
		String s = http.getBody(url);
		// Check before parsing: there is nothing to parse in an empty body, and
		// a null body would make StringReader throw.
		if (s == null || s.equals("")) {
			return null;
		}
		parser.parse(new StringReader(s), p, true);
		return p;
	}

	/**
	 * Judges whether the page at {@code url} is relevant to the topic.
	 *
	 * <p>The score is 4 times the title similarity plus the body similarity;
	 * the page is relevant when the score reaches 2. A text in which no word
	 * matches the topic contributes a similarity of 0.
	 *
	 * @param url address of the page
	 * @return 1 if relevant, 0 if not relevant, -1 if no page body could be
	 *         obtained (the original author's note attributes this to a
	 *         connection timeout)
	 * @throws Exception if downloading, parsing or reading the topic file fails
	 */
	public int urlRelative(String url) throws Exception {
		Parser p = getParser(url);
		if (p == null) {
			return -1;
		}
		System.out.println(url);

		String[] topic = getTopic(TOPIC_FILE);
		double[] topicWeight = getTopicWeight(topic);

		// Raw title and paragraph text, and their segmented forms.
		String titleStr = p.getURLtitle();
		String bodyStr = p.getParagraphText();
		String titleStrSeg = segment.segment(titleStr);
		String bodyStrSeg = segment.segment(bodyStr);

		double[] titleWeight = weightVector(titleStr, titleStrSeg, topic);
		double[] bodyWeight = weightVector(bodyStr, bodyStrSeg, topic);

		// Dot products and squared norms for the vector space model formula.
		double titleDot = 0, bodyDot = 0, topicSq = 0, titleSq = 0, bodySq = 0;
		for (int i = 0; i < topic.length; i++) {
			titleDot += topicWeight[i] * titleWeight[i];
			bodyDot += topicWeight[i] * bodyWeight[i];
			topicSq += topicWeight[i] * topicWeight[i];
			titleSq += titleWeight[i] * titleWeight[i];
			bodySq += bodyWeight[i] * bodyWeight[i];
		}
		double topicRela = TITLE_FACTOR * cosine(titleDot, topicSq, titleSq)
				+ BODY_FACTOR * cosine(bodyDot, topicSq, bodySq);
		return topicRela >= RELEVANCE_THRESHOLD ? 1 : 0;
	}

	/**
	 * Builds the weight vector (one component per topic term) for one text.
	 *
	 * @param text the raw text in which word occurrences are counted
	 * @param segmented the space-separated segmentation of {@code text}
	 * @param topic the topic terms
	 * @return a vector of length {@code topic.length}; all zeros when the
	 *         segmentation is empty
	 */
	private double[] weightVector(String text, String segmented, String[] topic) {
		if (segmented.equals("")) {
			return new double[topic.length];
		}
		String[] words = delRepeat(segmented.split(" "));
		int[] counts = countTopicWordOccurrences(words, text, topic);
		return rankWeights(counts, topic.length);
	}

	/**
	 * Counts, for every word that matches the topic, how often it occurs in
	 * {@code text} (non-overlapping occurrences). Words that match no topic
	 * term keep a count of 0.
	 */
	private static int[] countTopicWordOccurrences(String[] words, String text,
			String[] topic) {
		int[] counts = new int[words.length];
		for (int i = 0; i < words.length; i++) {
			String word = words[i];
			// An empty word is found at every position without ever advancing,
			// so counting it would never terminate.
			if (word.length() == 0 || !matchesAnyTopic(word, topic)) {
				continue;
			}
			int from = 0;
			while ((from = text.indexOf(word, from)) != -1) {
				counts[i]++;
				from += word.length();
			}
		}
		return counts;
	}

	/**
	 * Tells whether {@code word} contains a topic term or is contained in one.
	 */
	private static boolean matchesAnyTopic(String word, String[] topic) {
		for (String term : topic) {
			if (word.indexOf(term) != -1 || term.indexOf(word) != -1) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Converts occurrence counts into a weight vector of length
	 * {@code dimension}.
	 *
	 * <p>The distinct counts are ranked in ascending order; a non-zero count at
	 * rank {@code r} (0-based) out of {@code k} distinct values gets the weight
	 * {@code (r + 1) / k}, a zero count gets 0. When there are more words than
	 * dimensions, the vector is filled with the weights of the
	 * highest-ranked words first; otherwise word {@code i} fills component
	 * {@code i} and the remaining components stay 0.
	 */
	private double[] rankWeights(int[] counts, int dimension) {
		double[] weights = new double[dimension];
		int[] distinctSorted = sortAndDelRepeat(counts);
		double step = (double) 1 / distinctSorted.length;

		if (dimension < counts.length) {
			// More words than topic terms: keep only the best-ranked words.
			int next = 0;
			for (int rank = distinctSorted.length - 1; rank >= 0
					&& next < dimension; rank--) {
				for (int j = 0; j < counts.length && next < dimension; j++) {
					if (counts[j] == distinctSorted[rank]) {
						weights[next++] = counts[j] == 0 ? 0.0 : step
								* (rank + 1);
					}
				}
			}
		} else {
			for (int i = 0; i < counts.length; i++) {
				if (counts[i] == 0) {
					continue; // component keeps its default of 0.0
				}
				// Every count is present in distinctSorted, which is sorted
				// and duplicate-free, so the search yields its rank.
				int rank = Arrays.binarySearch(distinctSorted, counts[i]);
				weights[i] = step * (rank + 1);
			}
		}
		return weights;
	}

	/**
	 * Cosine similarity from a dot product and two squared norms.
	 *
	 * @return {@code dot / (sqrt(sqA) * sqrt(sqB))}, or 0 when either norm is
	 *         zero (avoids a NaN that would poison the combined score)
	 */
	private static double cosine(double dot, double sqA, double sqB) {
		double denominator = Math.sqrt(sqA) * Math.sqrt(sqB);
		return denominator == 0.0 ? 0.0 : dot / denominator;
	}
}
💿 文件大小 3365 K
👤 上传用户 black001
📂 所属分类 Java编程
🏷️ 相关标签

#网络爬虫 #页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -