📄 relative.java
字号:
package com.relative;
import com.parser.*;
import com.segment.*;
import java.io.*;
import java.util.*;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Computes how relevant a web page is to a configured set of topic terms.
 *
 * <p>The page title and the page paragraph text are each turned into a weight
 * vector with one component per topic term; each vector is compared to the
 * topic weight vector with a cosine similarity, and the two similarities are
 * combined into a single score (title counts four times as much as body).
 */
public class Relative {
    /** File the topic terms are read from by {@link #urlRelative(String)}. */
    private static final String TOPIC_FILE = "./topic/topic.txt";
    /** Multiplier applied to the title/topic similarity. */
    private static final double TITLE_FACTOR = 4;
    /** Multiplier applied to the body/topic similarity. */
    private static final double BODY_FACTOR = 1;
    /** Minimum combined score for a page to be reported as relevant. */
    private static final double RELEVANCE_THRESHOLD = 2;

    /** Word segmenter used to tokenize the title and body text. */
    public Segment segment = new Segment();

    /**
     * Reads the topic terms stored in the file at {@code path}.
     *
     * <p>All lines are joined with single spaces and the result is split on
     * single spaces, so a line holding several space-separated words yields
     * several terms. Blank lines or doubled spaces yield empty terms, and an
     * empty file yields a single empty term.
     *
     * @param path location of the topic file
     * @return the topic terms in file order
     * @throws IOException if the file cannot be opened or read
     */
    public String[] getTopic(String path) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(path));
        try {
            StringBuilder joined = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                joined.append(line).append(' ');
            }
            return joined.toString().split(" ");
        } finally {
            // Always release the file handle, including when reading fails.
            in.close();
        }
    }

    /**
     * Assigns a weight to every topic term.
     *
     * @param topic the topic terms
     * @return an array parallel to {@code topic}: 1 for terms containing
     *         "奥运" or "火炬", 0.5 for every other term
     */
    public double[] getTopicWeight(String[] topic) {
        double[] topicWeight = new double[topic.length];
        for (int i = 0; i < topic.length; i++) {
            boolean isCoreTerm = topic[i].contains("奥运") || topic[i].contains("火炬");
            topicWeight[i] = isCoreTerm ? 1 : 0.5;
        }
        return topicWeight;
    }

    /**
     * Removes duplicate values and sorts the remainder.
     *
     * @param count the values to process; not modified
     * @return the distinct values of {@code count} in ascending order
     */
    public int[] sortAndDelRepeat(int[] count) {
        // A TreeSet both drops duplicates and keeps ascending order.
        Set<Integer> distinct = new TreeSet<Integer>();
        for (int value : count) {
            distinct.add(value);
        }
        int[] array = new int[distinct.size()];
        int index = 0;
        for (int value : distinct) {
            array[index++] = value;
        }
        return array;
    }

    /**
     * Removes duplicate tokens from a segmentation result.
     *
     * @param segment the tokens to process; not modified
     * @return the distinct tokens, each at the position of its first occurrence
     */
    public String[] delRepeat(String[] segment) {
        // LinkedHashSet keeps first-occurrence order while dropping repeats.
        Set<String> distinct = new LinkedHashSet<String>(Arrays.asList(segment));
        return distinct.toArray(new String[distinct.size()]);
    }

    /**
     * Removes duplicate entries from a collection of URLs.
     *
     * @param url the entries to process; not modified
     * @return a new vector holding the distinct entries in first-occurrence order
     */
    public Vector delRepeat(Vector url) {
        Set<Object> distinct = new LinkedHashSet<Object>();
        for (Object entry : url) {
            distinct.add(entry);
        }
        return new Vector<Object>(distinct);
    }

    /**
     * Downloads the page at {@code url} and feeds it through the HTML parser.
     *
     * @param url address of the page to fetch
     * @return the callback that received the parse events, or {@code null}
     *         when no page body was obtained
     * @throws IOException if fetching or parsing fails
     */
    public Parser getParser(String url) throws IOException {
        HTMLEditorKit.Parser parser = new HtmlParser().getParser();
        Parser p = new Parser(url);
        HTTP http = new HTTP();
        String s = http.getBody(url);
        // Check before parsing: an empty body has nothing to parse, and a
        // null body would otherwise fail inside StringReader.
        if (s == null || s.equals("")) {
            return null;
        }
        parser.parse(new StringReader(s), p, true);
        return p;
    }

    /**
     * Decides whether the page at {@code url} is relevant to the topic terms
     * stored in {@code ./topic/topic.txt}.
     *
     * <p>The score is {@code 4 * cos(topic, title) + 1 * cos(topic, body)},
     * where a vector without any non-zero component contributes a similarity
     * of 0. The URL is printed to standard output once the page was fetched.
     *
     * @param url address of the page to evaluate
     * @return 1 if the score is at least 2, 0 if it is lower, and -1 when
     *         {@link #getParser(String)} returned {@code null} (described in
     *         the original notes as a connection timeout)
     * @throws Exception if fetching, parsing, reading the topic file or
     *         segmenting fails
     */
    public int urlRelative(String url) throws Exception {
        Parser p = getParser(url);
        if (p == null) {
            return -1;
        }
        System.out.println(url);

        String[] topic = getTopic(TOPIC_FILE);
        double[] topicWeight = getTopicWeight(topic);

        // Raw title and paragraph text, followed by their segmented forms.
        String titleStr = p.getURLtitle();
        String bodyStr = p.getParagraphText();
        String titleStrSeg = segment.segment(titleStr);
        String bodyStrSeg = segment.segment(bodyStr);

        double[] titleWeight = rankWeights(titleStr, titleStrSeg, topic);
        double[] bodyWeight = rankWeights(bodyStr, bodyStrSeg, topic);

        double topicRela = TITLE_FACTOR * cosine(topicWeight, titleWeight)
                + BODY_FACTOR * cosine(topicWeight, bodyWeight);
        return topicRela >= RELEVANCE_THRESHOLD ? 1 : 0;
    }

    /**
     * Builds the weight vector (one component per topic term) for one piece
     * of text.
     *
     * <p>Every distinct word of {@code segmented} that matches a topic term
     * gets the number of times it occurs in {@code text}; other words get 0.
     * The distinct counts are ranked in ascending order and a word's weight
     * is {@code rank / numberOfDistinctCounts} (0 for a count of 0). When
     * there are more words than topic terms, the vector is filled with the
     * weights in descending count order until it is full; otherwise the
     * weight of word {@code i} is stored at component {@code i} and the
     * remaining components stay 0.
     *
     * @param text the raw text
     * @param segmented the same text after segmentation, words separated by
     *        single spaces
     * @param topic the topic terms
     * @return a vector of length {@code topic.length}
     */
    private double[] rankWeights(String text, String segmented, String[] topic) {
        double[] weights = new double[topic.length];
        if (segmented.equals("")) {
            return weights;
        }

        String[] words = delRepeat(segmented.split(" "));
        int[] counts = new int[words.length];
        for (int w = 0; w < words.length; w++) {
            if (matchesAnyTopic(words[w], topic)) {
                counts[w] = countOccurrences(text, words[w]);
            }
        }

        int[] ranks = sortAndDelRepeat(counts);
        double step = (double) 1 / ranks.length;

        if (topic.length < words.length) {
            // More words than components: keep the highest-ranked ones.
            int next = 0;
            for (int r = ranks.length - 1; r >= 0 && next < topic.length; r--) {
                for (int w = 0; w < counts.length && next < topic.length; w++) {
                    if (counts[w] == ranks[r]) {
                        weights[next++] = counts[w] == 0 ? 0.0 : step * (r + 1);
                    }
                }
            }
        } else {
            for (int w = 0; w < words.length; w++) {
                if (counts[w] != 0) {
                    // ranks is sorted and duplicate-free, so the search
                    // returns the unique rank position of this count.
                    weights[w] = step * (Arrays.binarySearch(ranks, counts[w]) + 1);
                }
            }
        }
        return weights;
    }

    /**
     * Tells whether {@code word} contains, or is contained in, any topic term.
     * Empty words and empty topic terms never match: an empty string is a
     * substring of everything and would make every comparison succeed.
     */
    private static boolean matchesAnyTopic(String word, String[] topic) {
        if (word.length() == 0) {
            return false;
        }
        for (String term : topic) {
            if (term.length() == 0) {
                continue;
            }
            if (word.indexOf(term) != -1 || term.indexOf(word) != -1) {
                return true;
            }
        }
        return false;
    }

    /** Counts the non-overlapping occurrences of a non-empty word in a text. */
    private static int countOccurrences(String text, String word) {
        int occurrences = 0;
        int from = text.indexOf(word);
        while (from != -1) {
            occurrences++;
            from = text.indexOf(word, from + word.length());
        }
        return occurrences;
    }

    /**
     * Cosine similarity of two vectors of equal length; 0 when either vector
     * has no non-zero component, so that the result is never NaN.
     */
    private static double cosine(double[] a, double[] b) {
        double dot = 0;
        double squaredA = 0;
        double squaredB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            squaredA += a[i] * a[i];
            squaredB += b[i] * b[i];
        }
        double denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB);
        return denominator == 0 ? 0.0 : dot / denominator;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -