// spider.java
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
* This class implements a reusable web spider.
*
* @author Kelven.JU
*/
public class Spider {
/**
* A collection of URLs that resulted in an error
*/
protected Collection workloadError = new ArrayList(3);
/**
* A collection of URLs that are waiting to be processed
*/
protected Collection workloadWaiting = new ArrayList(3);
/**
* A collection of URLs that were processed
*/
protected Collection workloadProcessed = new ArrayList(3);
//URLs forbidden by their host's robots.txt
protected Collection workloadForbiden = new ArrayList(3);
//file-type extensions the spider cannot parse
protected final String[] mediaFileType = new String[] {"mp3","wav","wma","rar","rm","rmvb","ram","pdf"};
protected Collection workloadFileType = new ArrayList(8);
//hosts whose robots.txt has already been checked
protected Collection workloadCheckedHost = new ArrayList(3);
//keyword bag: vocabulary and per-word statistics built from the training documents
protected ArrayList wordList = new ArrayList(3);
protected ArrayList wordTf = new ArrayList(3);
protected ArrayList wordIdf = new ArrayList(3);
protected ArrayList wordWeight = new ArrayList(3);
double scoreOfPage=0;
/**
* The class that the spider should report its URLs to
*/
protected ISpiderReportable report;
/**
* A flag that indicates whether this process
* should be canceled
*/
protected boolean cancel = false;
//search keyword
protected String keyWord;
//training URLs
protected ArrayList trainingUrlArrayList = new ArrayList(3);
//polite-crawl options (robots.txt and meta-tag checks)
protected boolean checkRobotsOption = false;
protected boolean checkMetaTagOption = false;
private BufferedWriter fileOut;
private BufferedWriter resultOut;
/**
* The constructor.
*
* @param report A class that implements the ISpiderReportable
* interface, which will receive the information the
* spider finds.
*/
public Spider(ISpiderReportable report)
{
this.report = report;
try{
resultOut = new BufferedWriter ( new FileWriter ("spiderResults.log"));
fileOut = new BufferedWriter ( new FileWriter ("spiderEvents.log"));
}catch(IOException e)
{
System.out.println("new BufferedWriter error:[Spider.java 82]!");
}
for(int i = 0; i<mediaFileType.length;i++)
workloadFileType.add(mediaFileType[i]);
}
/**
* Get the URLs that resulted in an error.
*
* @return A collection of URLs.
*/
public Collection getWorkloadError()
{
return workloadError;
}
/**
* Get the URLs that are waiting to be processed.
* Add at least one URL to this collection to give the
* spider a starting point (see the usage sketch below).
*
* @return A collection of URLs.
*/
public Collection getWorkloadWaiting()
{
return workloadWaiting;
}
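// A minimal usage sketch (hypothetical names: assumes an ISpiderReportable
// implementation called MyReport; the crawl entry point itself is outside
// this excerpt):
//
//   Spider spider = new Spider(new MyReport());
//   spider.addURL(new URL("http://example.com/")); // seeds the waiting workload
//   // ...then start the crawl loop, which drains getWorkloadWaiting()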
/**
* Get the URLs that were processed by this spider.
*
* @return A collection of URLs.
*/
public Collection getWorkloadProcessed()
{
return workloadProcessed;
}
// get the URLs that were forbidden by their host's robots.txt
public Collection getWorkloadForbiden()
{
return workloadForbiden;
}
// get the hosts that have already been checked by this spider
public Collection getWorkloadCheckedHost()
{
return workloadCheckedHost;
}
/**
* Clear all of the workloads.
*/
public void clear()
{
getWorkloadError().clear();
getWorkloadWaiting().clear();
getWorkloadProcessed().clear();
getWorkloadForbiden().clear();
getWorkloadCheckedHost().clear();
}
/**
* Set a flag that will cause the begin
* method to return before it is done.
*/
public void cancel()
{
cancel = true;
}
/**
* Add a URL for processing.
*
* @param url The URL to add to the waiting workload.
*/
public void addURL(URL url)
{
if ( getWorkloadWaiting().contains(url) )
return;
if ( getWorkloadError().contains(url) )
return;
if ( getWorkloadProcessed().contains(url) )
return;
if ( getWorkloadForbiden().contains(url))
return;
log("Adding to workload: " + url );
getWorkloadWaiting().add(url);
}
//set the search keyword
public void setKeyWord(String keyWordText){
keyWord = keyWordText;
}
//set the politeness-check options
public void setCheckRobots(boolean opt){
checkRobotsOption = opt;
}
public void setCheckMetaTag(boolean opt){
checkMetaTagOption = opt;
}
public void setTrainingUrl(String args){
String trainingUrlString = args;
int dotIndex=-1;
URL trainingUrl;
BufferedReader br;
String s="";
StringBuffer sb=new StringBuffer("");
BufferedWriter bw;
dotIndex=trainingUrlString.indexOf(";");
while( dotIndex!=-1)
{
try{
trainingUrl=new URL(trainingUrlString.substring(0, dotIndex));
trainingUrlArrayList.add(trainingUrl);
log("Training URL:"+trainingUrl.toString());
//output training text file
/******************************
bw=new BufferedWriter (new FileWriter(trainingUrl.toString().replace(":","").replace("/","").replace(".","")+".train"));
br=new BufferedReader(new InputStreamReader(trainingUrl.openStream()));
while((s=br.readLine())!=null)
{
sb.append(s+"\r\n");
bw.write(sb.toString());
bw.flush();
}
br.close();
********************************/
//Parse the URL to output the text on the page
URLConnection connection2 = trainingUrl.openConnection();
InputStream is2 = connection2.getInputStream();
Reader r2 = new InputStreamReader(is2);
HTMLEditorKit.Parser parse2 = new HTMLParse().getParser();
log("Creating training file : "+trainingUrl.toString());
parse2.parse(r2,new Parser2(trainingUrl),true);
trainingUrlString=trainingUrlString.substring(dotIndex+1, trainingUrlString.length());
dotIndex=trainingUrlString.indexOf(";");
}catch (IOException e)
{
System.out.println("setTrainingUrl: failed to fetch or parse training URL: " + e.getMessage());
// advance past the bad entry so one failing URL cannot loop forever
trainingUrlString=trainingUrlString.substring(dotIndex+1);
dotIndex=trainingUrlString.indexOf(";");
}
}
this.startSegmenter("train");
}
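// Input-format sketch for setTrainingUrl (illustrative URLs): the argument is
// a ';'-separated list, and every entry must be FOLLOWED by a ';' -- the loop
// above only consumes text up to each ';', so anything after the last ';' is
// silently ignored.
//
//   spider.setTrainingUrl("http://a.example/doc1;http://b.example/doc2;");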
//Segment the training files and collect word statistics (author: Kelven.JU)
public void startSegmenter(String argv) {
Vector inputfiles = new Vector() ;
/** For word Statistic (Begin)**/
Vector wordOfOneDocument = new Vector();
Vector wordCountOfOneDocument = new Vector();
Vector wordTfValueOfOneDocument = new Vector();
Vector wordMaxCountOfOneDocument = new Vector();
ArrayList wordOfAllDocument = new ArrayList(3);
ArrayList wordCountOfAllDocument = new ArrayList(3);
ArrayList wordIdfValueOfAllDocument = new ArrayList(3);
ArrayList wordTfValueOfAllDocument = new ArrayList(3);
ArrayList wordWeightOfAllDocument = new ArrayList(3);
//int wordMaxCountOfOneDocument = -1;
ArrayList[] tmpArrayList = new ArrayList[2];
Integer tmpMaxCount;
int nDoc=0;
/** For word Statistic (End)**/
String encoding = "GBK";
int charform = 1;
boolean debug = false;
int i, j,k;
inputfiles.add(argv);
if (inputfiles.size() == 0) {
System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
//printHelp();
}
System.err.println("Loading segmenter word list. One moment please.");
segmenter mainsegmenter = new segmenter(charform, true);
System.err.println("Total keys " + mainsegmenter.zhwords.size());
File tmpfile;
String dirfiles[];
String fileName;
for (i = 0; i < inputfiles.size(); i++) {
tmpfile = new File((String)inputfiles.get(i));
if (tmpfile.isDirectory()) {
dirfiles = tmpfile.list();
if (dirfiles != null) {
for (j = 0; j < dirfiles.length; j++) {
if(!dirfiles[j].endsWith(".train")){
// delete non-.train files on exit; build the path relative to the parent directory so it resolves correctly
tmpfile=new File((String)inputfiles.get(i), dirfiles[j]);
tmpfile.deleteOnExit();
}
else{
inputfiles.add((String)inputfiles.get(i) + File.separator +
dirfiles[j]);
}
}
}
continue;// a directory only expands the work list; the files appended above are picked up by later iterations of this same for loop
}
nDoc++;
tmpArrayList[0] = new ArrayList(3);
tmpArrayList[1] = new ArrayList(3);
wordOfOneDocument.add(tmpArrayList[0]);
wordCountOfOneDocument.add(tmpArrayList[1]);
//mainsegmenter.setWordMaxCountOfOneDocument(tmpMaxCount);
mainsegmenter.setWordOfOneDocument(tmpArrayList[0]);
mainsegmenter.setWordCountOfOneDocument(tmpArrayList[1]);
mainsegmenter.setWordCountOfAllDocument(wordCountOfAllDocument);
mainsegmenter.setWordOfAllDocument(wordOfAllDocument);
System.err.println("Segmenting " + inputfiles.get(i) +
" with encoding " + encoding);
System.err.println("**************"+(String)inputfiles.get(i));
mainsegmenter.segmentFile((String)inputfiles.get(i), encoding);
//mainsegmenter.outputWorkCount();
tmpMaxCount = new Integer(mainsegmenter.getWordCountMaxOfOneDocument());
wordMaxCountOfOneDocument.add(tmpMaxCount);
wordTfValueOfOneDocument.add(mainsegmenter.getTfValue((String)inputfiles.get(i)));
System.out.println("Max Count: "+wordMaxCountOfOneDocument.get(nDoc-1));
System.out.println("本文档词库数:"+((ArrayList)wordCountOfOneDocument.get(nDoc-1)).size());
}
//compute the IDF value of each word
//*******************************************
for(int idfi=0; idfi<wordOfAllDocument.size();idfi++){
int ni=0;// how many documents contain the word
String tmpString=(String)wordOfAllDocument.get(idfi);
for(int idfj=0; idfj<nDoc; idfj++){
if(((ArrayList)wordOfOneDocument.get(idfj)).contains(tmpString))
ni++;
}
//System.out.println("ni = "+ni);
wordIdfValueOfAllDocument.add(1/(Math.log((double)nDoc/ni)+1/Math.log(nDoc-1)));// inverted IDF: a word that appears in more of the topic's training documents counts as MORE important; the (double) cast keeps nDoc/ni from truncating to an int
if(debug)
System.out.println(tmpString+":"+wordIdfValueOfAllDocument.get(idfi));
}
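// Worked example of the inverted-IDF formula above (illustrative numbers):
// with nDoc = 10 training documents and a word occurring in ni = 5 of them,
//   idf = 1 / (ln(10/5) + 1/ln(10-1))
//       = 1 / (0.6931 + 0.4551)
//       ≈ 0.871
// A larger ni shrinks ln(nDoc/ni), so idf GROWS with document frequency --
// the opposite of the classic TF-IDF convention, as noted above.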
/**********************************************/
//combine the TF and IDF values into a single weight: W = Avg(tf) * IDF
//********************************************
for(int tfi=0; tfi<wordOfAllDocument.size();tfi++){
int tmpIndex=-1;//index of a word in one document
int tni=0;// how many documents contain the word
double tfall=0.0;
String tmpString=(String)wordOfAllDocument.get(tfi);
for(int tfj=0; tfj<nDoc; tfj++){
if((tmpIndex=((ArrayList)wordOfOneDocument.get(tfj)).indexOf(tmpString))!=-1){
tni++;
tfall=tfall+((Double)(((ArrayList)wordTfValueOfOneDocument.get(tfj)).get(tmpIndex))).doubleValue();
}
}
wordTfValueOfAllDocument.add(tfall/tni);
wordWeightOfAllDocument.add((tfall/tni)*(((Double)wordIdfValueOfAllDocument.get(tfi)).doubleValue()));
//System.out.println(tmpString+":Avg(TF):"+wordTfValueOfAllDocument.get(tfi)+" IDF:"+(((Double)wordIdfValueOfAllDocument.get(tfi)).doubleValue())+" Weight: "+wordWeightOfAllDocument.get(tfi));
}
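// Continuing the worked example: if the word's TF values across the 5
// documents containing it average to Avg(tf) = 0.02, its global weight is
//   W = Avg(tf) * idf ≈ 0.02 * 0.871 ≈ 0.0174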
System.out.println("wordTfValueOfAllDocument.size = "+wordTfValueOfAllDocument.size());
/**********************************************/
//output global data
System.out.println(wordOfAllDocument.contains("金融"));// debug: is the sample word "金融" ("finance") in the vocabulary?
System.out.println("N (total number of topic documents): "+nDoc);
System.out.println("Total topic vocabulary size: "+wordOfAllDocument.size());
wordList=wordOfAllDocument;
wordTf=wordTfValueOfAllDocument;
wordIdf=wordIdfValueOfAllDocument;
wordWeight=wordWeightOfAllDocument;
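// At this point the four keyword-bag fields are parallel lists:
// wordList.get(i) is a word, and wordTf.get(i), wordIdf.get(i) and
// wordWeight.get(i) hold its Avg(tf), inverted IDF and combined weight.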
}
//Segment the current URL's content file and collect statistics (author: Kelven.JU)
public double getPageScore(String argv)
{
//definition
Vector inputfiles = new Vector() ;
ArrayList wordOfPage = new ArrayList(3);
ArrayList wordCountOfPage = new ArrayList(3);
String encoding = "GBK";
int charform = 1;
//sanity check: make sure an input file name was supplied
inputfiles.add(argv);
if (inputfiles.size() == 0) {
System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
return -1.0;
}
//Load word list
System.err.println("Loading segmenter word list. One moment please.");
segmenter mainsegmenter2 = new segmenter(charform, true);
System.err.println("Total keys " + mainsegmenter2.zhwords.size());
//Set the Object into mainsegmenter2
mainsegmenter2.setWordOfOneDocument(wordOfPage);
mainsegmenter2.setWordCountOfOneDocument(wordCountOfPage);
//Segment file and output result
System.err.println("Segmenting tmp/" + argv+ " with encoding " + encoding);
File tmpDirPath=new File("tmp");
//File tmpTempFile;//tmp文件夹内的文件
//File tmpTempFile = tmpTmpFile[0];
System.out.println("************"+"tmp\\"+argv);
mainsegmenter2.segmentFile("tmp\\"+argv, encoding);
System.out.println("本文档词库数:"+wordCountOfPage.size());
return 0.0;// scoring itself is not implemented here -- see the sketch below
}
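// getPageScore above stops after segmentation and always returns 0.0. The
// helper below is a minimal scoring sketch, NOT the original algorithm: it
// assumes the segmenter stores Integer counts, and sums each page word's
// trained weight scaled by its occurrence count.
private double scorePageSketch(ArrayList wordOfPage, ArrayList wordCountOfPage)
{
double score = 0.0;
for (int i = 0; i < wordOfPage.size(); i++) {
// look the page word up in the trained vocabulary
int idx = wordList.indexOf(wordOfPage.get(i));
if (idx != -1)
score += ((Integer) wordCountOfPage.get(i)).intValue()
* ((Double) wordWeight.get(idx)).doubleValue();
}
return score;
}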
/** Called to check robots.txt and report forbidden URLs **/
public void checkHostRobots(URL url)
{
//log("checkHostRobots:"+url.getProtocol()+"://"+url.getHost()+"/");
if(getWorkloadCheckedHost().contains(url.getHost()))
{
log("Robots.txt has been checked on "+url.getHost().toString());
return;
}
getWorkloadCheckedHost().add(url.getHost());