// spider.java
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
* This class implements a reusable web spider.
*
* @author Kelven.JU
*/
public class Spider {
/**
* A collection of URLs that resulted in an error
*/
protected Collection workloadError = new ArrayList(3);
/**
* A collection of URLs that are waiting to be processed
*/
protected Collection workloadWaiting = new ArrayList(3);
/**
* A collection of URLs that were processed
*/
protected Collection workloadProcessed = new ArrayList(3);
//URLs forbidden by their host's robots.txt
protected Collection workloadForbiden = new ArrayList(3);
//file-type extensions the spider cannot parse
protected final String[] mediaFileType = new String[] {"mp3","wav","wma","rar","rm","rmvb","ram","pdf"};
protected Collection workloadFileType = new ArrayList(8);
//hosts whose robots.txt has already been checked
protected Collection workloadCheckedHost = new ArrayList(3);
//keyword bag: vocabulary and per-word statistics built from the training documents
protected ArrayList wordList = new ArrayList(3);
protected ArrayList wordTf = new ArrayList(3);
protected ArrayList wordIdf = new ArrayList(3);
protected ArrayList wordWeight = new ArrayList(3);
double scoreOfPage=0;
/**
* The class that the spider should report its URLs to
*/
protected ISpiderReportable report;
/**
* A flag that indicates whether this process
* should be canceled
*/
protected boolean cancel = false;
//search keyword
protected String keyWord;
//training URLs
protected ArrayList trainingUrlArrayList = new ArrayList(3);
//polite-crawl options (robots.txt and meta-tag checks)
protected boolean checkRobotsOption = false;
protected boolean checkMetaTagOption = false;
private BufferedWriter fileOut;
private BufferedWriter resultOut;
/**
* The constructor.
*
* @param report A class that implements the ISpiderReportable
* interface, which will receive the information the
* spider finds.
*/
public Spider(ISpiderReportable report)
{
this.report = report;
try{
resultOut = new BufferedWriter ( new FileWriter ("spiderResults.log"));
fileOut = new BufferedWriter ( new FileWriter ("spiderEvents.log"));
}catch(IOException e)
{
System.out.println("new BufferedWriter error:[Spider.java 82]!");
}
for(int i = 0; i<mediaFileType.length;i++)
workloadFileType.add(mediaFileType[i]);
}
/**
* Get the URLs that resulted in an error.
*
* @return A collection of URLs.
*/
public Collection getWorkloadError()
{
return workloadError;
}
/**
* Get the URLs that are waiting to be processed.
* Add at least one URL to this collection to give the
* spider a starting point (see the usage sketch below).
*
* @return A collection of URLs.
*/
public Collection getWorkloadWaiting()
{
return workloadWaiting;
}
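// A minimal usage sketch (hypothetical names: assumes an ISpiderReportable
// implementation called MyReport; the crawl entry point itself is outside
// this excerpt):
//
//   Spider spider = new Spider(new MyReport());
//   spider.addURL(new URL("http://example.com/")); // seeds the waiting workload
//   // ...then start the crawl loop, which drains getWorkloadWaiting()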
/**
* Get the URLs that were processed by this spider.
*
* @return A collection of URLs.
*/
public Collection getWorkloadProcessed()
{
return workloadProcessed;
}
// get the URLs that were forbidden by their host's robots.txt
public Collection getWorkloadForbiden()
{
return workloadForbiden;
}
// get the hosts that have already been checked by this spider
public Collection getWorkloadCheckedHost()
{
return workloadCheckedHost;
}
/**
* Clear all of the workloads.
*/
public void clear()
{
getWorkloadError().clear();
getWorkloadWaiting().clear();
getWorkloadProcessed().clear();
getWorkloadForbiden().clear();
getWorkloadCheckedHost().clear();
}
/**
* Set a flag that will cause the begin
* method to return before it is done.
*/
public void cancel()
{
cancel = true;
}
/**
* Add a URL for processing.
*
* @param url The URL to add to the waiting workload.
*/
public void addURL(URL url)
{
if ( getWorkloadWaiting().contains(url) )
return;
if ( getWorkloadError().contains(url) )
return;
if ( getWorkloadProcessed().contains(url) )
return;
if ( getWorkloadForbiden().contains(url))
return;
log("Adding to workload: " + url );
getWorkloadWaiting().add(url);
}
//set the search keyword
public void setKeyWord(String keyWordText){
keyWord = keyWordText;
}
//set the politeness-check options
public void setCheckRobots(boolean opt){
checkRobotsOption = opt;
}
public void setCheckMetaTag(boolean opt){
checkMetaTagOption = opt;
}
public void setTrainingUrl(String args){
String trainingUrlString = args;
int dotIndex=-1;
URL trainingUrl;
BufferedReader br;
String s="";
StringBuffer sb=new StringBuffer("");
BufferedWriter bw;
dotIndex=trainingUrlString.indexOf(";");
while( dotIndex!=-1)
{
try{
trainingUrl=new URL(trainingUrlString.substring(0, dotIndex));
trainingUrlArrayList.add(trainingUrl);
log("Training URL:"+trainingUrl.toString());
//output training text file
/******************************
bw=new BufferedWriter (new FileWriter(trainingUrl.toString().replace(":","").replace("/","").replace(".","")+".train"));
br=new BufferedReader(new InputStreamReader(trainingUrl.openStream()));
while((s=br.readLine())!=null)
{
sb.append(s+"\r\n");
bw.write(sb.toString());
bw.flush();
}
br.close();
********************************/
//Parse the URL to output the text on the page
URLConnection connection2 = trainingUrl.openConnection();
InputStream is2 = connection2.getInputStream();
Reader r2 = new InputStreamReader(is2);
HTMLEditorKit.Parser parse2 = new HTMLParse().getParser();
log("Creating training file : "+trainingUrl.toString());
parse2.parse(r2,new Parser2(trainingUrl),true);
trainingUrlString=trainingUrlString.substring(dotIndex+1, trainingUrlString.length());
dotIndex=trainingUrlString.indexOf(";");
}catch (IOException e)
{
System.out.println("setTrainingUrl: failed to fetch or parse training URL: " + e.getMessage());
// advance past the bad entry so one failing URL cannot loop forever
trainingUrlString=trainingUrlString.substring(dotIndex+1);
dotIndex=trainingUrlString.indexOf(";");
}
}
this.startSegmenter("train");
}
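// Input-format sketch for setTrainingUrl (illustrative URLs): the argument is
// a ';'-separated list, and every entry must be FOLLOWED by a ';' -- the loop
// above only consumes text up to each ';', so anything after the last ';' is
// silently ignored.
//
//   spider.setTrainingUrl("http://a.example/doc1;http://b.example/doc2;");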
//Segment the training files and collect word statistics (author: Kelven.JU)
public void startSegmenter(String argv) {
Vector inputfiles = new Vector() ;
/** For word Statistic (Begin)**/
Vector wordOfOneDocument = new Vector();
Vector wordCountOfOneDocument = new Vector();
Vector wordTfValueOfOneDocument = new Vector();
Vector wordMaxCountOfOneDocument = new Vector();
ArrayList wordOfAllDocument = new ArrayList(3);
ArrayList wordCountOfAllDocument = new ArrayList(3);
ArrayList wordIdfValueOfAllDocument = new ArrayList(3);
ArrayList wordTfValueOfAllDocument = new ArrayList(3);
ArrayList wordWeightOfAllDocument = new ArrayList(3);
//int wordMaxCountOfOneDocument = -1;
ArrayList[] tmpArrayList = new ArrayList[2];
Integer tmpMaxCount;
int nDoc=0;
/** For word Statistic (End)**/
String encoding = "GBK";
int charform = 1;
boolean debug = false;
int i, j,k;
inputfiles.add(argv);
if (inputfiles.size() == 0) {
System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
//printHelp();
}
System.err.println("Loading segmenter word list. One moment please.");
segmenter mainsegmenter = new segmenter(charform, true);
System.err.println("Total keys " + mainsegmenter.zhwords.size());
File tmpfile;
String dirfiles[];
String fileName;
for (i = 0; i < inputfiles.size(); i++) {
tmpfile = new File((String)inputfiles.get(i));
if (tmpfile.isDirectory()) {
dirfiles = tmpfile.list();
if (dirfiles != null) {
for (j = 0; j < dirfiles.length; j++) {
if(!dirfiles[j].endsWith(".train")){
// delete non-.train files on exit; build the path relative to the parent directory so it resolves correctly
tmpfile=new File((String)inputfiles.get(i), dirfiles[j]);
tmpfile.deleteOnExit();
}
else{
inputfiles.add((String)inputfiles.get(i) + File.separator +
dirfiles[j]);
}
}
}
continue;// a directory only expands the work list; the files appended above are picked up by later iterations of this same for loop
}
nDoc++;
tmpArrayList[0] = new ArrayList(3);
tmpArrayList[1] = new ArrayList(3);
wordOfOneDocument.add(tmpArrayList[0]);
wordCountOfOneDocument.add(tmpArrayList[1]);
//mainsegmenter.setWordMaxCountOfOneDocument(tmpMaxCount);
mainsegmenter.setWordOfOneDocument(tmpArrayList[0]);
mainsegmenter.setWordCountOfOneDocument(tmpArrayList[1]);
mainsegmenter.setWordCountOfAllDocument(wordCountOfAllDocument);
mainsegmenter.setWordOfAllDocument(wordOfAllDocument);
System.err.println("Segmenting " + inputfiles.get(i) +
" with encoding " + encoding);
System.err.println("**************"+(String)inputfiles.get(i));
mainsegmenter.segmentFile((String)inputfiles.get(i), encoding);
//mainsegmenter.outputWorkCount();
tmpMaxCount = new Integer(mainsegmenter.getWordCountMaxOfOneDocument());
wordMaxCountOfOneDocument.add(tmpMaxCount);
wordTfValueOfOneDocument.add(mainsegmenter.getTfValue((String)inputfiles.get(i)));
System.out.println("Max Count: "+wordMaxCountOfOneDocument.get(nDoc-1));
System.out.println("本文档词库数:"+((ArrayList)wordCountOfOneDocument.get(nDoc-1)).size());
}
//compute the IDF value of each word
//*******************************************
for(int idfi=0; idfi<wordOfAllDocument.size();idfi++){
int ni=0;// how many documents contain the word
String tmpString=(String)wordOfAllDocument.get(idfi);
for(int idfj=0; idfj<nDoc; idfj++){
if(((ArrayList)wordOfOneDocument.get(idfj)).contains(tmpString))
ni++;
}
//System.out.println("ni = "+ni);
wordIdfValueOfAllDocument.add(1/(Math.log((double)nDoc/ni)+1/Math.log(nDoc-1)));// inverted IDF: a word that appears in more of the topic's training documents counts as MORE important; the (double) cast keeps nDoc/ni from truncating to an int
if(debug)
System.out.println(tmpString+":"+wordIdfValueOfAllDocument.get(idfi));
}
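// Worked example of the inverted-IDF formula above (illustrative numbers):
// with nDoc = 10 training documents and a word occurring in ni = 5 of them,
//   idf = 1 / (ln(10/5) + 1/ln(10-1))
//       = 1 / (0.6931 + 0.4551)
//       ≈ 0.871
// A larger ni shrinks ln(nDoc/ni), so idf GROWS with document frequency --
// the opposite of the classic TF-IDF convention, as noted above.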
/**********************************************/
//combine the TF and IDF values into a single weight: W = Avg(tf) * IDF
//********************************************
for(int tfi=0; tfi<wordOfAllDocument.size();tfi++){
int tmpIndex=-1;//index of a word in one document
int tni=0;// how many documents contain the word
double tfall=0.0;
String tmpString=(String)wordOfAllDocument.get(tfi);
for(int tfj=0; tfj<nDoc; tfj++){
if((tmpIndex=((ArrayList)wordOfOneDocument.get(tfj)).indexOf(tmpString))!=-1){
tni++;
tfall=tfall+((Double)(((ArrayList)wordTfValueOfOneDocument.get(tfj)).get(tmpIndex))).doubleValue();
}
}
wordTfValueOfAllDocument.add(tfall/tni);
wordWeightOfAllDocument.add((tfall/tni)*(((Double)wordIdfValueOfAllDocument.get(tfi)).doubleValue()));
//System.out.println(tmpString+":Avg(TF):"+wordTfValueOfAllDocument.get(tfi)+" IDF:"+(((Double)wordIdfValueOfAllDocument.get(tfi)).doubleValue())+" Weight: "+wordWeightOfAllDocument.get(tfi));
}
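// Continuing the worked example: if the word's TF values across the 5
// documents containing it average to Avg(tf) = 0.02, its global weight is
//   W = Avg(tf) * idf ≈ 0.02 * 0.871 ≈ 0.0174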
System.out.println("wordTfValueOfAllDocument.size = "+wordTfValueOfAllDocument.size());
/**********************************************/
//output global data
System.out.println(wordOfAllDocument.contains("金融"));// debug: is the sample word "金融" ("finance") in the vocabulary?
System.out.println("N (total number of topic documents): "+nDoc);
System.out.println("Total topic vocabulary size: "+wordOfAllDocument.size());
wordList=wordOfAllDocument;
wordTf=wordTfValueOfAllDocument;
wordIdf=wordIdfValueOfAllDocument;
wordWeight=wordWeightOfAllDocument;
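// At this point the four keyword-bag fields are parallel lists:
// wordList.get(i) is a word, and wordTf.get(i), wordIdf.get(i) and
// wordWeight.get(i) hold its Avg(tf), inverted IDF and combined weight.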
}
//Segment the current URL's content file and collect statistics (author: Kelven.JU)
public double getPageScore(String argv)
{
//definition
Vector inputfiles = new Vector() ;
ArrayList wordOfPage = new ArrayList(3);
ArrayList wordCountOfPage = new ArrayList(3);
String encoding = "GBK";
int charform = 1;
//sanity check: make sure an input file name was supplied
inputfiles.add(argv);
if (inputfiles.size() == 0) {
System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
return -1.0;
}
//Load word list
System.err.println("Loading segmenter word list. One moment please.");
segmenter mainsegmenter2 = new segmenter(charform, true);
System.err.println("Total keys " + mainsegmenter2.zhwords.size());
//Set the Object into mainsegmenter2
mainsegmenter2.setWordOfOneDocument(wordOfPage);
mainsegmenter2.setWordCountOfOneDocument(wordCountOfPage);
//Segment file and output result
System.err.println("Segmenting tmp/" + argv+ " with encoding " + encoding);
File tmpDirPath=new File("tmp");
//File tmpTempFile;//tmp文件夹内的文件
//File tmpTempFile = tmpTmpFile[0];
System.out.println("************"+"tmp\\"+argv);
mainsegmenter2.segmentFile("tmp\\"+argv, encoding);
System.out.println("本文档词库数:"+wordCountOfPage.size());
return 0.0;// scoring itself is not implemented here -- see the sketch below
}
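// getPageScore above stops after segmentation and always returns 0.0. The
// helper below is a minimal scoring sketch, NOT the original algorithm: it
// assumes the segmenter stores Integer counts, and sums each page word's
// trained weight scaled by its occurrence count.
private double scorePageSketch(ArrayList wordOfPage, ArrayList wordCountOfPage)
{
double score = 0.0;
for (int i = 0; i < wordOfPage.size(); i++) {
// look the page word up in the trained vocabulary
int idx = wordList.indexOf(wordOfPage.get(i));
if (idx != -1)
score += ((Integer) wordCountOfPage.get(i)).intValue()
* ((Double) wordWeight.get(idx)).doubleValue();
}
return score;
}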
/** Called to check robots.txt and report forbidden URLs **/
public void checkHostRobots(URL url)
{
//log("checkHostRobots:"+url.getProtocol()+"://"+url.getHost()+"/");
if(getWorkloadCheckedHost().contains(url.getHost()))
{
log("Robots.txt has been checked on "+url.getHost().toString());
return;
}
getWorkloadCheckedHost().add(url.getHost());