⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 radarspecialsearchengine.java

📁 nutch的小应用
💻 JAVA
字号:
package chapter11;

import java.io.*;
import org.htmlparser.util.*;
import org.htmlparser.Parser;
import org.htmlparser.filters.*;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.ParserException;

/**
 * Small command-line tool that reads search words from a text file (one per
 * line), queries Baidu for each word and records the result links.
 *
 * <p>For every word list {@code X} the links are written to {@code X_dsturl.txt},
 * one URL per line, and echoed together with their titles on standard output.
 */
public class RadarSpecialSearchEngine {

    /** Character encoding requested from Baidu ({@code ie=gb2312}) and used to parse the page. */
    private static final String PAGE_ENCODING = "GB2312";

    /** Query URL up to (and including) the {@code wd=} parameter that carries the search word. */
    private static final String QUERY_PREFIX =
            "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd=";

    /** Remainder of the query URL that follows the search word. */
    private static final String QUERY_SUFFIX = "&pn=0&ver=0&cl=3";

    /**
     * Entry point: processes the hard-coded word list and prints the stack
     * trace of any failure instead of terminating abnormally.
     *
     * @param args command-line arguments (unused)
     * @throws ParserException declared for backward compatibility; every
     *         exception is caught and reported inside this method
     */
    public static void main(String[] args) throws ParserException
    {
        try {
            TravelWordTable("D:\\workshop\\docs\\wordlist.txt");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks the word list line by line, runs one Baidu query per word and
     * collects the result links in {@code filename + "_dsturl.txt"}.
     *
     * <p>Blank lines are skipped. A parser failure for one word is reported and
     * does not stop the remaining words from being processed.
     *
     * @param filename path of the word list; the file is read with the
     *        platform default charset
     * @throws IOException if the word list cannot be read or the output file
     *         cannot be created
     */
    public static void TravelWordTable(String filename) throws IOException
    {
        String dstfile = filename + "_dsturl.txt";
        PrintWriter myFile = null;
        BufferedReader reader = null;
        try {
            // FileWriter creates (or truncates) the output file itself, so no
            // separate createNewFile() step is required.
            myFile = new PrintWriter(new FileWriter(dstfile));
            reader = new BufferedReader(new FileReader(filename));

            String buffer;
            while ((buffer = reader.readLine()) != null) {
                String keyword = buffer.trim();
                if (keyword.length() == 0) {
                    continue;   // nothing to search for on an empty line
                }
                // Encode the word with the charset announced in the query
                // (ie=gb2312) so spaces, '&', '#' and non-ASCII characters
                // cannot corrupt the URL.
                String url = QUERY_PREFIX
                        + java.net.URLEncoder.encode(keyword, PAGE_ENCODING)
                        + QUERY_SUFFIX;
                try {
                    getBaiduUrls(url, PAGE_ENCODING, myFile);
                } catch (ParserException e) {
                    // Best effort: report the failing word and keep going.
                    e.printStackTrace();
                }
            }
        } finally {
            // Release both files on every path, including failures.
            if (myFile != null) {
                myFile.close();   // also closes the wrapped FileWriter
            }
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Fetches one result page and writes the links found on it to
     * {@code writer}, one per line, echoing title and link on standard output.
     *
     * <p>Candidate nodes are the tags that carry both a {@code target} and an
     * {@code onclick} attribute (according to the original author, this is how
     * Baidu marks result links). Only link tags whose URL starts with
     * {@code "bnu"} or {@code "http"} are reported.
     *
     * @param url          address of the result page to fetch
     * @param pageEncoding character encoding used to parse the page
     * @param writer       destination for the accepted links
     * @throws ParserException declared for backward compatibility; parser
     *         failures are caught and reported inside this method
     */
    public static void getBaiduUrls(String url, String pageEncoding, PrintWriter writer) throws ParserException
    {
        NodeList nodeList = null;
        try {
            Parser parser = new Parser(url);
            parser.setEncoding(pageEncoding);   // set the encoding used for parsing
            // URL links and titles of the Baidu search results
            nodeList = parser.parse(new AndFilter(new HasAttributeFilter("target"),
                                                  new HasAttributeFilter("onclick")));
        } catch (ParserException e) {
            e.printStackTrace();
        }
        if (nodeList == null) {
            return;
        }

        // Visit every matched node
        for (int i = 0; i < nodeList.size(); i++) {
            Object node = nodeList.elementAt(i);
            if (!(node instanceof LinkTag)) {
                // The attribute filter can match non-link tags; casting those
                // would throw ClassCastException.
                continue;
            }
            LinkTag link = (LinkTag) node;
            String urlLink = link.extractLink();
            String linkName = link.getLinkText();

            // All three statements belong to the filter; the original lacked
            // braces, so every link was printed and written regardless.
            if (urlLink != null && (urlLink.startsWith("bnu") || urlLink.startsWith("http"))) {
                System.out.println("结果 " + i + " 标题:" + linkName);
                System.out.println("       链接:" + urlLink);
                writer.println(urlLink);
            }
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -