📄 threadtoupdate.java

📁 本系统实现了从五个网站上搜索的图书进行整合后
💻 JAVA
字号:
package com.booksearch.quartz;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;

import com.booksearch.dao.BookSaveDao;
import com.booksearch.dao.DailySaveDao;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
import com.booksearch.service.htmlparser.BookschinaParser;
import com.booksearch.service.htmlparser.China_pubparser;
import com.booksearch.service.htmlparser.Dangdangparser;
import com.booksearch.service.htmlparser.HtmlParser;
import com.booksearch.service.htmlparser.Tsinghuaparser;
import com.booksearch.service.htmlparser.WeilanParser;
import com.booksearch.service.htmlparser.Zhuoyueparser;
/**
 * Crawls book information for one search keyword and stores it in the database.
 *
 * <p>For each site tried, the thread walks the search-result pages starting at the
 * URL registered for that site in {@code urlMap}. Every book found that is not yet
 * in the database is looked up by ISBN on the other sites to collect their price,
 * discount and detail URL, and is then saved.
 *
 * <p>Extends {@link Thread}; implements no interfaces.
 *
 * @author  feng guang
 * @since   12/16/08
 */
public class ThreadToUpdate extends Thread {

	/** Marker used both in {@code urlMap} and as a next-page URL to mean "no URL". */
	private static final String NO_URL = "no";

	private static final Logger logger = Logger.getLogger(ThreadToUpdate.class);

	/** Site name mapped to the first search-result URL for the keyword. */
	private final Map<String,String> urlMap;

	private final String keyword;

	private final DailySaveDao dailySaveDao;

	private final BookSaveDao bookSaveDao;

	/**
	 * Books parsed from the most recent result page. Kept as a field so that the
	 * previous page's list is cleared before the next page is requested.
	 */
	private ArrayList<Book> tempList;

	/**
	 * True until the first page of some site reports a positive record count.
	 * While true, {@link #run()} keeps trying further sites.
	 */
	private boolean flag = true;

	/* Parsers used to extract content from each site; created in run(). */
	private HtmlParser dangDangParser = null;

	private HtmlParser china_pubParser = null;

	private HtmlParser zhuoYueParser = null;

	private HtmlParser tsingHuaParser = null;

	private HtmlParser bookschinaParser = null;

	private HtmlParser weiLanParser = null;

	/**
	 * @param urlMap        site name mapped to its first search-result URL;
	 *                      the value "no" means the site has no URL
	 * @param keyword       search keyword this thread is responsible for
	 * @param dailySaveDao  DAO used to update the daily log for the keyword
	 * @param bookSaveDao   DAO used to check for and save books
	 */
	public ThreadToUpdate(HashMap<String,String> urlMap,String keyword,
			              DailySaveDao dailySaveDao,BookSaveDao bookSaveDao){

		this.urlMap = urlMap;
		this.keyword = keyword;
		this.bookSaveDao = bookSaveDao;
		this.dailySaveDao = dailySaveDao;
	}

	/**
	 * Creates the site parsers and crawls the sites one after another until one of
	 * them yields records or every enabled site has been tried.
	 *
	 * <p>NOTE: the dangdang, chinabook and china_pub crawls that originally came
	 * first are disabled in this version; only weilan and zhuoyue are tried.
	 */
	@Override
	public void run(){
		/* Create the parser for each site. */
		this.dangDangParser = new Dangdangparser();
		this.bookschinaParser = new BookschinaParser();
		this.china_pubParser = new China_pubparser();
		this.tsingHuaParser = new Tsinghuaparser();
		this.weiLanParser = new WeilanParser();
		this.zhuoYueParser = new Zhuoyueparser();

		if (this.flag) {
			this.serviceToUpdate("weilan", this.weiLanParser);
		}
		if (this.flag) {
			this.serviceToUpdate("zhuoyue", this.zhuoYueParser);
		}
	}

	/**
	 * Crawls every result page of one site, starting at the URL registered for
	 * {@code webName}, and saves each new book together with the prices found for
	 * it on the other sites.
	 *
	 * <p>The crawl of this site ends when there is no usable URL, when a page
	 * cannot be turned into a document, or when the first page reports no records.
	 *
	 * @param webName     site name used as the key into {@code urlMap}
	 * @param htmlParser  parser for that site
	 */
	private void serviceToUpdate(String webName,HtmlParser htmlParser){

		/* Local on purpose: a site without a usable URL must not reuse the
		 * URL left over from a previously crawled site. */
		String nextUrl = this.findStartUrl(webName);

		while (true) {
			/* Empty the list left over from the previous page. */
			if (null != this.tempList) {
				this.tempList.clear();
			}
			if (null == nextUrl || NO_URL.equals(nextUrl)) {
				break;
			}
			System.out.println(keyword + ">>" + nextUrl);

			Document doc = this.fetchDocument(htmlParser, nextUrl);
			if (null == doc) {
				/* Without a document nextUrl cannot advance; stopping here avoids
				 * requesting the same URL forever. */
				break;
			}

			this.tempList = htmlParser.mainService(doc, true);
			int found = (null == this.tempList) ? 0 : this.tempList.size();
			System.out.println(keyword + ">>" + found);

			/* On the first page, update the daily log entry for the keyword. */
			if (this.flag) {
				long num = htmlParser.getRecordNum(doc);
				if (num <= 0) {
					break;
				}
				this.dailySaveDao.updateDaily(keyword, num, false);
				this.flag = false;
				System.out.println("num:" + num);
			}

			/* Read the next page URL before handling the books of this page. */
			nextUrl = htmlParser.getNextPageUrl(doc);

			if (null != this.tempList) {
				for (Book book : this.tempList) {
					this.saveBookIfNew(webName, book);
				}
			}
		}
	}

	/**
	 * Returns the start URL registered for {@code webName}, or {@code null} when
	 * the map has no entry for it or the entry is the "no" marker.
	 */
	private String findStartUrl(String webName){
		String url = this.urlMap.get(webName);
		if (null == url || NO_URL.equals(url)) {
			return null;
		}
		return url;
	}

	/**
	 * Runs the parser's {@code nekohtmlParser} on {@code url}.
	 *
	 * @return the resulting document, or {@code null} when the parser returned
	 *         {@code null} or threw; the exception is logged with its stack trace
	 */
	private Document fetchDocument(HtmlParser parser,String url){
		try {
			return parser.nekohtmlParser(url);
		} catch (Exception e) {
			logger.error("nekohtmlParser failed for " + url, e);
			return null;
		}
	}

	/**
	 * Saves {@code book} unless its ISBN is already in the database. When the ISBN
	 * is non-empty, prices from the other sites are collected first.
	 */
	private void saveBookIfNew(String webName,Book book){
		String isbn = book.getBookISBN();
		if (this.bookSaveDao.isExist(isbn)) {
			return;
		}
		if (null != isbn && !"".equals(isbn)) {
			this.collectOtherPrices(webName, isbn, book);
		}
		System.out.println(this.getName() + "??" + keyword + " : " + book.getBookAuthor()
				+ ">>" + book.getBookFixPrice() + ">>" + book.getBookImage()
				+ ">>" + book.getBookISBN() + ">>" + book.getBookName() + ">>" + book.getBookPublisher()
				+ ">>" + book.getBookPublishTime());
		this.bookSaveDao.addBook(book);
	}

	/**
	 * Looks the ISBN up on every site other than {@code webName} and copies the
	 * price information found there into {@code book}.
	 */
	private void collectOtherPrices(String webName,String isbn,Book book){
		if (!"china_pub".equals(webName)) {
			this.getOtherPrice(this.china_pubParser,"china_pub", "http://www.china-pub.com/s/?type=&ref=&tid=0&key1=" + isbn, book);
		}
		if (!"chinabook".equals(webName)) {
			this.getOtherPrice(this.bookschinaParser,"chinabook","http://www.bookschina.com/book_find/goodsfind.aspx?book=" + isbn + "&Str_Search=isbn", book);
		}
		if (!"weilan".equals(webName)) {
			this.getOtherPrice(this.weiLanParser,"weilan", "http://search.wl.cn/search.aspx?q=" + isbn + "&producttype=1&index=5", book);
		}
		if (!"tsinghua".equals(webName)) {
			this.getOtherPrice(this.tsingHuaParser,"tsinghua", "http://www.tup.com.cn/book/search.asp?keyword=" + isbn + "&type=2", book);
		}
		if (!"zhuoyue".equals(webName)) {
			this.getOtherPrice(this.zhuoYueParser,"zhuoyue", "http://www.amazon.cn/mn/advancedSearchApp?type=book&isbn=" + isbn, book);
		}
	}

	/**
	 * Fetches {@code reqUrl} with the given parser and copies that site's price,
	 * discount and detail URL into {@code book}'s price record. Does nothing when
	 * the page yields no document or no price, or when {@code webName} is not one
	 * of the five sites handled below.
	 *
	 * @param htmlperser  parser for the site being queried
	 * @param webName     site name; selects which price fields are copied
	 * @param reqUrl      request URL for the book on that site
	 * @param book        book whose price record is updated in place
	 */
	private void getOtherPrice(HtmlParser htmlperser,String webName,String reqUrl,Book book){

		/* Locals on purpose: this method runs while serviceToUpdate is in the
		 * middle of a page and must not touch that method's state. */
		Document doc = this.fetchDocument(htmlperser, reqUrl);
		if (null == doc) {
			return;
		}
		Price price = htmlperser.getDetailInfo(doc);
		if (null == price) {
			return;
		}

		if ("china_pub".equals(webName)) {
			Price target = book.getPrice();
			target.setChina_pubDiscount(price.getChina_pubDiscount());
			target.setChina_pubPrice(price.getChina_pubPrice());
			target.setChina_pubUrl(price.getChina_pubUrl());
		} else if ("chinabook".equals(webName)) {
			Price target = book.getPrice();
			target.setBookschinaDiscount(price.getBookschinaDiscount());
			target.setBookschinaPrice(price.getBookschinaPrice());
			target.setBookschinaUrl(price.getBookschinaUrl());
		} else if ("tsinghua".equals(webName)) {
			Price target = book.getPrice();
			target.setTsinghuaDiscount(price.getTsinghuaDiscount());
			target.setTsinghuaPrice(price.getTsinghuaPrice());
			target.setTsinghuaUrl(price.getTsinghuaUrl());
		} else if ("zhuoyue".equals(webName)) {
			Price target = book.getPrice();
			target.setZhuoyueDiscount(price.getZhuoyueDiscount());
			target.setZhuoyuePrice(price.getZhuoyuePrice());
			target.setZhuoyueUrl(price.getZhuoyueUrl());
		} else if ("weilan".equals(webName)) {
			Price target = book.getPrice();
			target.setWeilanDiscount(price.getWeilanDiscount());
			target.setWeilanPrice(price.getWeilanPrice());
			target.setWeilanUrl(price.getWeilanUrl());
		}
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -