// File: threadtoupdate.java
// (Code-viewer page residue, not part of the program: "Font size:")
package com.booksearch.quartz;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import com.booksearch.dao.BookSaveDao;
import com.booksearch.dao.DailySaveDao;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
import com.booksearch.service.htmlparser.BookschinaParser;
import com.booksearch.service.htmlparser.China_pubparser;
import com.booksearch.service.htmlparser.Dangdangparser;
import com.booksearch.service.htmlparser.HtmlParser;
import com.booksearch.service.htmlparser.Tsinghuaparser;
import com.booksearch.service.htmlparser.WeilanParser;
import com.booksearch.service.htmlparser.Zhuoyueparser;
/**
 * Crawls book listings for one search keyword and stores every book that is not
 * yet in the database.
 *
 * <p>The start URL for each site is taken from the URL map passed to the
 * constructor (site name -&gt; request URL, where the value {@code "no"} means
 * "nothing to crawl"). For every book found on the site being crawled, the same
 * ISBN is looked up on the other supported sites so that their price, discount
 * and detail URL are copied onto the book before it is saved.
 *
 * @author feng guang
 * @since 12/16/08
 */
public class ThreadToUpdate extends Thread {

    private static final Logger logger = Logger.getLogger(ThreadToUpdate.class);

    /** Marker used in the URL map and as a next-page URL to mean "no URL". */
    private static final String NO_URL = "no";

    /** Site name -> start URL for the keyword being crawled. */
    private final Map<String, String> urlMap;
    private final String keyword;
    private final DailySaveDao dailySaveDao;
    private final BookSaveDao bookSaveDao;

    /**
     * {@code true} until some site has reported a positive record count for the
     * keyword. While it is {@code true}, {@link #run()} keeps trying further sites
     * and the crawl loop treats the current page as the first page of results.
     */
    private boolean awaitingFirstResult = true;

    /* Site-specific parsers; created in run(). */
    private HtmlParser dangDangParser = null;
    private HtmlParser china_pubParser = null;
    private HtmlParser zhuoYueParser = null;
    private HtmlParser tsingHuaParser = null;
    private HtmlParser bookschinaParser = null;
    private HtmlParser weiLanParser = null;

    /**
     * @param urlMap       site name -&gt; start URL ({@code "no"} when the site has none)
     * @param keyword      search keyword this thread is responsible for
     * @param dailySaveDao DAO used to update the daily log entry of the keyword
     * @param bookSaveDao  DAO used to check for and store books
     */
    public ThreadToUpdate(HashMap<String,String> urlMap,String keyword,
            DailySaveDao dailySaveDao,BookSaveDao bookSaveDao){
        this.urlMap = urlMap;
        this.keyword = keyword;
        this.bookSaveDao = bookSaveDao;
        this.dailySaveDao = dailySaveDao;
    }

    /**
     * Creates the site parsers and crawls the primary sites one after another
     * until one of them reports records for the keyword.
     */
    @Override
    public void run() {
        this.dangDangParser = new Dangdangparser();
        this.bookschinaParser = new BookschinaParser();
        this.china_pubParser = new China_pubparser();
        this.tsingHuaParser = new Tsinghuaparser();
        this.weiLanParser = new WeilanParser();
        this.zhuoYueParser = new Zhuoyueparser();

        // Only "weilan" and "zhuoyue" are crawled as primary sources here;
        // "dangdang", "chinabook" and "china_pub" are currently not used as
        // primary sources (the latter two are still queried per ISBN).
        if (this.awaitingFirstResult) {
            this.serviceToUpdate("weilan", this.weiLanParser);
        }
        if (this.awaitingFirstResult) {
            this.serviceToUpdate("zhuoyue", this.zhuoYueParser);
        }
    }

    /**
     * Crawls all result pages of one site, starting at the URL registered for
     * {@code webName}, and saves every book that is not yet stored.
     *
     * <p>The crawl stops when there is no usable URL (missing, empty or
     * {@code "no"}), when a page cannot be fetched or parsed, or when the first
     * page reports no records.
     *
     * @param webName    site name, used as key into the URL map
     * @param htmlParser parser that understands this site's pages
     */
    private void serviceToUpdate(String webName, HtmlParser htmlParser) {
        // Start from this site's own entry only; never reuse a URL left over
        // from a previously crawled site.
        String pageUrl = this.urlMap.get(webName);

        while (isCrawlable(pageUrl)) {
            logger.info(keyword + ">>" + pageUrl);

            Document page;
            try {
                page = htmlParser.nekohtmlParser(pageUrl);
            } catch (Exception e) {
                logger.error("Failed to fetch " + pageUrl + " for keyword " + keyword, e);
                return;
            }
            if (page == null) {
                // Without this exit the loop would request the same URL forever.
                logger.error("No document returned for " + pageUrl + " (keyword " + keyword + ")");
                return;
            }

            ArrayList<Book> books = htmlParser.mainService(page, true);
            logger.info(keyword + ">>" + (books == null ? 0 : books.size()));

            // First page with results: record the total hit count in the daily log.
            if (this.awaitingFirstResult) {
                long num = htmlParser.getRecordNum(page);
                if (num <= 0) {
                    return;
                }
                this.dailySaveDao.updateDaily(keyword, num, false);
                this.awaitingFirstResult = false;
                logger.info("num:" + num);
            }

            // Determine the following page before processing the current one.
            pageUrl = htmlParser.getNextPageUrl(page);

            if (books != null) {
                for (Book book : books) {
                    saveIfNew(book, webName);
                }
            }
        }
    }

    /** Returns whether {@code url} denotes a page that can be requested. */
    private static boolean isCrawlable(String url) {
        return url != null && url.length() > 0 && !NO_URL.equals(url);
    }

    /**
     * Stores {@code book} unless the database already has its ISBN. When the ISBN
     * is non-empty, prices from the other sites are collected first.
     */
    private void saveIfNew(Book book, String webName) {
        String isbn = book.getBookISBN();
        if (this.bookSaveDao.isExist(isbn)) {
            return;
        }
        if (isbn != null && !"".equals(isbn)) {
            collectOtherSitePrices(book, isbn, webName);
        }
        logger.info(this.getName() + "??" + keyword + " : " + book.getBookAuthor()
                + ">>" + book.getBookFixPrice() + ">>" + book.getBookImage()
                + ">>" + book.getBookISBN() + ">>" + book.getBookName() + ">>" + book.getBookPublisher()
                + ">>" + book.getBookPublishTime());
        this.bookSaveDao.addBook(book);
    }

    /** Looks up {@code isbn} on every supported site except {@code webName} itself. */
    private void collectOtherSitePrices(Book book, String isbn, String webName) {
        if (!"china_pub".equals(webName)) {
            this.getOtherPrice(this.china_pubParser, "china_pub",
                    "http://www.china-pub.com/s/?type=&ref=&tid=0&key1=" + isbn, book);
        }
        if (!"chinabook".equals(webName)) {
            this.getOtherPrice(this.bookschinaParser, "chinabook",
                    "http://www.bookschina.com/book_find/goodsfind.aspx?book=" + isbn + "&Str_Search=isbn", book);
        }
        if (!"weilan".equals(webName)) {
            this.getOtherPrice(this.weiLanParser, "weilan",
                    "http://search.wl.cn/search.aspx?q=" + isbn + "&producttype=1&index=5", book);
        }
        if (!"tsinghua".equals(webName)) {
            this.getOtherPrice(this.tsingHuaParser, "tsinghua",
                    "http://www.tup.com.cn/book/search.asp?keyword=" + isbn + "&type=2", book);
        }
        if (!"zhuoyue".equals(webName)) {
            this.getOtherPrice(this.zhuoYueParser, "zhuoyue",
                    "http://www.amazon.cn/mn/advancedSearchApp?type=book&isbn=" + isbn, book);
        }
    }

    /**
     * Fetches {@code reqUrl} with the given parser and copies the price, discount
     * and detail URL of site {@code webName} onto {@code book}.
     *
     * <p>This is best effort: a fetch failure, a missing document or missing price
     * details leave {@code book} unchanged. An unrecognised {@code webName} is
     * ignored.
     *
     * @param htmlperser parser for the site being queried
     * @param webName    site name selecting which price fields are copied
     * @param reqUrl     request URL of the ISBN search on that site
     * @param book       book whose price record is updated
     */
    private void getOtherPrice(HtmlParser htmlperser,String webName,String reqUrl,Book book){
        Document page;
        try {
            page = htmlperser.nekohtmlParser(reqUrl);
        } catch (Exception e) {
            logger.error("Failed to fetch " + reqUrl + " (" + webName + ")", e);
            return;
        }
        if (page == null) {
            return;
        }

        Price found = htmlperser.getDetailInfo(page);
        if (found == null) {
            return;
        }

        Price target = book.getPrice();
        if (target == null) {
            // Nothing to copy into; skip instead of failing the whole crawl.
            logger.error("Book has no price record; skipping " + webName + " price for " + reqUrl);
            return;
        }

        if ("china_pub".equals(webName)) {
            target.setChina_pubDiscount(found.getChina_pubDiscount());
            target.setChina_pubPrice(found.getChina_pubPrice());
            target.setChina_pubUrl(found.getChina_pubUrl());
        } else if ("chinabook".equals(webName)) {
            target.setBookschinaDiscount(found.getBookschinaDiscount());
            target.setBookschinaPrice(found.getBookschinaPrice());
            target.setBookschinaUrl(found.getBookschinaUrl());
        } else if ("tsinghua".equals(webName)) {
            target.setTsinghuaDiscount(found.getTsinghuaDiscount());
            target.setTsinghuaPrice(found.getTsinghuaPrice());
            target.setTsinghuaUrl(found.getTsinghuaUrl());
        } else if ("zhuoyue".equals(webName)) {
            target.setZhuoyueDiscount(found.getZhuoyueDiscount());
            target.setZhuoyuePrice(found.getZhuoyuePrice());
            target.setZhuoyueUrl(found.getZhuoyueUrl());
        } else if ("weilan".equals(webName)) {
            target.setWeilanDiscount(found.getWeilanDiscount());
            target.setWeilanPrice(found.getWeilanPrice());
            target.setWeilanUrl(found.getWeilanUrl());
        }
    }
}
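/*
 * Code-viewer page residue (not part of the program) - keyboard shortcuts:
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */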