// weilanparser.java
package com.booksearch.service.htmlparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class: WeilanParser
* Description: extracts the content matching a search keyword from www.wl.cn
* extends: none
* implements: HtmlParser<Element>
* @author li chao
* @since 11/09/08
*/
public class WeilanParser implements HtmlParser<Element> {
private ArrayList<Book> list;
private String url = "";
private static final Logger logger;
static
{
logger = Logger.getLogger(com.booksearch.service.htmlparser.WeilanParser.class);
}
/**
 * Downloads the page at the given URL and parses it into a DOM document with NekoHTML.
 *
 * <p>The URL is remembered in this instance; {@link #getNextPageUrl(Document)} reads it
 * back when it builds the link to the following result page.
 *
 * <p>The call blocks for 5 seconds before connecting (presumably to throttle requests
 * to the remote site - confirm) and uses a 30 second read timeout. No connect timeout
 * is configured. The response is decoded as UTF-8.
 *
 * @param url address of the page to fetch
 * @return the parsed DOM document
 * @throws Exception if the URL is malformed, the connection or read fails or times out,
 *         the HTML cannot be parsed, or the thread is interrupted while waiting
 */
public Document nekohtmlParser(String url) throws Exception {
    this.url = url;

    DOMParser parser = new DOMParser();
    // Encoding NekoHTML falls back to when the document does not declare one.
    parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "UTF-8");

    URL u = new URL(this.url);
    // sleep is static; calling it on Thread directly makes clear it pauses this thread.
    Thread.sleep(5000);

    URLConnection urlConnection = u.openConnection();
    urlConnection.setReadTimeout(30000);

    // Wrap the response bytes in a UTF-8 character stream for the parser.
    BufferedReader inputStream = new BufferedReader(new InputStreamReader(
            urlConnection.getInputStream(), "UTF-8"));
    try {
        parser.parse(new InputSource(inputStream));
    } finally {
        // Always release the stream; previously it was never closed.
        try {
            inputStream.close();
        } catch (IOException closeFailure) {
            // A failed close must not mask a parse error or discard a parsed document.
            logger.warn("Failed to close response stream for " + this.url, closeFailure);
        }
    }
    return parser.getDocument();
}
/**
 * Extracts the books listed on a parsed search-result page.
 *
 * <p>Every {@code div} whose id is {@code __search_centent} is treated as one book entry.
 * Its direct child elements are dispatched on their {@code class} / {@code style}
 * attributes and tag name to the field extractors of this class (getBookUrl,
 * getBookImage, getBookName, getBookAuthor, getBookPublisher, getBookPublishTime,
 * getBookContent, getBookFixPrice, getBookPrice, getBookDiscount). An entry is added to
 * the result only when a book name was found.
 *
 * <p>When {@code flag} is true, each entry additionally triggers a second request through
 * {@code WeiLanParserSec} (after a 5 second pause) to fetch the ISBN. The first failure of
 * that request is logged and disables the second request for the rest of this call.
 *
 * @param doc  parsed search-result page
 * @param flag whether to fetch the ISBN of each book with a second request
 * @return the books found on the page; empty when none matched
 * @throws NumberFormatException if a price or discount extractor returns non-numeric text
 */
public ArrayList<Book> mainService(Document doc, boolean flag) {
    list = new ArrayList<Book>();
    NodeList divNodeList = doc.getElementsByTagName("div");
    for (int i = 0; i < divNodeList.getLength()
            && Node.ELEMENT_NODE == divNodeList.item(i).getNodeType(); i++) {
        Element divElement = (Element) divNodeList.item(i);
        if (!"__search_centent".equals(divElement.getAttribute("id"))) {
            continue;
        }
        Book book = new Book();
        Price price = new Price();
        NodeList contentNodeList = divElement.getChildNodes();
        for (int j = 0; j < contentNodeList.getLength(); j++) {
            if (contentNodeList.item(j).getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element contentElement = (Element) contentNodeList.item(j);

            // Cover block: detail-page link, optional ISBN lookup, cover image.
            if ("pic".equals(contentElement.getAttribute("class"))) {
                String weilanUrl = this.getBookUrl(contentElement);
                price.setWeilanUrl(weilanUrl);
                if (flag) {
                    try {
                        Thread.sleep(5000);
                        WeiLanParserSec tem = new WeiLanParserSec();
                        book.setBookISBN(tem.getBookISBNSec(weilanUrl));
                    } catch (Exception e) {
                        if (e instanceof InterruptedException) {
                            // Keep the interrupt visible to the caller instead of swallowing it.
                            Thread.currentThread().interrupt();
                        }
                        // Stop issuing second requests for the remaining entries.
                        flag = false;
                        // Pass the exception as well so the stack trace reaches the log.
                        logger.error("==========蔚蓝网二次请求解析" + weilanUrl + "时出错" + "==========" + e, e);
                    }
                }
                book.setBookImage(this.getBookImage(contentElement));
            }

            if ("ProductName".equals(contentElement.getAttribute("class"))) {
                book.setBookName(this.getBookName(contentElement));
            }

            // Author / publisher / publish-time line.
            if ("display: ".equals(contentElement.getAttribute("style"))) {
                book.setBookAuthor(this.getBookAuthor(contentElement));
                book.setBookPublisher(this.getBookPublisher(contentElement));
                String bookPublishTime = this.getBookPublishTime(contentElement);
                if (null != bookPublishTime && !"".equals(bookPublishTime)) {
                    book.setBookPublishTime(bookPublishTime);
                }
            }

            // Synopsis paragraph.
            if ("margin-top: 10px; margin-bottom: 10px;".equals(contentElement
                    .getAttribute("style"))) {
                book.setBookProspectus(this.getBookContent(contentElement));
            }

            // Price block: list price, shop price and discount.
            if ("DIV".equals(contentElement.getTagName())) {
                String bookFixPrice = this.getBookFixPrice(contentElement);
                if (null != bookFixPrice && !"".equals(bookFixPrice)) {
                    book.setBookFixPrice(Double.valueOf(bookFixPrice));
                }
                String weilanPrice = this.getBookPrice(contentElement);
                if (null != weilanPrice && !"".equals(weilanPrice)) {
                    price.setWeilanPrice(Double.valueOf(weilanPrice));
                }
                String weilanDiscount = this.getBookDiscount(contentElement);
                if (null != weilanDiscount && !"".equals(weilanDiscount)) {
                    price.setWeilanDiscount(Float.valueOf(weilanDiscount));
                }
            }
        }
        if (book.getBookName() != null) {
            book.setPrice(price);
            list.add(book);
        }
    }
    return list;
}
/**
 * Reads the price details of a single book from a parsed page.
 *
 * <p>Only the first {@code div} with id {@code __search_centent} is inspected. From its
 * direct child elements, the one with class {@code pic} supplies the detail URL (via
 * getBookUrl) and every {@code DIV} child supplies price and discount (via getBookPrice
 * and getBookDiscount). Values that come back null or empty are left unset.
 *
 * <p>Side effect: the instance result list is replaced with a new empty list.
 *
 * @param doc parsed page
 * @return a Price carrying whatever URL, price and discount were found
 */
public Price getDetailInfo(Document doc) {
    Price result = new Price();
    list = new ArrayList<Book>();
    NodeList allDivs = doc.getElementsByTagName("div");
    for (int idx = 0; idx < allDivs.getLength(); idx++) {
        Node candidate = allDivs.item(idx);
        if (Node.ELEMENT_NODE != candidate.getNodeType()) {
            // Scanning stops at the first node that is not an element.
            break;
        }
        if (!"__search_centent".equals(((Element) candidate).getAttribute("id"))) {
            continue;
        }
        NodeList children = candidate.getChildNodes();
        for (int c = 0; c < children.getLength(); c++) {
            Node child = children.item(c);
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element part = (Element) child;
            if ("pic".equals(part.getAttribute("class"))) {
                String detailUrl = this.getBookUrl(part);
                result.setWeilanUrl(detailUrl);
            }
            if ("DIV".equals(part.getTagName())) {
                String priceText = this.getBookPrice(part);
                if (priceText != null && !"".equals(priceText)) {
                    result.setWeilanPrice(Double.valueOf(priceText));
                }
                String discountText = this.getBookDiscount(part);
                if (discountText != null && !"".equals(discountText)) {
                    result.setWeilanDiscount(Float.valueOf(discountText));
                }
            }
        }
        // Only the first matching entry is used.
        break;
    }
    return result;
}
/**
 * Builds the URL of the next result page from the pager of a parsed page.
 *
 * <p>The pager is looked up as {@code div#__search_topfanye > TABLE > TR > TD.pre_page};
 * the text of that cell is expected to read {@code current/total}. Only the first
 * {@code TABLE} of the div and the first {@code pre_page} cell of each row are examined.
 * The new URL is derived from the URL last passed to {@code nekohtmlParser}: an existing
 * {@code &start=} parameter (and everything after it) is replaced, otherwise one is
 * appended.
 *
 * @param doc parsed search-result page
 * @return the next page URL, or the literal {@code "no"} when there is no further page
 *         or the pager cannot be read
 */
public String getNextPageUrl(Document doc) {
    String nextPageUrl = "no";
    NodeList divNodeList = doc.getElementsByTagName("div");
    for (int i = 0; i < divNodeList.getLength(); i++) {
        Node divNode = divNodeList.item(i);
        if (divNode.getNodeType() != Node.ELEMENT_NODE
                || !"__search_topfanye".equals(((Element) divNode).getAttribute("id"))) {
            continue;
        }
        NodeList tableNodeList = divNode.getChildNodes();
        for (int j = 0; j < tableNodeList.getLength(); j++) {
            Node tableNode = tableNodeList.item(j);
            if (Node.ELEMENT_NODE != tableNode.getNodeType()
                    || !"TABLE".equals(tableNode.getNodeName())) {
                continue;
            }
            NodeList trNodeList = tableNode.getChildNodes();
            for (int k = 0; k < trNodeList.getLength(); k++) {
                Node trNode = trNodeList.item(k);
                if (Node.ELEMENT_NODE != trNode.getNodeType()
                        || !"TR".equals(trNode.getNodeName())) {
                    continue;
                }
                NodeList tdNodeList = trNode.getChildNodes();
                for (int m = 0; m < tdNodeList.getLength(); m++) {
                    Node tdNode = tdNodeList.item(m);
                    if (Node.ELEMENT_NODE != tdNode.getNodeType()
                            || !"TD".equals(tdNode.getNodeName())) {
                        continue;
                    }
                    Element tdElement = (Element) tdNode;
                    if (!"pre_page".equals(tdElement.getAttribute("class"))) {
                        continue;
                    }
                    if (tdElement.hasChildNodes()
                            && null != tdElement.getFirstChild().getNodeValue()) {
                        String candidate = buildNextPageUrl(
                                tdElement.getFirstChild().getNodeValue().trim());
                        if (candidate != null) {
                            nextPageUrl = candidate;
                        }
                    }
                    // Only the first pager cell of a row is considered.
                    break;
                }
            }
            // Only the first table inside the pager div is considered.
            break;
        }
    }
    return nextPageUrl;
}

/**
 * Turns a "current/total" pager text into the next page URL.
 *
 * @param fanye trimmed pager text
 * @return the next page URL, or null when the text is malformed or the last page is shown
 */
private String buildNextPageUrl(String fanye) {
    int slash = fanye.indexOf("/");
    if (slash < 0) {
        // No separator: previously this ended in a StringIndexOutOfBoundsException.
        return null;
    }
    int currentPage;
    int totalPages;
    try {
        currentPage = Integer.parseInt(fanye.substring(0, slash).trim());
        totalPages = Integer.parseInt(fanye.substring(slash + 1).trim());
    } catch (NumberFormatException e) {
        logger.warn("Unreadable page indicator '" + fanye + "' on " + this.url, e);
        return null;
    }
    if (currentPage >= totalPages) {
        return null;
    }
    // Offset of the first record of the next page; presumably 20 results per page.
    int nextStart = currentPage * 20 + 1;
    // Test for the exact token that is cut at, so "?start=" or "restart=" cannot
    // yield a -1 index for substring().
    int startParam = this.url.indexOf("&start=");
    String base = (startParam != -1) ? this.url.substring(0, startParam) : this.url;
    return base + "&start=" + nextStart;
}
/**
 * Reads the total number of search results from a parsed page.
 *
 * <p>The count is looked up as the text of the first child element of a {@code B} node
 * inside the element with id {@code ctl00_ContentPlaceHolder1_TotalCount}, which must be
 * a direct child of the first {@code div} with id {@code summarycontent}.
 *
 * <p>Side effect: the instance result list is replaced with a new empty list.
 *
 * @param doc parsed search-result page
 * @return the record count, or 0 when the counter is not present
 * @throws NumberFormatException if the counter text is not a decimal integer
 */
public long getRecordNum(Document doc) {
    long num = 0;
    list = new ArrayList<Book>();
    NodeList divNodeList = doc.getElementsByTagName("div");
    for (int i = 0; i < divNodeList.getLength(); i++) {
        Node divNode = divNodeList.item(i);
        if (!"summarycontent".equals(((Element) divNode).getAttribute("id"))) {
            continue;
        }
        NodeList spanNodeList = divNode.getChildNodes();
        for (int j = 0; j < spanNodeList.getLength(); j++) {
            if (spanNodeList.item(j).getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element spanElement = (Element) spanNodeList.item(j);
            if (!"ctl00_ContentPlaceHolder1_TotalCount".equals(spanElement.getAttribute("id"))) {
                continue;
            }
            NodeList bList = spanElement.getChildNodes();
            for (int k = 0; k < bList.getLength(); k++) {
                Node bNode = bList.item(k);
                if (!"B".equals(bNode.getNodeName()) || !bNode.hasChildNodes()) {
                    continue;
                }
                // The number is wrapped in one more element inside <B>.
                Node numNode = bNode.getFirstChild();
                if (numNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element numElement = (Element) numNode;
                if (numElement.hasChildNodes()
                        && null != numElement.getFirstChild().getNodeValue()) {
                    // Parse straight to long: matches the return type, avoids boxing
                    // through Integer and does not cap the value at the int range.
                    num = Long.parseLong(numElement.getFirstChild().getNodeValue().trim());
                    break;
                }
            }
            // Only the first counter element is used.
            break;
        }
        // Only the first summary div is used.
        break;
    }
    return num;
}
/**
 * Extracts the author names from the author/publisher line of a book entry.
 *
 * <p>The children of {@code bookElement} are scanned for a node whose value contains the
 * label "作者"; if the node right after it is an {@code A} element, its text is taken as
 * the author string. The separators {@code , , ; ; 、} (ASCII and full-width) are then
 * replaced by a space and the suffixes "等" and "著" are removed. The result is not
 * trimmed afterwards.
 *
 * @param bookElement element holding the author line; called by mainService
 * @return the normalized author string, or an empty string when no author was found
 */
public String getBookAuthor(Element bookElement) {
    String bookAuthor = "";
    NodeList aList = bookElement.getChildNodes();
    for (int i = 0; i < aList.getLength(); i++) {
        String label = aList.item(i).getNodeValue();
        if (label == null || label.indexOf("作者") == -1) {
            continue;
        }
        // The author link is expected immediately after the label node.
        i++;
        if (i < aList.getLength()) {
            Node authorNode = aList.item(i);
            if ("A".equals(authorNode.getNodeName())) {
                if (authorNode.hasChildNodes()
                        && null != authorNode.getFirstChild().getNodeValue()) {
                    bookAuthor = authorNode.getFirstChild().getNodeValue().trim();
                }
                break;
            }
        }
    }
    // Normalize separators. The ASCII comma used to be replaced twice (the second call
    // was dead code); the full-width forms common in Chinese text are covered instead.
    bookAuthor = bookAuthor.replace(",", " ");
    bookAuthor = bookAuthor.replace(",", " ");
    bookAuthor = bookAuthor.replace(";", " ");
    bookAuthor = bookAuthor.replace(";", " ");
    bookAuthor = bookAuthor.replace("、", " ");
    // Drop the "et al." and "written by" suffixes.
    bookAuthor = bookAuthor.replace("等", "");
    bookAuthor = bookAuthor.replace("著", "");
    return bookAuthor;
}
/**Function:getBookDiscount
* Description:获得图书折扣
* Call:no
* Called by:mainService
* @param bookElement as Element
* @throws no
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -