📄 zhuoyueparser.java
字号:
package com.booksearch.service.htmlparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class: Zhuoyueparser
* Description: extracts the content matching a search keyword from http://www.amazon.cn/
* extends: none
* implements: HtmlParser<Element>
* @author wangchao
* @since 11/09/08
*/
public class Zhuoyueparser implements HtmlParser<Element> {
/* 存放本网站某一页的记录 */
private ArrayList<Book> list;
private String url;
/*记录日志*/
private static final Logger logger;
static
{
logger = Logger.getLogger(com.booksearch.service.htmlparser.Zhuoyueparser.class);
}
/**
 * Downloads the page at {@code url} and parses it into a DOM document with NekoHTML.
 * <p>
 * The call pauses for five seconds before opening the connection and decodes
 * the response as UTF-8. The response stream is always closed before returning.
 *
 * @param url address of the page to fetch and parse
 * @return the DOM document produced by the parser
 * @throws Exception if the URL is malformed, the pause is interrupted, the
 *         connection or read fails, or the parser rejects the input
 */
public Document nekohtmlParser(String url) throws Exception {
    this.url = url;
    logger.debug(this.url);
    /* Create the HTML parser and set the encoding it assumes by default. */
    DOMParser parser = new DOMParser();
    parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "utf-8");
    URL u = new URL(this.url);
    /* Fixed pause before every request — presumably to throttle traffic to the remote site. */
    Thread.sleep(5000);
    /* Open the connection to the source site. */
    URLConnection urlconn = u.openConnection();
    /* Wrap the response bytes in a UTF-8 character stream. */
    BufferedReader in = new BufferedReader(new InputStreamReader(urlconn.getInputStream(), "utf-8"));
    try {
        parser.parse(new InputSource(in));
    } finally {
        /* Release the connection's stream whether or not parsing succeeded. */
        in.close();
    }
    return parser.getDocument();
}
/**
 * Walks a parsed result page and builds one {@link Book} (with its {@link Price})
 * for each {@code <div id="product">} it finds.
 * <p>
 * Inside a product node the child with {@code id="product-pic"} supplies the cover
 * image; the node four child positions after it is then read for the title, author,
 * publisher and price data. The result list is also stored in the {@code list} field.
 *
 * @param doc  parsed result page
 * @param flag when {@code true}, a second request is made per book to fetch its ISBN
 *             and prospectus; after the first failed second request no further
 *             second requests are made during this call
 * @return the books found on the page, possibly empty
 * @throws NumberFormatException if a non-empty price string is not a parsable number
 */
public ArrayList<Book> mainService(Document doc, boolean flag) {
    list = new ArrayList<Book>();
    /* getElementsByTagName only yields element nodes, so each item can be cast directly. */
    NodeList servers = doc.getElementsByTagName("div");
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        /* Keep only <div id="product">, the node that holds one record. */
        if (!"product".equals(serveritem.getAttribute("id"))) {
            continue;
        }
        Book book = new Book();
        Price price = new Price();
        NodeList childList1 = serveritem.getChildNodes();
        for (int j = 0; j < childList1.getLength(); j++) {
            Node childNode1 = childList1.item(j);
            if (childNode1.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element childElement1 = (Element) childNode1;
            if ("product-pic".equals(childElement1.getAttribute("id"))) {
                book.setBookImage(this.getBookImage(childElement1));
                /* Jump to the node four positions further on — presumably the
                   product-content sibling in the page layout; verify against the site. */
                j += 4;
                if (j < childList1.getLength()
                        && Node.ELEMENT_NODE == childList1.item(j).getNodeType()) {
                    flag = this.readProductContent((Element) childList1.item(j), book, price, flag);
                }
            }
            /* Add to the result list once a name has been read.
               NOTE(review): this runs for every element child, so element siblings that
               follow the content node would append the same book again — confirm against
               the page layout. */
            if (book.getBookName() != null) {
                book.setPrice(price);
                list.add(book);
            }
        }
    }
    return list;
}

/* Copies title, publisher and price data from a product's content node onto book/price; returns the updated second-request flag. */
private boolean readProductContent(Element content, Book book, Price price, boolean fetchDetails) {
    NodeList childList2 = content.getChildNodes();
    for (int r = 0; r < childList2.getLength(); r++) {
        Node childNode2 = childList2.item(r);
        if (childNode2.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element childElement2 = (Element) childNode2;
        String cssClass = childElement2.getAttribute("class");
        if ("ProductTitle".equals(cssClass)) {
            String bookUrl = this.getBookUrl(childElement2);
            price.setZhuoyueUrl(bookUrl);
            book.setBookName(this.getBookName(childElement2));
            book.setBookAuthor(this.getBookAuthor(childElement2));
            if (fetchDetails) {
                fetchDetails = this.readSecondPassDetails(bookUrl, book);
            }
        } else if ("Company".equals(cssClass)) {
            /* Publisher and publication time. */
            book.setBookPublisher(this.getBookPublisher(childElement2));
            book.setBookPublishTime(this.getBookPublishTime(childElement2));
        } else if ("PriceArea".equals(cssClass)) {
            this.readPriceArea(childElement2, book, price);
        }
    }
    return fetchDetails;
}

/* Requests the book's own page for ISBN and prospectus; returns false when that request fails. */
private boolean readSecondPassDetails(String bookUrl, Book book) {
    try {
        Thread.sleep(5000);
        ZhuoyueparserSec sec = new ZhuoyueparserSec();
        Document tempdoc = sec.nekohtmlParserSec(bookUrl);
        book.setBookISBN(sec.getBookISBNSec(tempdoc));
        book.setBookProspectus(sec.getBookContentSec(tempdoc));
        return true;
    } catch (InterruptedException e) {
        /* Keep the interrupt visible to the caller instead of silently clearing it. */
        Thread.currentThread().interrupt();
        logger.error("==========卓越二次请求解析" + bookUrl + "时出错" + "==========" + e, e);
        return false;
    } catch (Exception e) {
        logger.error("==========卓越二次请求解析" + bookUrl + "时出错" + "==========" + e, e);
        return false;
    }
}

/* Reads list price and sale price from a PriceArea element and derives the discount ratio. */
private void readPriceArea(Element priceArea, Book book, Price price) {
    String bookFixPrice = this.getBookFixPrice(priceArea);
    boolean hasFixPrice = null != bookFixPrice && !"".equals(bookFixPrice);
    if (hasFixPrice) {
        book.setBookFixPrice(Double.valueOf(bookFixPrice.trim()));
    }
    String bookPrice = this.getBookPrice(priceArea);
    boolean hasPrice = null != bookPrice && !"".equals(bookPrice);
    if (hasPrice) {
        price.setZhuoyuePrice(Double.valueOf(bookPrice));
    }
    if (hasFixPrice && hasPrice) {
        double listPrice = Double.parseDouble(bookFixPrice);
        /* A zero list price would give Infinity/NaN, which cannot be formatted and re-parsed. */
        if (listPrice != 0) {
            double bookDiscount = Double.parseDouble(bookPrice) / listPrice;
            /* Fixed symbols so the formatted text always uses '.' and stays parsable by Float.valueOf. */
            DecimalFormat df = new DecimalFormat("####.00", new DecimalFormatSymbols(Locale.US));
            price.setZhuoyueDiscount(Float.valueOf(df.format(bookDiscount)));
        }
    }
}
/**
 * Collects the listing URL, sale price and discount for the first product on a parsed page.
 * <p>
 * Only the first {@code <div id="product">} is inspected, and within it only the
 * first child element with {@code id="product-content"}.
 *
 * @param doc parsed page to scan
 * @return a new {@link Price}; values that were not found on the page are never set
 * @throws NumberFormatException if a non-empty price string is not a parsable number
 */
public Price getDetailInfo(Document doc) {
    Price price = new Price();
    /* getElementsByTagName only yields element nodes, so each item can be cast directly. */
    NodeList servers = doc.getElementsByTagName("div");
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        /* Keep only <div id="product">, the node that holds the record. */
        if (!"product".equals(serveritem.getAttribute("id"))) {
            continue;
        }
        NodeList childList1 = serveritem.getChildNodes();
        for (int j = 0; j < childList1.getLength(); j++) {
            Node childNode1 = childList1.item(j);
            if (childNode1.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element childElement1 = (Element) childNode1;
            if (!"product-content".equals(childElement1.getAttribute("id"))) {
                continue;
            }
            NodeList childList2 = childElement1.getChildNodes();
            for (int r = 0; r < childList2.getLength(); r++) {
                Node childNode2 = childList2.item(r);
                if (childNode2.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element childElement2 = (Element) childNode2;
                String cssClass = childElement2.getAttribute("class");
                if ("ProductTitle".equals(cssClass)) {
                    /* Link to the book's own page. */
                    price.setZhuoyueUrl(this.getBookUrl(childElement2));
                } else if ("PriceArea".equals(cssClass)) {
                    String bookFixPrice = this.getBookFixPrice(childElement2);
                    String bookPrice = this.getBookPrice(childElement2);
                    boolean hasPrice = null != bookPrice && !"".equals(bookPrice);
                    boolean hasFixPrice = null != bookFixPrice && !"".equals(bookFixPrice);
                    if (hasPrice) {
                        price.setZhuoyuePrice(Double.valueOf(bookPrice));
                    }
                    if (hasPrice && hasFixPrice) {
                        double listPrice = Double.parseDouble(bookFixPrice);
                        /* A zero list price would give Infinity/NaN, which cannot be formatted and re-parsed. */
                        if (listPrice != 0) {
                            double bookDiscount = Double.parseDouble(bookPrice) / listPrice;
                            /* Fixed symbols so the formatted text always uses '.' and stays parsable by Float.valueOf. */
                            DecimalFormat df = new DecimalFormat("####.00", new DecimalFormatSymbols(Locale.US));
                            price.setZhuoyueDiscount(Float.valueOf(df.format(bookDiscount)));
                        }
                    }
                }
            }
            /* Only the first product-content node is read. */
            break;
        }
        /* Only the first product node is read. */
        break;
    }
    return price;
}
/**
 * Reads the author text from a title element.
 * <p>
 * The first child element whose {@code class} attribute is {@code "author"} is used;
 * its first child node's value is taken as the raw author text. Commas, percent signs
 * and the character "等" are removed, the text is cut to at most 64 characters and
 * then trimmed.
 *
 * @param bookElement element whose children are searched
 * @return the cleaned author text, or an empty string when no author element exists,
 *         it has no children, or its first child carries no text value
 */
public String getBookAuthor(Element bookElement) {
    String bookAuthor = "";
    NodeList childList = bookElement.getChildNodes();
    for (int i = 0; i < childList.getLength(); i++) {
        Node childNode = childList.item(i);
        if (childNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element childElement = (Element) childNode;
        if ("author".equals(childElement.getAttribute("class"))) {
            if (childElement.hasChildNodes()) {
                /* getNodeValue() is null when the first child is itself an element;
                   keep the empty default instead of failing later on a null string. */
                String rawAuthor = childElement.getFirstChild().getNodeValue();
                if (rawAuthor != null) {
                    bookAuthor = rawAuthor;
                }
            }
            break;
        }
    }
    bookAuthor = bookAuthor.replace(",", "").replace("%", "").replace("等", "");
    if (bookAuthor.length() > 64) {
        bookAuthor = bookAuthor.substring(0, 64);
    }
    return bookAuthor.trim();
}
/**
 * Looks up the cover image address inside a picture container.
 * <p>
 * Only the first child node named {@code "A"} is examined; inside it, the
 * {@code src} attribute of the first child node named {@code "IMG"} is returned.
 *
 * @param bookElement element whose children are searched
 * @return the image address, or an empty string when no such link or image is present
 */
public String getBookImage(Element bookElement) {
    NodeList outer = bookElement.getChildNodes();
    Node link = null;
    /* Locate the first node named "A"; later ones are never considered. */
    for (int pos = 0; pos < outer.getLength() && link == null; pos++) {
        Node candidate = outer.item(pos);
        if ("A".equals(candidate.getNodeName())) {
            link = candidate;
        }
    }
    if (link == null || link.getNodeType() != Node.ELEMENT_NODE) {
        return "";
    }
    NodeList inner = ((Element) link).getChildNodes();
    for (int pos = 0; pos < inner.getLength(); pos++) {
        Node candidate = inner.item(pos);
        if (!"IMG".equals(candidate.getNodeName())) {
            continue;
        }
        /* The first image decides the result. */
        String source = ((Element) candidate).getAttribute("src");
        return source != null ? source : "";
    }
    return "";
}
/**
 * Reads the book title from a title element.
 * <p>
 * The first child element whose {@code class} attribute is {@code "medium"} is used;
 * the value of its first child node, when present, is the title. The title is cut to
 * at most 64 characters and trimmed.
 *
 * @param bookElement element whose children are searched
 * @return the title, or an empty string when none is found
 */
public String getBookName(Element bookElement) {
    String title = "";
    NodeList kids = bookElement.getChildNodes();
    int pos = 0;
    while (pos < kids.getLength()) {
        Node kid = kids.item(pos++);
        if (kid.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element candidate = (Element) kid;
        if (!"medium".equals(candidate.getAttribute("class"))) {
            continue;
        }
        /* The first "medium" element decides the result, even when it carries no text. */
        Node first = candidate.getFirstChild();
        if (first != null && first.getNodeValue() != null) {
            title = first.getNodeValue().trim();
        }
        break;
    }
    String limited = title.length() > 64 ? title.substring(0, 64) : title;
    return limited.trim();
}
/**
* Function: getBookPrice
* Description: 获得当地图书价格
* Calls: no
* Called By: mainService
* @param bookElement as Element
* @return String
* @throws no
*/
public String getBookPrice(Element bookElement){
String bookPrice="";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -