⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zhuoyueparser.java

📁 本系统实现了从五个网站上搜索的图书进行整合后
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.booksearch.service.htmlparser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;

import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;


import com.booksearch.orm.Book;
import com.booksearch.orm.Price;

/**
 * Class: Zhuoyueparser
 * Description: Extracts the content matching a search keyword from http://www.amazon.cn/
 * extends: none
 * implements: HtmlParser<Element>
 * @author wangchao
 * @since 11/09/08
 */
public class Zhuoyueparser implements HtmlParser<Element> {

	/* Records extracted from one page of this site; re-created on every mainService call. */
	private ArrayList<Book> list;	
	
	/* URL most recently passed to nekohtmlParser. */
	private String url;	
	
	/* Logger used by this parser. */
	private static final Logger logger;
	
	/* Bind the logger to this class when the class is initialised. */
	static 
    {
        logger = Logger.getLogger(com.booksearch.service.htmlparser.Zhuoyueparser.class);
    }
	/**
	 * Function: nekohtmlParser
	 * Description: Downloads the page at the given URL and parses it with the
	 *              NekoHTML parser into a DOM document.
	 * @param url address of the page to fetch; it is also stored in this.url
	 * @return Document the DOM tree built from the downloaded page
	 * @throws Exception on a malformed URL, an I/O or parse failure, or when the
	 *                   thread is interrupted during the pause before the request
	 */
	public Document nekohtmlParser(String url) throws Exception {
		this.url = url;
		System.out.println(this.url);
		/* Create the HTML parser. */
		DOMParser parser = new DOMParser();
		/* Set the parser's default page encoding. */
		parser.setProperty(
				"http://cyberneko.org/html/properties/default-encoding",
				"utf-8");

		URL u = new URL(this.url);
		/* Pause five seconds before the request; sleep is static, so call it on Thread. */
		Thread.sleep(5000);
		/* Open the connection to the source site. */
		URLConnection urlconn = u.openConnection();
		/*
		 * NOTE(review): no connect/read timeout is configured, so a stalled server
		 * blocks this call indefinitely. The original kept setReadTimeout(30000) and
		 * setConnectTimeout(30000) commented out - confirm whether to enable them.
		 */
		/* Read the response bytes as utf-8 characters. */
		BufferedReader in = new BufferedReader(new InputStreamReader(urlconn.getInputStream(), "utf-8"));
		try {
			/* Parse the stream and hand back the resulting DOM. */
			parser.parse(new InputSource(in));
			return parser.getDocument();
		} finally {
			/* Always release the connection stream; the original never closed it. */
			try {
				in.close();
			} catch (IOException e) {
				/* A failed close must not hide the parse result or the parse error. */
				logger.warn("Failed to close the stream of " + this.url, e);
			}
		}
	}

	/**
	 * Function: mainService
	 * Description: Walks the given DOM and builds one Book (with its Price) for
	 *              every {@code <div id="product">} node.
	 * Calls: getBookImage(), getBookUrl(), getBookName(), getBookAuthor(),
	 *        getBookPublisher(), getBookPublishTime(), getBookFixPrice(), getBookPrice()
	 * @param doc  parsed search-result page
	 * @param flag when true, each book's detail page is fetched through
	 *             ZhuoyueparserSec to fill in the ISBN and prospectus; after the
	 *             first failed fetch the remaining books skip that step
	 * @return ArrayList<Book> the books found on the page (also kept in this.list);
	 *         a product whose title node was not found is left out
	 * @throws NumberFormatException if a non-empty price string is not a valid number
	 */
	public ArrayList<Book> mainService(Document doc,boolean flag) {

		list = new ArrayList<Book>();

		/* Collect every <div> node. */
		NodeList servers = doc.getElementsByTagName("div");
		for (int i = 0; (i < servers.getLength()
		                && Node.ELEMENT_NODE == servers.item(i).getNodeType()); i++) {
			Element serveritem = (Element) servers.item(i);
			/* Only <div id="product"> nodes hold a record. */
			if (!"product".equals(serveritem.getAttribute("id"))) {
				continue;
			}

			Book book = new Book();
			Price price = new Price();
			NodeList productChildren = serveritem.getChildNodes();
			for (int j = 0; j < productChildren.getLength(); j++) {
				Node productChild = productChildren.item(j);
				if (productChild.getNodeType() != Node.ELEMENT_NODE) {
					continue;
				}
				Element picElement = (Element) productChild;
				if (!"product-pic".equals(picElement.getAttribute("id"))) {
					continue;
				}

				/* Cover image address. */
				book.setBookImage(this.getBookImage(picElement));

				/*
				 * Jump to the node that carries title, publisher and price.
				 * NOTE(review): assumes it sits four child nodes after the picture
				 * container - confirm against the current page markup.
				 */
				j += 4;
				if (j >= productChildren.getLength()
						|| Node.ELEMENT_NODE != productChildren.item(j).getNodeType()) {
					continue;
				}

				NodeList detailNodes = ((Element) productChildren.item(j)).getChildNodes();
				for (int r = 0; r < detailNodes.getLength(); r++) {
					Node detailNode = detailNodes.item(r);
					if (detailNode.getNodeType() != Node.ELEMENT_NODE) {
						continue;
					}
					Element detail = (Element) detailNode;
					String cssClass = detail.getAttribute("class");

					if ("ProductTitle".equals(cssClass)) {
						/* Detail-page URL, title and author. */
						String bookUrl = this.getBookUrl(detail);
						price.setZhuoyueUrl(bookUrl);
						book.setBookName(this.getBookName(detail));
						book.setBookAuthor(this.getBookAuthor(detail));

						if (flag) {
							try {
								/* Pause before the second request to the site. */
								Thread.sleep(5000);
								ZhuoyueparserSec sec = new ZhuoyueparserSec();
								Document tempdoc = sec.nekohtmlParserSec(bookUrl);
								book.setBookISBN(sec.getBookISBNSec(tempdoc));
								book.setBookProspectus(sec.getBookContentSec(tempdoc));
							} catch (Exception e) {
								if (e instanceof InterruptedException) {
									/* Keep the interrupt visible to the caller. */
									Thread.currentThread().interrupt();
								}
								/* Pass the exception itself so the stack trace is logged. */
								logger.error("==========卓越二次请求解析" + bookUrl + "时出错" + "==========", e);
								/* Stop issuing second requests for the rest of this page. */
								flag = false;
							}
						}
					} else if ("Company".equals(cssClass)) {
						/* Publisher and publication time. */
						book.setBookPublisher(this.getBookPublisher(detail));
						book.setBookPublishTime(this.getBookPublishTime(detail));
					} else if ("PriceArea".equals(cssClass)) {
						/* List price. */
						String bookFixPrice = this.getBookFixPrice(detail);
						boolean hasFixPrice = null != bookFixPrice && !"".equals(bookFixPrice);
						if (hasFixPrice) {
							book.setBookFixPrice(Double.valueOf(bookFixPrice.trim()));
						}

						/* Selling price on this site. */
						String bookPrice = this.getBookPrice(detail);
						boolean hasPrice = null != bookPrice && !"".equals(bookPrice);
						if (hasPrice) {
							price.setZhuoyuePrice(Double.valueOf(bookPrice));
						}

						if (hasFixPrice && hasPrice) {
							double bookDiscount = (Double.valueOf(bookPrice)) / (Double.valueOf(bookFixPrice));
							/* A zero list price gives an infinite/NaN ratio; there is no discount to store then. */
							if (!Double.isNaN(bookDiscount) && !Double.isInfinite(bookDiscount)) {
								/* Fixed symbols: a default-locale decimal comma would break Float.valueOf. */
								DecimalFormat df = new DecimalFormat("####.00", new DecimalFormatSymbols(Locale.US));
								price.setZhuoyueDiscount((Float.valueOf(df.format(bookDiscount))));
							}
						}
					}
				}
			}

			/*
			 * Add the record once per product. Doing this inside the child loop
			 * appended the same book again for every later element sibling.
			 */
			if (book.getBookName() != null) {
				book.setPrice(price);
				list.add(book);
			}
		}
		return list;
	}
	/**
	 * Function: getDetailInfo
	 * Description: Reads the detail-page URL, selling price and discount from the
	 *              first {@code <div id="product">} node of the given page.
	 * Calls: getBookUrl(), getBookFixPrice(), getBookPrice()
	 * @param doc parsed page
	 * @return Price the extracted values; fields stay unset when the matching
	 *         nodes are not found
	 * @throws NumberFormatException if a non-empty price string is not a valid number
	 */
	public Price getDetailInfo(Document doc) {
		Price price = new Price();
		/* Collect every <div> node. */
		NodeList servers = doc.getElementsByTagName("div");
		for (int i = 0; (i < servers.getLength()
		                && Node.ELEMENT_NODE == servers.item(i).getNodeType()); i++) {
			Element serveritem = (Element) servers.item(i);
			/* Only a <div id="product"> node holds the record. */
			if (!"product".equals(serveritem.getAttribute("id"))) {
				continue;
			}

			NodeList productChildren = serveritem.getChildNodes();
			for (int j = 0; j < productChildren.getLength(); j++) {
				Node productChild = productChildren.item(j);
				if (productChild.getNodeType() != Node.ELEMENT_NODE) {
					continue;
				}
				Element content = (Element) productChild;
				/* The details live in the child with id="product-content". */
				if (!"product-content".equals(content.getAttribute("id"))) {
					continue;
				}

				NodeList detailNodes = content.getChildNodes();
				for (int r = 0; r < detailNodes.getLength(); r++) {
					Node detailNode = detailNodes.item(r);
					if (detailNode.getNodeType() != Node.ELEMENT_NODE) {
						continue;
					}
					Element detail = (Element) detailNode;
					String cssClass = detail.getAttribute("class");

					if ("ProductTitle".equals(cssClass)) {
						/* Detail-page URL. */
						String bookUrl = this.getBookUrl(detail);
						price.setZhuoyueUrl(bookUrl);
					} else if ("PriceArea".equals(cssClass)) {
						/* List price is only needed for the discount ratio. */
						String bookFixPrice = this.getBookFixPrice(detail);

						/* Selling price on this site. */
						String bookPrice = this.getBookPrice(detail);
						boolean hasPrice = null != bookPrice && !"".equals(bookPrice);
						if (hasPrice) {
							price.setZhuoyuePrice(Double.valueOf(bookPrice));
						}

						if (hasPrice && null != bookFixPrice && !"".equals(bookFixPrice)) {
							double bookDiscount = (Double.valueOf(bookPrice)) / (Double.valueOf(bookFixPrice));
							/* A zero list price gives an infinite/NaN ratio; there is no discount to store then. */
							if (!Double.isNaN(bookDiscount) && !Double.isInfinite(bookDiscount)) {
								/* Fixed symbols: a default-locale decimal comma would break Float.valueOf. */
								DecimalFormat df = new DecimalFormat("####.00", new DecimalFormatSymbols(Locale.US));
								price.setZhuoyueDiscount((Float.valueOf(df.format(bookDiscount))));
							}
						}
					}
				}
				/* Only the first product-content node is read. */
				break;
			}
			/* Only the first product node is read. */
			break;
		}
		return price;
	}
	/**
	 * Function: getBookAuthor
	 * Description: Reads the book author from a title node.
	 *              The value is the node value of the first child of the first
	 *              element child whose class attribute is "author". Every ",", "%"
	 *              and "等" is removed, the text is cut to at most 64 characters
	 *              and then trimmed.
	 * Calls: no
	 * Called By: mainService
	 * @param bookElement element whose children are searched
	 * @return String the cleaned author text, or "" when there is no author
	 *         element or its first child carries no text value
	 */
	public String getBookAuthor(Element bookElement) {

		String bookAuthor = "";
		NodeList childList = bookElement.getChildNodes();
		/* Scan the children for the author element. */
		for (int i = 0; i < childList.getLength(); i++) {
			Node childNode = childList.item(i);
			if (childNode.getNodeType() != Node.ELEMENT_NODE) {
				continue;
			}
			Element childElement = (Element) childNode;
			if ("author".equals(childElement.getAttribute("class"))) {
				Node first = childElement.getFirstChild();
				/*
				 * getNodeValue() is null when the first child is itself an element;
				 * keep "" in that case instead of failing below (same guard as getBookName).
				 */
				if (first != null && first.getNodeValue() != null) {
					bookAuthor = first.getNodeValue();
				}
				break;
			}
		}
		/* Strip separators; these are literals, so plain replace is enough. */
		bookAuthor = bookAuthor.replace(",", "").replace("%", "").replace("等", "");
		if (bookAuthor.length() > 64) {
			bookAuthor = bookAuthor.substring(0, 64);
		}
		return bookAuthor.trim();
	}

	/**
	 * Function: getBookImage
	 * Description: Reads the cover image address. Only the first child named "A"
	 *              is inspected; the result is the src attribute of the first
	 *              "IMG" child found inside it.
	 * Calls: no
	 * Called By: mainService
	 * @param bookElement element whose children are searched
	 * @return String the src value, or "" when no such link or image exists
	 */
	public String getBookImage(Element bookElement) {

		/* Locate the first child named "A". */
		Node anchor = bookElement.getFirstChild();
		while (anchor != null && !"A".equals(anchor.getNodeName())) {
			anchor = anchor.getNextSibling();
		}
		if (anchor == null || anchor.getNodeType() != Node.ELEMENT_NODE) {
			return "";
		}

		/* Inside that link, the first "IMG" child decides the result. */
		for (Node candidate = anchor.getFirstChild(); candidate != null; candidate = candidate.getNextSibling()) {
			if ("IMG".equals(candidate.getNodeName())) {
				String src = ((Element) candidate).getAttribute("src");
				return (null != src) ? src : "";
			}
		}
		return "";
	}
	/**
	 * Function: getBookName
	 * Description: Reads the book title from a title node.
	 *              The value is the trimmed node value of the first child of the
	 *              first element child whose class attribute is "medium", cut to
	 *              at most 64 characters and trimmed again.
	 * Calls: no
	 * Called By: mainService
	 * @param bookElement element whose children are searched
	 * @return String the title, or "" when there is no such element or its first
	 *         child carries no text value
	 */
	public String getBookName(Element bookElement) {

		String bookName = "";

		/* Walk the children until the title element is found. */
		for (Node node = bookElement.getFirstChild(); node != null; node = node.getNextSibling()) {
			if (node.getNodeType() != Node.ELEMENT_NODE) {
				continue;
			}
			if (!"medium".equals(((Element) node).getAttribute("class"))) {
				continue;
			}
			/* Only the first matching element is considered, with or without text. */
			Node first = node.getFirstChild();
			if (first != null && null != first.getNodeValue()) {
				bookName = first.getNodeValue().trim();
			}
			break;
		}

		/* Cut to 64 characters, then drop any whitespace left at the ends. */
		String limited = (bookName.length() > 64) ? bookName.substring(0, 64) : bookName;
		return limited.trim();
	}

	/**
	* Function:  getBookPrice
	* Description:  获得当地图书价格
	* Calls: no
	* Called By:  mainService
	* @param bookElement as Element
	* @return String
	* @throws no
	*/
	public String getBookPrice(Element bookElement){
		
	   String bookPrice="";

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -