⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bookschinaparser.java

📁 本系统实现了从五个网站上搜索的图书进行整合后
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.booksearch.service.htmlparser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
 * Class:BookschinaParser
 * Description: 根据检索关键字到www.bookschina.cn抽取匹配内容
 * extends:no
 * implements:HtmlParser<Element>
 * @author  wang jiaqiang
 * @since   11/09/08
 */
public class BookschinaParser implements HtmlParser<Element> {

	/* 存放本网站某一页的记录 */
	private ArrayList<Book> list;
    
//	private static final Logger logger;
	
//	static 
//    {
//        logger = Logger.getLogger(com.booksearch.service.htmlparser.BookschinaParser.class);
//    }
	/**
	 * Downloads the page at the given URL and parses it into a DOM document
	 * with the NekoHTML {@code DOMParser}.
	 *
	 * <p>Every space in {@code url} is replaced with {@code +} before the
	 * connection is opened. The response bytes are decoded as gb2312, which is
	 * also configured as the parser's default encoding. The network stream is
	 * always closed before this method returns, whether or not parsing
	 * succeeded.
	 *
	 * @param url address of the page to fetch and parse
	 * @return the DOM document built by the parser
	 * @throws Exception if the URL is malformed, the connection or read fails
	 *         or times out, or the parser rejects the input
	 */
	public Document nekohtmlParser(String url)throws Exception{
		/* Create the HTML parser and set its default page encoding. */
		DOMParser parser = new DOMParser();
		parser.setProperty(
					"http://cyberneko.org/html/properties/default-encoding",
					"gb2312");

		url = url.replace(" ", "+");
		URL u = new URL(url);

		/* Open the connection; bound both the connect and the read phase so a
		 * dead host cannot block the caller indefinitely. */
		URLConnection urlconn = u.openConnection();
		urlconn.setConnectTimeout(30000);
		urlconn.setReadTimeout(30000);

		/* Fully qualified so the file's import list does not need to change. */
		java.io.InputStream rawStream = urlconn.getInputStream();
		try {
			/* Decode the byte stream as gb2312 and hand it to the parser. */
			BufferedReader in = new BufferedReader(new InputStreamReader(rawStream, "gb2312"));
			parser.parse(new InputSource(in));
		} finally {
			/* Closing the underlying stream releases the connection even when
			 * the reader could not be created or parsing failed. */
			rawStream.close();
		}

		return parser.getDocument();
	}
	/**
	 * Extracts the book records from one parsed result page.
	 *
	 * <p>Every {@code <div>} of the document is visited in document order. A
	 * {@code <div class="pic">} supplies the cover image; the {@code <div>}
	 * immediately after it is always consumed, and is read as that book's
	 * detail block when its class is {@code "xunhuan"}. The resulting list is
	 * also stored in the {@code list} field.
	 *
	 * <p>Missing or non-element nodes are skipped instead of raising
	 * {@code NullPointerException} / {@code ClassCastException}.
	 *
	 * @param doc  DOM of the result page
	 * @param flag not read by this method
	 * @return the books found on the page, possibly empty
	 * @throws NumberFormatException if a non-empty price text is not a number
	 */
	public ArrayList<Book> mainService(Document doc,boolean flag) {
		list = new ArrayList<Book>();

		/* Collect every <div> node. */
		NodeList servers = doc.getElementsByTagName("div");
		for (int i = 0; i < servers.getLength(); i++) {
			Node current = servers.item(i);
			if (!(current instanceof Element)) {
				continue;
			}
			Element picDiv = (Element) current;
			/* Only <div class="pic"> starts a record. */
			if (!"pic".equals(picDiv.getAttribute("class"))) {
				continue;
			}

			Book book = new Book();
			Price price = new Price();

			/* The image is looked up among the children of the first child of
			 * the pic div; a leading text node simply means no image. */
			Node wrapper = picDiv.getFirstChild();
			if (wrapper instanceof Element) {
				NodeList wrapperChildren = wrapper.getChildNodes();
				for (int k = 0; k < wrapperChildren.getLength(); k++) {
					Node candidate = wrapperChildren.item(k);
					if (candidate instanceof Element && "IMG".equals(candidate.getNodeName())) {
						book.setBookImage(this.getBookImage((Element) candidate));
					}
				}
			}

			/* Consume the next div. item() returns null past the end of the
			 * list, which the instanceof test below rejects. */
			i += 1;
			Node following = servers.item(i);
			if (following instanceof Element
					&& "xunhuan".equals(((Element) following).getAttribute("class"))) {
				parseListingEntry((Element) following, book, price);
			}

			/* NOTE(review): a book whose name was never set is still added
			 * unless Book initialises the name to "" - confirm against Book. */
			if (!"".equals(book.getBookName())) {
				book.setPrice(price);
				list.add(book);
			}
		}

		return list;
	}

	/**
	 * Copies name, URL, author, publisher, publish time, ISBN and prices from
	 * one {@code <div class="xunhuan">} into the given book and price.
	 */
	private void parseListingEntry(Element detailDiv, Book book, Price price) {
		NodeList rows = detailDiv.getChildNodes();
		for (int j = 0; j < rows.getLength(); j++) {
			Node row = rows.item(j);
			if (!(row instanceof Element)) {
				continue;
			}
			NodeList cdList = row.getChildNodes();
			/* Rows with fewer than eight children carry no book fields. */
			if (cdList.getLength() <= 7) {
				continue;
			}

			if (cdList.item(1) instanceof Element) {
				Element nameElement = (Element) cdList.item(1);
				book.setBookName(this.getBookName(nameElement));
				price.setBookschinaUrl(this.getBookUrl(nameElement));
			}
			if (cdList.item(3) instanceof Element) {
				book.setBookAuthor(this.getBookAuthor((Element) cdList.item(3)));
			}
			if (cdList.item(5) instanceof Element) {
				book.setBookPublisher(this.getBookPublisher((Element) cdList.item(5)));
			}
			Element seventhElement = null;
			if (cdList.item(7) instanceof Element) {
				seventhElement = (Element) cdList.item(7);
				String bookPublishTime = this.getBookPublishTime(seventhElement);
				if (null != bookPublishTime && !"".equals(bookPublishTime)) {
					book.setBookPublishTime(bookPublishTime);
				}
			}

			if (cdList.getLength() > 12) {
				/* Long layout: ISBN at 9, list price at 11, selling price at 13. */
				if (cdList.item(9) instanceof Element) {
					book.setBookISBN(this.getBookISBN((Element) cdList.item(9)));
				}
				/* item(13) is null when the row has exactly 13 children. */
				if (cdList.item(11) instanceof Element && cdList.item(13) instanceof Element) {
					applyListingPrices(book, price,
							(Element) cdList.item(11), (Element) cdList.item(13));
				}
			} else if (null != seventhElement && cdList.item(11) instanceof Element) {
				/* Short layout: the ISBN is read from item 7 and the prices sit
				 * at 9 and 11 - presumably rows without a separate publish-time
				 * cell; TODO confirm against the live page. */
				book.setBookISBN(this.getBookISBN(seventhElement));
				if (cdList.item(9) instanceof Element) {
					applyListingPrices(book, price,
							(Element) cdList.item(9), (Element) cdList.item(11));
				}
			}
		}
	}

	/**
	 * Sets the list price on the book and the selling price and discount on the
	 * price record, skipping any value whose source text is null or blank.
	 */
	private void applyListingPrices(Book book, Price price,
			Element fixPriceElement, Element priceElement) {
		String bookFixPrice = this.getBookFixPrice(fixPriceElement);
		String bookPrice = this.getBookPrice(priceElement);
		boolean hasFixPrice = null != bookFixPrice && bookFixPrice.trim().length() > 0;
		boolean hasPrice = null != bookPrice && bookPrice.trim().length() > 0;

		if (hasFixPrice) {
			book.setBookFixPrice(Double.valueOf(bookFixPrice));
		}
		if (hasPrice) {
			price.setBookschinaPrice(Double.valueOf(bookPrice));
		}
		if (!hasFixPrice || !hasPrice) {
			return;
		}

		double fixValue = Double.valueOf(bookFixPrice).doubleValue();
		if (fixValue == 0) {
			/* A zero list price would yield Infinity/NaN, which cannot be
			 * formatted and parsed back below. */
			return;
		}
		double discount = Double.valueOf(bookPrice).doubleValue() / fixValue;
		/* Fixed symbols: with the default locale the decimal separator may be
		 * a comma, which Float.valueOf rejects. */
		DecimalFormat df = new DecimalFormat("####.00",
				new java.text.DecimalFormatSymbols(java.util.Locale.US));
		price.setBookschinaDiscount(Float.valueOf(df.format(discount)));
	}
	/**
	 * Reads the URL, selling price and discount of a single book from a parsed
	 * page.
	 *
	 * <p>Only the first {@code <div class="xunhuan">} in document order is
	 * examined. Within it, each element child contributes a URL when its second
	 * child is an element, and prices when it has 12 children (prices at
	 * indices 9 and 11) or more than 12 (prices at indices 11 and 13). Missing
	 * nodes and null or blank price text are skipped rather than raising an
	 * exception.
	 *
	 * @param doc DOM of the page to read
	 * @return a new {@code Price}; fields stay at their defaults when the page
	 *         has no matching data
	 * @throws NumberFormatException if a non-empty price text is not a number
	 */
	public Price getDetailInfo(Document doc) {
		Price price = new Price();
		/* Collect every <div> node. */
		NodeList servers = doc.getElementsByTagName("div");
		for (int i = 0; i < servers.getLength(); i++) {
			Element serveritem = (Element) servers.item(i);
			if (!"xunhuan".equals(serveritem.getAttribute("class"))) {
				continue;
			}

			/* Walk the children of the <div class="xunhuan">. */
			NodeList childList = serveritem.getChildNodes();
			for (int j = 0; j < childList.getLength(); j++) {
				Node childNode = childList.item(j);
				if (!(childNode instanceof Element)) {
					continue;
				}
				NodeList cdList = childNode.getChildNodes();

				/* item(1) is null when there is only one child; instanceof
				 * covers both the bounds and the type check. */
				if (cdList.item(1) instanceof Element) {
					price.setBookschinaUrl(this.getBookUrl((Element) cdList.item(1)));
				}

				if (cdList.getLength() > 12) {
					/* item(13) is null when there are exactly 13 children. */
					if (cdList.item(11) instanceof Element
							&& cdList.item(13) instanceof Element) {
						applyDetailPrices(price,
								(Element) cdList.item(11), (Element) cdList.item(13));
					}
				} else if (cdList.getLength() == 12) {
					/* Index 11 only exists from length 12 on; testing for
					 * length 11 could never reach a valid node. */
					if (cdList.item(9) instanceof Element
							&& cdList.item(11) instanceof Element) {
						applyDetailPrices(price,
								(Element) cdList.item(9), (Element) cdList.item(11));
					}
				}
			}
			/* Only the first matching div is used. */
			break;
		}
		return price;
	}

	/**
	 * Sets the selling price and, when the list price is usable, the discount
	 * on the given price record.
	 */
	private void applyDetailPrices(Price price, Element fixPriceElement, Element priceElement) {
		String bookFixPrice = this.getBookFixPrice(fixPriceElement);
		String bookPrice = this.getBookPrice(priceElement);
		if (null == bookPrice || bookPrice.trim().length() == 0) {
			return;
		}
		price.setBookschinaPrice(Double.valueOf(bookPrice));

		if (null == bookFixPrice || bookFixPrice.trim().length() == 0) {
			return;
		}
		double fixValue = Double.valueOf(bookFixPrice).doubleValue();
		if (fixValue == 0) {
			/* A zero list price would yield Infinity/NaN, which cannot be
			 * formatted and parsed back below. */
			return;
		}
		double discount = Double.valueOf(bookPrice).doubleValue() / fixValue;
		/* Fixed symbols: with the default locale the decimal separator may be
		 * a comma, which Float.valueOf rejects. */
		DecimalFormat df = new DecimalFormat("####.00",
				new java.text.DecimalFormatSymbols(java.util.Locale.US));
		price.setBookschinaDiscount(Float.valueOf(df.format(discount)));
	}
	/**
	 * Returns the cover image address stored in an element's {@code src}
	 * attribute.
	 *
	 * @param bookElement element to read, normally the {@code IMG} node
	 * @return the trimmed {@code src} value, or an empty string when the
	 *         attribute is absent
	 */
	public String getBookImage(Element bookElement) {
		if (!bookElement.hasAttribute("src")) {
			return "";
		}
		/* Trimming an empty value is a no-op, so no length check is needed. */
		return bookElement.getAttribute("src").trim();
	}

	/**
	 * Function: getBookName 
	 * Description: 获得图书名称 
	 * Calls: no Called By:mainService 
	 * @param bookElement as Element
	 * @return String
	 * @throws no
	 */
	public String getBookName(Element bookElement) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -