⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dangdangparser.java

📁 本系统实现了从五个网站上搜索的图书进行整合后
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.booksearch.service.htmlparser;
/************************************************************
FileName: Dangdangparser.java
Author: fengguang 
Date:11/09/08
Description: 根据检索关键字到www.dangdang.com抽取匹配内容
Class List: Dangdangparser
***********************************************************/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
 * Class:Dangdangparser
 * Description: 根据检索关键字到www.dangdang.com抽取匹配内容
 * extends:no
 * implements:HtmlParser<Element>
 * @author  feng guang
 * @since   11/09/08
 */
public class Dangdangparser implements HtmlParser<Element>{

	/*存放本网站某一页的记录*/
	private ArrayList<Book> list;
	/*记录日志*/
	private static final Logger logger;
	
	static 
    {
        logger = Logger.getLogger(com.booksearch.service.htmlparser.Dangdangparser.class);
    }
	/**
	 * Downloads the page at the given address and parses it into a DOM tree
	 * with the NekoHTML parser.
	 *
	 * <p>The response body is decoded as gb2312, and the same value is set as
	 * the parser's default encoding. Both the connect and the read phase are
	 * limited to 30 seconds, and the reader over the response stream is always
	 * closed before the method returns.
	 *
	 * @param url absolute address of the page to fetch
	 * @return the DOM document built by the parser
	 * @throws Exception if the URL is malformed, the connection fails or times
	 *         out, or the parser reports an error
	 */
	public Document nekohtmlParser(String url)throws Exception{
		/* Create the HTML parser and set its default encoding. */
		DOMParser parser = new DOMParser();
		parser.setProperty("http://cyberneko.org/html/properties/default-encoding","gb2312");

		/* Open the connection; bound both phases so a dead host cannot block forever. */
		URLConnection urlconn = new URL(url).openConnection();
		urlconn.setConnectTimeout(30000);
		urlconn.setReadTimeout(30000);

		/* Wrap the byte stream in a character reader that decodes gb2312. */
		BufferedReader in = new BufferedReader(new InputStreamReader(urlconn.getInputStream(),"gb2312"));
		try {
			parser.parse(new InputSource(in));
		} finally {
			try {
				in.close();
			} catch (IOException closeError) {
				/* A failure while closing must not hide a parse error or discard a parsed document. */
				logger.warn("Failed to close the response stream of " + url, closeError);
			}
		}

		return parser.getDocument();
	}
	/**
	 * Extracts the book records from a parsed result page.
	 *
	 * <p>Every {@code <div class="list_r_list">} is treated as one record. Its
	 * direct element children are dispatched by class attribute or tag name to
	 * the matching getter (cover image, name and detail URL, author, publisher,
	 * summary, publish time, prices). A record is kept only when a book name
	 * was set.
	 *
	 * <p>The result list is also stored in the {@code list} field, which is
	 * replaced on every call.
	 *
	 * @param doc  DOM document of the result page
	 * @param flag when {@code true}, each record triggers a second request
	 *             (after a 2 second pause) through {@code DangdangparserSec}
	 *             to look up the ISBN; a failure of that request is logged and
	 *             the record is kept without an ISBN
	 * @return the books found on the page, possibly empty
	 */
	public ArrayList<Book> mainService(Document doc,boolean flag){

		list = new ArrayList<Book>();

		/* Collect every <div> and keep only the record containers. */
		NodeList servers = doc.getElementsByTagName("div");
		for (int i = 0; i < servers.getLength(); i++) {

			Element serveritem = (Element) servers.item(i);
			if (!"list_r_list".equals(serveritem.getAttribute("class"))) {
				continue;
			}

			/* Holders for the values extracted from this record. */
			Book book = new Book();
			Price price = new Price();

			NodeList childList = serveritem.getChildNodes();
			for (int j = 0; j < childList.getLength(); j++) {

				Node childNode = childList.item(j);
				/* Only element children carry record data. */
				if (childNode.getNodeType() != Node.ELEMENT_NODE) {
					continue;
				}

				Element childElement = (Element) childNode;
				String tagName = childElement.getTagName();
				String cssClass = childElement.getAttribute("class");
				/*
				 * Text of the first child, used to tell the two kinds of <H4> apart.
				 * It is null when the element is empty or starts with another element;
				 * such an <H4> matches neither label instead of raising a NullPointerException.
				 */
				Node firstChild = childElement.getFirstChild();
				String leadingText = (firstChild == null) ? null : firstChild.getNodeValue();

				if ("list_r_list_book".equals(cssClass)) {
					/* Cover image address. */
					book.setBookImage(this.getBookImage(childElement));

				} else if ("H2".equals(tagName)) {
					/* Book name and link to the detail page. */
					String bookName = this.getBookName(childElement);
					String bookUrl = this.getBookUrl(childElement);
					if (flag) {
						try {
							Thread.sleep(2000);
							DangdangparserSec dangdangSec = new DangdangparserSec();
							String bookISBN = dangdangSec.getBookISBNSec(bookUrl);
							book.setBookISBN(bookISBN);
						} catch (InterruptedException e) {
							/* Keep the interrupt visible to the caller instead of swallowing it. */
							Thread.currentThread().interrupt();
							logger.error("==========当当网二次请求解析" + bookUrl + "时出错" + "==========" + e, e);
						} catch (Exception e) {
							/* Best effort: the record is kept without an ISBN. */
							logger.error("==========当当网二次请求解析" + bookUrl + "时出错" + "==========" + e, e);
						}
					}
					book.setBookName(bookName);
					price.setDangdangUrl(bookUrl);

				} else if ("list_r_list_h4".equals(cssClass)) {
					/* Author. */
					book.setBookAuthor(this.getBookAuthor(childElement));

				} else if ("H4".equals(tagName)
						&& leadingText != null && leadingText.indexOf("出版社") != -1) {
					/* Publisher. */
					book.setBookPublisher(this.getBookPublisher(childElement));

				} else if ("H5".equals(tagName)) {
					/* Summary text. */
					book.setBookProspectus(this.getBookContent(childElement));

				} else if ("H4".equals(tagName)
						&& leadingText != null && leadingText.indexOf("出版时间") != -1) {
					/* Publish time. */
					String bookPublishTime = this.getBookPublishTime(childElement);
					if (null != bookPublishTime && !"".equals(bookPublishTime)) {
						book.setBookPublishTime(bookPublishTime);
					}

				} else if ("H6".equals(tagName)) {
					/*
					 * List price, site price and discount. Each value is parsed on its own
					 * so one malformed number neither loses the others nor aborts the page.
					 */
					String bookFixPrice = this.getBookFixPrice(childElement);
					if (null != bookFixPrice && !"".equals(bookFixPrice)) {
						try {
							book.setBookFixPrice(Double.valueOf(bookFixPrice.trim()));
						} catch (NumberFormatException e) {
							logger.warn("Unparsable list price '" + bookFixPrice + "'", e);
						}
					}
					String bookPrice = this.getBookPrice(childElement);
					if (null != bookPrice && !"".equals(bookPrice)) {
						try {
							price.setDangdangPrice(Double.valueOf(bookPrice));
						} catch (NumberFormatException e) {
							logger.warn("Unparsable price '" + bookPrice + "'", e);
						}
					}
					String bookDiscount = this.getBookDiscount(childElement);
					if (null != bookDiscount && !"".equals(bookDiscount)) {
						try {
							price.setDangdangDiscount(Float.valueOf(bookDiscount));
						} catch (NumberFormatException e) {
							logger.warn("Unparsable discount '" + bookDiscount + "'", e);
						}
					}
				}
			}

			/* Keep the record only when a name was extracted. */
			if (book.getBookName() != null) {
				book.setPrice(price);
				list.add(book);
			}
		}
		return list;

	}
	/**
	 * Returns the cover image address of a record.
	 *
	 * <p>The address is the {@code src} attribute of the first element child
	 * of the first element child of {@code bookElement}. Text nodes (for
	 * example whitespace between tags) in front of those elements are skipped.
	 *
	 * @param bookElement the element that wraps the cover link
	 * @return the {@code src} value, or an empty string when the expected
	 *         nesting or the attribute is missing
	 */
	public  String  getBookImage(Element bookElement){

		/* First element child: the wrapper around the image. */
		Node wrapper = bookElement.getFirstChild();
		while (wrapper != null && wrapper.getNodeType() != Node.ELEMENT_NODE) {
			wrapper = wrapper.getNextSibling();
		}
		if (wrapper == null) {
			return "";
		}

		/* First element child of the wrapper: the image itself. */
		Node image = wrapper.getFirstChild();
		while (image != null && image.getNodeType() != Node.ELEMENT_NODE) {
			image = image.getNextSibling();
		}
		if (image == null) {
			return "";
		}

		return ((Element) image).getAttribute("src");
	}
	/**
	 * Returns the book name held by the link inside a title element.
	 *
	 * <p>The link is the first element child of {@code bookElement}. The name
	 * is built from the link's children in document order: a non-element
	 * child contributes its own value, an element child contributes the value
	 * of its first child. Children that yield no value are skipped, so the
	 * word "null" never ends up in the name. All space characters are then
	 * removed, the result is trimmed and cut to at most 64 characters.
	 *
	 * @param bookElement the title element containing the link
	 * @return the cleaned name, or an empty string when no link element exists
	 */
	public  String getBookName(Element bookElement){

		/* Locate the link element, skipping any text nodes in front of it. */
		Node anchor = bookElement.getFirstChild();
		while (anchor != null && anchor.getNodeType() != Node.ELEMENT_NODE) {
			anchor = anchor.getNextSibling();
		}
		if (anchor == null) {
			return "";
		}

		StringBuilder collected = new StringBuilder();
		NodeList parts = anchor.getChildNodes();
		for (int i = 0; i < parts.getLength(); i++) {
			Node part = parts.item(i);
			/* An element contributes the value of its first child; any other node its own value. */
			Node textSource = (part.getNodeType() == Node.ELEMENT_NODE) ? part.getFirstChild() : part;
			String text = (textSource == null) ? null : textSource.getNodeValue();
			if (text != null) {
				collected.append(text);
			}
		}

		String bookName = collected.toString().replace(" ", "").trim();
		if (bookName.length() > 64) {
			bookName = bookName.substring(0, 64);
		}
		return bookName;
	}
	/**
	 * Returns the author text of a record.
	 *
	 * <p>Every direct {@code A} child of {@code bookElement} is read. Each of
	 * its children contributes one fragment followed by a space: a non-element
	 * child contributes its own value, an element child the value of its first
	 * child. Children that yield no value are skipped. The trailing separator
	 * is removed, the text is cut to at most 64 characters, commas are turned
	 * into spaces and the characters "等", "著" and "编" are removed.
	 *
	 * @param bookElement the element holding the author links
	 * @return the cleaned author text, or an empty string when nothing is found
	 */
	public  String getBookAuthor(Element bookElement){

		StringBuilder collected = new StringBuilder();
		NodeList authorList = bookElement.getChildNodes();
		for (int i = 0; i < authorList.getLength(); i++) {

			Node linkNode = authorList.item(i);
			if (linkNode.getNodeType() != Node.ELEMENT_NODE || !"A".equals(linkNode.getNodeName())) {
				continue;
			}

			NodeList parts = linkNode.getChildNodes();
			for (int j = 0; j < parts.getLength(); j++) {
				Node part = parts.item(j);
				/* An element contributes the value of its first child; any other node its own value. */
				Node textSource = (part.getNodeType() == Node.ELEMENT_NODE) ? part.getFirstChild() : part;
				String text = (textSource == null) ? null : textSource.getNodeValue();
				if (text != null) {
					collected.append(text).append(' ');
				}
			}
		}

		String bookAuthor = collected.toString();
		/* Every fragment ends with a separator space; drop the last one. */
		if (bookAuthor.length() > 0) {
			bookAuthor = bookAuthor.substring(0, bookAuthor.length() - 1);
		}
		if (bookAuthor.length() > 64) {
			bookAuthor = bookAuthor.substring(0, 64);
		}

		bookAuthor = bookAuthor.replace(",", " ");
		/*
		 * NOTE(review): the original repeated the ASCII-comma replacement twice, which
		 * made the second call a no-op. It presumably targeted the full-width comma
		 * (U+FF0C) and lost it in an encoding conversion - confirm against real pages.
		 */
		bookAuthor = bookAuthor.replace("\uFF0C", " ");
		bookAuthor = bookAuthor.replace("等", "");
		bookAuthor = bookAuthor.replace("著", "");
		bookAuthor = bookAuthor.replace("编", "");
		return bookAuthor;
	}
	/**
	 * Returns the publisher name of a record.
	 *
	 * <p>Every direct {@code A} child of {@code bookElement} is read:
	 * <ul>
	 *   <li>a link with several children appends one fragment per child to the
	 *       result (a non-element child contributes its own value, an element
	 *       child the value of its first child);</li>
	 *   <li>a link with a single child replaces the result with that child's
	 *       value, or, when the child is a {@code FONT} element, with the
	 *       value of the {@code FONT}'s first child.</li>
	 * </ul>
	 * Children that yield no value are ignored, so the result is never null
	 * and never contains the word "null".
	 *
	 * @param bookElement the element holding the publisher link
	 * @return the trimmed publisher name, or an empty string when nothing is found
	 */
	public  String getBookPublisher(Element bookElement){

		StringBuilder publisher = new StringBuilder();
		NodeList children = bookElement.getChildNodes();
		for (int i = 0; i < children.getLength(); i++) {

			Node linkNode = children.item(i);
			if (linkNode.getNodeType() != Node.ELEMENT_NODE || !"A".equals(linkNode.getNodeName())) {
				continue;
			}

			NodeList parts = linkNode.getChildNodes();
			int partCount = parts.getLength();
			if (partCount == 0) {
				continue;
			}

			if (partCount > 1) {
				/* Mixed content: concatenate the text of every fragment. */
				for (int j = 0; j < partCount; j++) {
					Node part = parts.item(j);
					Node textSource = (part.getNodeType() == Node.ELEMENT_NODE) ? part.getFirstChild() : part;
					String text = (textSource == null) ? null : textSource.getNodeValue();
					if (text != null) {
						publisher.append(text);
					}
				}
			} else {
				/* Single child: plain text, or text wrapped in one FONT element. */
				Node only = parts.item(0);
				Node textSource = "FONT".equals(only.getNodeName()) ? only.getFirstChild() : only;
				String text = (textSource == null) ? null : textSource.getNodeValue();
				if (text != null) {
					/* This branch replaces, rather than extends, what was collected so far. */
					publisher.setLength(0);
					publisher.append(text);
				}
			}
		}

		return publisher.toString().trim();
	}
	/**
	* Function:  getBookPublishTime

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -