⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 weilanparser.java

📁 本系统实现了从五个网站上搜索的图书进行整合后
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.booksearch.service.htmlparser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
 * Class: WeilanParser
 * Description: extracts the entries matching a search keyword from www.wl.cn
 * extends: no
 * implements: HtmlParser<Element>
 * @author  li chao
 * @since   11/09/08
 */
public class WeilanParser implements HtmlParser<Element> {
	
	private ArrayList<Book> list;	
	
	private String url = "";
	
	private static final Logger logger;
	
	static 
    {
        logger = Logger.getLogger(com.booksearch.service.htmlparser.WeilanParser.class);
    }
	/**
	 * Downloads the page at the given address and parses it into a DOM document
	 * with the NekoHTML parser.
	 * <p>
	 * The address is also stored in the {@code url} field, which
	 * {@code getNextPageUrl} later reads to build the link to the following page.
	 * The call pauses five seconds before the connection is opened, and the read
	 * timeout of the connection is 30 seconds.
	 *
	 * @param url address of the page to fetch
	 * @return the DOM document produced by the parser
	 * @throws Exception if the address is malformed, the thread is interrupted
	 *         during the pause, the download fails or times out, or parsing fails
	 */
	public Document nekohtmlParser(String url) throws Exception{
		this.url = url;
		DOMParser parser = new DOMParser();
		// encoding the parser should assume when the page itself does not say
		parser.setProperty(
					"http://cyberneko.org/html/properties/default-encoding",
					"UTF-8");
		URL u = new URL(this.url);
		// fixed pause before every request (presumably to throttle load on the remote site)
		Thread.sleep(5000);
		/* open the connection to the remote site */
		URLConnection urlConnection = u.openConnection();
		urlConnection.setReadTimeout(30000);
		/* read the response bytes as UTF-8 characters */
		BufferedReader inputStream = new BufferedReader(new InputStreamReader(
					urlConnection.getInputStream(), "UTF-8"));
		try {
			parser.parse(new InputSource(inputStream));
			return parser.getDocument();
		} finally {
			// always release the stream, also when parsing fails
			try {
				inputStream.close();
			} catch (IOException ignored) {
				// a failure while closing a stream that was only read from must not
				// replace the parse result or hide the exception raised by the parser
			}
		}
	}
	/**
	 * Extracts the books listed on a search-result page.
	 * <p>
	 * Every {@code div} whose id is {@code __search_centent} is treated as one
	 * result entry. Its child elements are dispatched on their {@code class} and
	 * {@code style} attributes and on their tag name to fill a {@link Book} and its
	 * {@link Price}. Entries for which no book name was found are dropped. The
	 * returned list is also kept in the {@code list} field.
	 *
	 * @param doc  DOM of the result page
	 * @param flag when {@code true}, the detail page of each book is additionally
	 *             requested (after a five-second pause) to obtain its ISBN; after
	 *             the first failed detail request no further ones are made during
	 *             this call
	 * @return the books found on the page, possibly empty
	 */
	public ArrayList<Book> mainService(Document doc,boolean flag){

		list = new ArrayList<Book>();
		/* all div elements of the page */
		NodeList divNodeList = doc.getElementsByTagName("div");
		for(int i=0;i<divNodeList.getLength();i++){
			Node divNode = divNodeList.item(i);
			// skip anything that is not a result-entry element instead of ending the scan
			if(Node.ELEMENT_NODE != divNode.getNodeType()
					|| !"__search_centent".equals(((Element)divNode).getAttribute("id"))){
				continue;
			}

			Book book = new Book();
			Price price = new Price();
			/* children of the result entry */
			NodeList contentNodeList = divNode.getChildNodes();
			for(int j=0;j<contentNodeList.getLength();j++){
				if(contentNodeList.item(j).getNodeType() != Node.ELEMENT_NODE){
					continue;
				}
				Element contentElement = (Element)contentNodeList.item(j);

				/* cover block: link to the detail page, optional ISBN lookup, image */
				if("pic".equals(contentElement.getAttribute("class"))){
					String weilanUrl = this.getBookUrl(contentElement);
					price.setWeilanUrl(weilanUrl);
					if(flag){
						try {
							Thread.sleep(5000);
							WeiLanParserSec tem = new WeiLanParserSec();
							String bookISBN = tem.getBookISBNSec(weilanUrl);
							book.setBookISBN(bookISBN);
						}catch (Exception e) {
							// one failure switches the detail lookups off for the rest of this call
							flag = false;
							if(e instanceof InterruptedException){
								// keep the interruption visible to the code that owns this thread
								Thread.currentThread().interrupt();
							}
							logger.error("==========蔚蓝网二次请求解析" + weilanUrl + "时出错" + "==========" +e);
						}
					}
					String pic = this.getBookImage(contentElement);
					book.setBookImage(pic);
				}
				/* title */
				if("ProductName".equals(contentElement.getAttribute("class"))){
					String bookName = this.getBookName(contentElement);
					book.setBookName(bookName);
				}
				/* author, publisher and publication time share one element */
				if("display: ".equals(contentElement.getAttribute("style"))){
					String bookAuthor = this.getBookAuthor(contentElement);
					book.setBookAuthor(bookAuthor);

					String bookPublisher = this.getBookPublisher(contentElement);
					book.setBookPublisher(bookPublisher);

					String bookPublishTime = this.getBookPublishTime(contentElement);
					if(null != bookPublishTime && !"".equals(bookPublishTime)){
						book.setBookPublishTime(bookPublishTime);
					}
				}
				/* summary text */
				if("margin-top: 10px; margin-bottom: 10px;".equals(contentElement
						.getAttribute("style"))){
					String bookProspectus = this.getBookContent(contentElement);
					book.setBookProspectus(bookProspectus);
				}
				/* list price, shop price and discount are looked up in every DIV child;
				   empty or missing values leave the corresponding field untouched */
				if("DIV".equals(contentElement.getTagName())){
					String bookFixPrice = this.getBookFixPrice(contentElement);
					if(null != bookFixPrice && !"".equals(bookFixPrice)){
						book.setBookFixPrice(Double.valueOf(bookFixPrice));
					}

					String weilanPrice = this.getBookPrice(contentElement);
					if(null != weilanPrice && !"".equals(weilanPrice)){
						price.setWeilanPrice(Double.valueOf(weilanPrice));
					}

					String weilanDiscount = this.getBookDiscount(contentElement);
					if(null != weilanDiscount && !"".equals(weilanDiscount)){
						price.setWeilanDiscount(Float.valueOf(weilanDiscount));
					}
				}
			}
			// an entry without a title is not reported
			if(book.getBookName() != null){
				book.setPrice(price);
				list.add(book);
			}
		}
		return list;
	}
	/**
	 * Reads link, shop price and discount of a single book from a result page.
	 * <p>
	 * Only the first {@code div} with id {@code __search_centent} is inspected.
	 * The {@code list} field is reset to an empty list as a side effect.
	 *
	 * @param doc DOM of the result page
	 * @return a {@link Price} carrying whatever was found; values that are missing
	 *         or empty on the page are not set
	 */
	public Price getDetailInfo(Document doc) {
		Price price = new Price();
		list = new ArrayList<Book>();

		NodeList divs = doc.getElementsByTagName("div");
		for (int idx = 0; idx < divs.getLength(); idx++) {
			Node candidate = divs.item(idx);
			if (Node.ELEMENT_NODE != candidate.getNodeType()) {
				// the scan ends at the first node that is not an element
				break;
			}
			if (!"__search_centent".equals(((Element) candidate).getAttribute("id"))) {
				continue;
			}

			NodeList children = candidate.getChildNodes();
			for (int c = 0; c < children.getLength(); c++) {
				Node child = children.item(c);
				if (Node.ELEMENT_NODE != child.getNodeType()) {
					continue;
				}
				Element entry = (Element) child;

				// the cover block carries the link to the detail page
				if ("pic".equals(entry.getAttribute("class"))) {
					price.setWeilanUrl(this.getBookUrl(entry));
				}
				if (!"DIV".equals(entry.getTagName())) {
					continue;
				}

				// shop price and discount are looked up in every DIV child
				String priceText = this.getBookPrice(entry);
				if (priceText != null && !"".equals(priceText)) {
					price.setWeilanPrice(Double.valueOf(priceText));
				}
				String discountText = this.getBookDiscount(entry);
				if (discountText != null && !"".equals(discountText)) {
					price.setWeilanDiscount(Float.valueOf(discountText));
				}
			}
			// later result entries are ignored
			break;
		}
		return price;
	}
	/**
	 * Builds the address of the next result page.
	 * <p>
	 * The pager is looked up as {@code div#__search_topfanye > TABLE > TR >
	 * TD.pre_page}; its text is expected to have the form {@code current/total}.
	 * When {@code current} is smaller than {@code total}, the address stored by the
	 * last {@code nekohtmlParser} call is extended (or its existing {@code &start=}
	 * part replaced) with {@code &start=current*20+1}.
	 *
	 * @param doc DOM of the current result page
	 * @return the address of the next page, or the string {@code "no"} when there
	 *         is no further page or the pager cannot be read
	 */
	public String getNextPageUrl(Document doc) {
		String nextPageUrl="no";
		NodeList divNodeList=doc.getElementsByTagName("div");
		for(int i=0;i<divNodeList.getLength();i++){
			Node divNode = divNodeList.item(i);
			if(Node.ELEMENT_NODE != divNode.getNodeType()
					|| !"__search_topfanye".equals(((Element)divNode).getAttribute("id"))){
				continue;
			}
			NodeList tableNodeList = divNode.getChildNodes();
			for(int j = 0;j<tableNodeList.getLength();j++){
				Node tableNode = tableNodeList.item(j);
				if(Node.ELEMENT_NODE != tableNode.getNodeType()
						|| !"TABLE".equals(tableNode.getNodeName())){
					continue;
				}
				NodeList trNodeList = tableNode.getChildNodes();
				for(int k = 0;k<trNodeList.getLength();k++){
					Node trNode = trNodeList.item(k);
					if(Node.ELEMENT_NODE != trNode.getNodeType()
							|| !"TR".equals(trNode.getNodeName())){
						continue;
					}
					NodeList tdNodeList = trNode.getChildNodes();
					for(int m = 0;m<tdNodeList.getLength();m++){
						Node tdNode = tdNodeList.item(m);
						if(Node.ELEMENT_NODE != tdNode.getNodeType()
								|| !"TD".equals(tdNode.getNodeName())){
							continue;
						}
						Element tdElement = (Element)tdNode;
						if(!"pre_page".equals(tdElement.getAttribute("class"))){
							continue;
						}
						if(tdElement.hasChildNodes()&&null != tdElement.getFirstChild().getNodeValue()){
							String fanye = tdElement.getFirstChild().getNodeValue().trim();
							int slash = fanye.indexOf("/");
							// without a separator there is nothing to compare
							if(slash > 0){
								try{
									int currentPage = Integer.parseInt(fanye.substring(0, slash).trim());
									int totalPages = Integer.parseInt(fanye.substring(slash+1).trim());
									if(currentPage < totalPages){
										// cut at the same token that is searched for, so a URL that merely
										// contains "start=" somewhere else no longer yields index -1
										int startPos = this.url.indexOf("&start=");
										String baseUrl = (startPos != -1)
												? this.url.substring(0, startPos) : this.url;
										// offset of the first record of the next page
										// (presumably 20 records per page)
										nextPageUrl = baseUrl + "&start=" + (currentPage*20+1);
									}
								}catch(NumberFormatException ignored){
									// pager text that is not "number/number" is treated like a
									// missing pager: the result stays as it was
								}
							}
						}
						// only the first pager cell of a row is evaluated
						break;
					}
				}
				// only the first table below the pager div is evaluated
				break;
			}
		}
		return nextPageUrl;
	}
	/**
	 * Reads the total number of matching records from a result page.
	 * <p>
	 * The number is taken from the first {@code div#summarycontent}, inside its
	 * child element with id {@code ctl00_ContentPlaceHolder1_TotalCount}: the text
	 * of the first element nested directly in a {@code B} child. The {@code list}
	 * field is reset to an empty list as a side effect.
	 *
	 * @param doc DOM of the result page
	 * @return the record count, or {@code 0} when it is absent or not a number
	 */
	public long getRecordNum(Document doc) {

		long num = 0;
		// NOTE(review): this method never fills the list it resets here; kept because
		// code outside this method may rely on the reset - confirm before removing
		list = new ArrayList<Book>();
		/* all div elements of the page */
		NodeList divNodeList = doc.getElementsByTagName("div");
		for(int i=0;i<divNodeList.getLength();i++){
			Node divNode = divNodeList.item(i);
			if(!"summarycontent".equals(((Element)divNode).getAttribute("id"))){
				continue;
			}
			NodeList spanNodeList = divNode.getChildNodes();
			for(int j=0;j<spanNodeList.getLength();j++){
				if(spanNodeList.item(j).getNodeType() != Node.ELEMENT_NODE){
					continue;
				}
				Element spanElement = (Element)spanNodeList.item(j);
				if(!"ctl00_ContentPlaceHolder1_TotalCount".equals(spanElement.getAttribute("id"))){
					continue;
				}
				NodeList bList = spanElement.getChildNodes();
				for(int k = 0;k<bList.getLength();k++){
					Node bNode = bList.item(k);
					if(!"B".equals(bNode.getNodeName()) || !bNode.hasChildNodes()){
						continue;
					}
					// the number sits one level deeper, inside the first child element of B
					Node numNode = bNode.getFirstChild();
					if(numNode.getNodeType() != Node.ELEMENT_NODE){
						continue;
					}
					Element numElement = (Element)numNode;
					if(numElement.hasChildNodes()&&null != numElement.getFirstChild().getNodeValue()){
						String numStr = numElement.getFirstChild().getNodeValue().trim();
						try{
							// parsed as long to match the return type
							num = Long.parseLong(numStr);
						}catch(NumberFormatException ignored){
							// text that is not a plain number is reported as "no count"
						}
						break;
					}
				}
				// only the first counter element is evaluated
				break;
			}
			// only the first summary div is evaluated
			break;
		}
		return num;
	}
	/**
	 * Extracts the author of a book from one element of a result entry.
	 * <p>
	 * The children are scanned for a node whose value contains the label
	 * {@code 作者}; when the node right after it is an {@code A} element, the text
	 * of that element is taken as the author. Separators (ASCII and full-width
	 * comma, semicolon, {@code 、}) are then replaced by blanks and the suffixes
	 * {@code 等} and {@code 著} are removed.
	 *
	 * @param bookElement element that holds the author information
	 * @return the cleaned author text, or an empty string when none was found
	 */
	public String getBookAuthor(Element bookElement) {
		String bookAuthor="";
		NodeList aList = bookElement.getChildNodes();
		for(int i = 0;i<aList.getLength();i++){
			Node aNode = aList.item(i);
			if(null == aNode.getNodeValue() || "".equals(aNode.getNodeValue())){
				continue;
			}
			if(aNode.getNodeValue().indexOf("作者") == -1){
				continue;
			}
			// the author link is expected directly after the label; the index is
			// advanced on purpose so the inspected node is not examined again
			i ++;
			if(i<aList.getLength()){
				Node authorNode = aList.item(i);
				if("A".equals(authorNode.getNodeName())){
					Element authorElement = (Element)authorNode;
					if(authorElement.hasChildNodes()&&null != authorElement.getFirstChild().getNodeValue())
					    bookAuthor = authorElement.getFirstChild().getNodeValue().trim();
					break;
				}
			}
		}
		bookAuthor = bookAuthor.replace(",", " ");
		// full-width comma (U+FF0C); the original repeated the ASCII-comma replacement
		// here, which had no effect - presumably this separator was intended
		bookAuthor = bookAuthor.replace("\uFF0C", " ");
		bookAuthor = bookAuthor.replace(";", " ");
		bookAuthor = bookAuthor.replace("、", " ");
		bookAuthor = bookAuthor.replace("等", "");
		bookAuthor = bookAuthor.replace("著", "");
		return bookAuthor;
	}
	/**Function:getBookDiscount
	 * Description:获得图书折扣
	 * Call:no
	 * Called by:mainService
	 * @param bookElement as Element	 
	 * @throws no

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -