china_pubparser.java

来自「本系统实现了从五个网站上搜索的图书进行整合后」· Java 代码 · 共 525 行 · 第 1/2 页
JAVA
525 行
package com.booksearch.service.htmlparser;
/************************************************************
FileName: China_pubparser.java
Author: lichao 
Date:11/14/08
Description: 根据检索关键字到www.china-pub.com抽取匹配内容
Class List: China_pubparser
***********************************************************/
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.StringTokenizer;

import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

import com.booksearch.orm.Book;
import com.booksearch.orm.Price;

/**
 * Class: China_pubparser
 * Description: Extracts the entries matching a search keyword from www.china-pub.com.
 * extends: none
 * implements: HtmlParser<String>
 * @author  li chao
 * @since   11/14/08
 */
public class China_pubparser implements HtmlParser<String> {
   
	//private String url = "http://www.china-pub.com/s/?key1=java" ;
	/*存放本网站某一页的记录*/
	private ArrayList<Book> list;
	
//	private static final Logger logger;
//	
//	static 
//    {
//        logger = Logger.getLogger(com.booksearch.service.htmlparser.China_pubparser.class);
//    }
	
	/**
	 * Fetches the page at the given URL and parses it into a DOM document
	 * using the NekoHTML parser.
	 *
	 * <p>The response bytes are decoded as gb2312 by the reader handed to the
	 * parser. The reader is always closed before this method returns, whether
	 * parsing succeeds or fails.
	 *
	 * @param url address of the page to download and parse
	 * @return the DOM document built by the parser
	 * @throws Exception if the URL is malformed, the page cannot be read,
	 *                   or the parser rejects a property or the input
	 */
	public Document nekohtmlParser(String url)throws Exception{
		// Create the HTML parser
		DOMParser parser = new DOMParser();
		// Set the parser's default-encoding property
		parser.setProperty(
					"http://cyberneko.org/html/properties/default-encoding",
					"UTF-8");

		URL u = new URL(url);
		/* Open the connection to the source site; only a read timeout (30 s) is configured */
		URLConnection urlConnection = u.openConnection();
		urlConnection.setReadTimeout(30000);

		/* Wrap the site's byte stream in a character stream decoded as gb2312 */
		BufferedReader inputStream = new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"gb2312"));
		try {
			parser.parse(new InputSource(inputStream));
			return parser.getDocument();
		} finally {
			// Release the underlying connection stream even when parsing fails
			inputStream.close();
		}
	}
	/**
	 * Extracts the book records embedded in the script nodes of a parsed page.
	 *
	 * <p>For every {@code script} element whose first child has text containing
	 * "dt", the text from "dt" onwards is searched for a "[" ... "]" span. That
	 * span is split on "}" into records and each record is split on "," into
	 * fields, which are recognised by the key they contain (sm, dj, zk, sh, zz,
	 * cq, cs, pd, th). At most 20 records are read per script node and the last
	 * "}"-delimited token is never treated as a record.
	 *
	 * <p>The returned list is also stored in the {@code list} field.
	 *
	 * Calls: getBookName(), getBookFixPrice(), getBookDiscount(), getBookISBN(),
	 *        getBookAuthor(), getBookPublishTime(), getBookPublisher(),
	 *        getBookImage(), getBookUrl()
	 *
	 * @param doc  DOM document of the page to scan
	 * @param flag not read by this method
	 * @return the books found, each with its Price attached; empty when none match
	 */
	public ArrayList<Book> mainService(Document doc,boolean flag) {
		list = new ArrayList<Book>();
		// Upper bound on records read per script node
		// (presumably the site's page size -- TODO confirm)
		final int maxRecords = 20;

		/* Select the <script></script> nodes */
		NodeList scripts = doc.getElementsByTagName("script");
		for (int i = 0; i < scripts.getLength(); i++) {

			NodeList children = scripts.item(i).getChildNodes();
			if (children.getLength() == 0) {
				continue;
			}
			// getNodeValue() is null for node types that carry no value
			String script = children.item(0).getNodeValue();
			if (script == null || script.indexOf("dt") == -1) {
				continue;
			}

			/* The node holds more than one [] pair, so skip everything before "dt" */
			String tem = script.substring(script.indexOf("dt"));
			int open = tem.indexOf("[");
			if (open == -1) {
				continue;
			}
			// Look for the closing bracket after the opening one so the
			// substring bounds can never be inverted
			int close = tem.indexOf("]", open);
			if (close == -1) {
				continue;
			}
			// Keep the character preceding "[" so the field helpers receive
			// the same text as before
			String bookStr = tem.substring(open - 1, close);

			/* Split on "}" into one token per record */
			StringTokenizer records = new StringTokenizer(bookStr, "}");
			// The final token is not a record and is skipped; never read more
			// than maxRecords tokens so no unread slot is ever parsed
			int recordCount = Math.min(records.countTokens() - 1, maxRecords);
			for (int k = 0; k < recordCount; k++) {
				list.add(parseChinaPubRecord(records.nextToken()));
			}
		}
		return list;
	}

	/**
	 * Builds one Book, with its Price attached, from a single record.
	 *
	 * <p>The record is split on ","; a field value that itself contains a
	 * comma is therefore split across tokens.
	 *
	 * @param record text of one record, as produced by splitting on "}"
	 * @return the populated book
	 */
	private Book parseChinaPubRecord(String record) {
		Book book = new Book();
		Price price = new Price();
		/* Split on "," into individual fields */
		StringTokenizer fields = new StringTokenizer(record, ",");

		while (fields.hasMoreTokens()) {
			String field = fields.nextToken();

			if (field.indexOf("sm:") != -1) {
				/* Book title */
				book.setBookName(getBookName(field));
			} else if (field.indexOf("dj:") != -1) {
				/* List price */
				String fixPrice = getBookFixPrice(field);
				if (fixPrice != null && fixPrice.trim().length() != 0) {
					book.setBookFixPrice(Double.valueOf(fixPrice.trim()));
				}
			} else if (field.indexOf("zk:") != -1) {
				/* Discount and the discounted price derived from it */
				String discount = getBookDiscount(field);
				// Both the discount and the derived price need a usable
				// discount value; skip them together when it is missing
				if (discount != null && discount.trim().length() != 0) {
					Float rate = Float.valueOf(discount.trim());
					price.setChina_pubDiscount(rate);
					/* Price for an ordinary member: list price times discount
					   (relies on the dj field having been seen already) */
					double bookPrice = book.getBookFixPrice() * rate;
					/* Format to two decimal places */
					// NOTE(review): DecimalFormat uses the default locale's
					// symbols while Double.valueOf expects '.' -- confirm the
					// deployment locale
					DecimalFormat df = new DecimalFormat("####.00");
					price.setChina_pubPrice(Double.valueOf(df.format(bookPrice)));
				}
			} else if (field.indexOf("sh:") != -1) {
				/* ISBN */
				book.setBookISBN(getBookISBN(field));
			} else if (field.indexOf("zz:") != -1) {
				/* Author */
				book.setBookAuthor(getBookAuthor(field));
			} else if (field.indexOf("cq:") != -1) {
				/* Publication date */
				String publishTime = getBookPublishTime(field);
				if (publishTime != null && !"".equals(publishTime)) {
					book.setBookPublishTime(publishTime);
				}
			} else if (field.indexOf("cs:") != -1) {
				/* Publisher */
				book.setBookPublisher(getBookPublisher(field));
			} else if (field.indexOf("pd:") != -1) {
				/* Cover image address */
				book.setBookImage(getBookImage(field));
			} else if (field.indexOf("th:") != -1) {
				/* Address of the book's detail page, stored on the price */
				price.setChina_pubUrl(getBookUrl(field));
			}
		}

		book.setPrice(price);
		return book;
	}
	public Price getDetailInfo(Document doc) {
		Price price = new Price();
		String bookFixPrice = "";
		/*过滤出<script></script>结点*/
		NodeList servers = doc.getElementsByTagName("script");
		for (int i = 0; i < servers.getLength(); i++) {
			
			Node node = servers.item(i);
			NodeList childNode = node.getChildNodes();
			if (childNode.getLength() > 0
					&& (childNode.item(0).getNodeValue().indexOf("dt") != -1)) {
				
				String tem = childNode.item(0).getNodeValue();
				/*因为结点中不止有一个[]，所以要先把第一二个[]过滤掉*/
				tem = tem.substring(tem.indexOf("dt"));
				/*过滤出数据组中的元素*/
				if(tem.indexOf("[") != -1&& tem.indexOf("]")!= -1){
					
					String bookStr = tem.substring(tem.indexOf("[")-1, tem
							.indexOf("]"));
					
                    if(bookStr.indexOf("{")!= -1&&bookStr.indexOf("}")!= -1){
                    	
                      String priceStr = bookStr.substring(bookStr.indexOf("{") + 1, bookStr.indexOf("}"));
					  /*按","进行分词*/
	                  StringTokenizer temp =new StringTokenizer(priceStr,",");
	              
	                  while(temp.hasMoreElements()){
	                	  
	                	 String temStr = temp.nextToken();
                         if(temStr.indexOf("dj:")!=-1){
	                		  bookFixPrice = getBookFixPrice(temStr);
	                	   /*取出图书折扣和打折后的价格*/
	                	  }else if(temStr.indexOf("zk:")!=-1){
	                		  String bookDiscount = this.getBookDiscount(temStr);
                              //System.out.println(bookDiscount);
	                		  price.setChina_pubDiscount(Float.valueOf(bookDiscount.trim()));
	                		  /*求出本网站普通会员的买书价格*/
	                		  double bookPrice = Double.valueOf(bookFixPrice)*Double.valueOf(bookDiscount.trim()); 
//	                		  DecimalFormat df=(DecimalFormat)DecimalFormat.getInstance();
//	              			  df.setMaximumFractionDigits(2);
	                		  DecimalFormat df = new DecimalFormat("####.00");     
	                		  price.setChina_pubPrice(Double.valueOf(df.format(bookPrice)));
	                		  //price.setChina_pubPrice(bookPrice);
	                		  //System.out.println(bookPrice);
	                	  }else if(temStr.indexOf("th:")!=-1){
	                		  String bookUrl = getBookUrl(temStr);

		                	   price.setChina_pubUrl(bookUrl);
	                		   //System.out.println(bookUrl);
		                 }
	                  }
					}
				}
china_pubparser.java - 源码说明

本页面展示了「本系统实现了从五个网站上搜索的图书进行整合后」中的 china_pubparser.java 源码文件，采用 Java 编程语言编写，共 525 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与搜索相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?