📄 tsinghuaparser.java

📁 本系统实现了从五个网站上搜索的图书进行整合后
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package com.booksearch.service.htmlparser;

/************************************************************
 FileName: Tsinghuaparser.java
 Author: wang jiaqiang
 Date:11/09/08
 Description: 根据检索关键字到www.tub.tsinghua.edu.cn抽取匹配内容
 Class List: Tsinghuaparser
 ***********************************************************/

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import com.booksearch.orm.Book;
import com.booksearch.orm.Price;


/**
 * Class: Tsinghuaparser. Description: extracts the content matching a search
 * keyword from www.tub.tsinghua.edu.cn.
 * Extends: none. Implements: {@code HtmlParser<Element>}.
 * 
 * @author wang jiaqiang
 * @since 11/10/08
 */
public class Tsinghuaparser implements HtmlParser<Element> {

	/* 存放本网站某一页的记录 */
	private ArrayList<Book> list;

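	/**
	 * Function: nekohtmlParser 
	 * Description: fetches the page at the given URL, parses it with the
	 * NekoHTML parser and returns the result as a DOM object 
	 * Calls: no 
	 * Called By: no
	 * @param url address of the page to fetch and parse
	 * @return the parsed page as a DOM Document
	 * @throws Exception propagated from opening the connection, reading the
	 *         response or parsing it
	 */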
	public Document nekohtmlParser(String url) throws Exception{
		/* Create the HTML parser. */
		DOMParser parser = new DOMParser();
		/* Encoding the parser falls back to when the input declares none. */
		parser.setProperty(
				"http://cyberneko.org/html/properties/default-encoding",
				"gb2312");

		/* Open the connection to the source site; reads give up after 30 s. */
		URLConnection urlconn = new URL(url).openConnection();
		urlconn.setReadTimeout(30000);

		BufferedReader in = null;
		try {
			/* Wrap the response byte stream in a reader that decodes it as gb2312. */
			in = new BufferedReader(new InputStreamReader(
					urlconn.getInputStream(), "gb2312"));
			/* Parse the page; the parser builds the DOM tree while reading. */
			parser.parse(new InputSource(in));
		} finally {
			/*
			 * Always release the response stream, including when parsing
			 * fails; previously it was never closed.
			 */
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					/*
					 * A failure while closing must neither mask a parse error
					 * nor discard a document that was parsed successfully.
					 */
				}
			}
		}

		/* The DOM is complete once parse() has returned. */
		return parser.getDocument();
	}

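	/**
	 * Function: mainService 
	 * Description: walks the record table of a parsed search-result page and
	 * builds one Book (with its Price) for every row whose Book ends up with a
	 * non-null name 
	 * Calls: getBookAuthor(), getBookName(), getBookUrl(), getBookISBN(),
	 *        getBookPublishTime(), getBookPrice() 
	 * Called By: no
	 * @param doc the parsed search-result page
	 * @param flag not read by this method
	 * @return the books collected from the page; empty when no row qualifies
	 */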
	public ArrayList<Book> mainService(Document doc,boolean flag) {
		/*
		 * Collects one Book (plus its Price) per row of the record table.
		 * The flag argument is not read here.
		 */
		list = new ArrayList<Book>();

		/* Every <table> in the document is a candidate. */
		NodeList tables = doc.getElementsByTagName("table");
		for (int i = 0; i < tables.getLength(); i++) {

			Element table = (Element) tables.item(i);

			/* Only <table border="1" ... width="90%"> holds the records. */
			if (!"90%".equals(table.getAttribute("width"))
					|| !"1".equals(table.getAttribute("border"))) {
				continue;
			}

			NodeList rows = table.getChildNodes();

			/*
			 * The scan starts at child index 2, so the first two child nodes
			 * are never treated as records (presumably a header row and
			 * surrounding whitespace - TODO confirm against the page markup).
			 */
			for (int j = 2; j < rows.getLength(); j++) {

				Node rowNode = rows.item(j);
				if (rowNode.getNodeType() != Node.ELEMENT_NODE) {
					continue;
				}

				Book book = new Book();
				Price price = new Price();
				book.setBookPublisher("清华大学出版社");
				price.setTsinghuaDiscount((float) 1.00);

				NodeList cells = rowNode.getChildNodes();
				for (int k = 0; k < cells.getLength(); k++) {

					Node cellNode = cells.item(k);
					if (cellNode.getNodeType() != Node.ELEMENT_NODE) {
						continue;
					}

					Element cell = (Element) cellNode;
					String width = cell.getAttribute("width");
					String align = cell.getAttribute("align");

					if (k == 2) {
						/* Author: recognised by position, not by attributes. */
						book.setBookAuthor(this.getBookAuthor(cell));

					} else if ("450pt".equals(width)) {
						/* Title cell; it also supplies the book's URL. */
						book.setBookName(this.getBookName(cell));
						price.setTsinghuaUrl(this.getBookUrl(cell));

					} else if ("60pt".equals(width) && "left".equals(align)) {
						/* ISBN. */
						book.setBookISBN(this.getBookISBN(cell));

					} else if ("60pt".equals(width) && "middle".equals(align)) {
						/* Publication time; only stored when non-empty. */
						String bookPublishTime = this.getBookPublishTime(cell);
						if (null != bookPublishTime && !"".equals(bookPublishTime)) {
							book.setBookPublishTime(bookPublishTime);
						}

					} else if ("40pt".equals(width)) {
						/*
						 * Price. Both setters are inside the guard: the
						 * fixed-price setter used to run unconditionally and
						 * failed on a null or empty price text.
						 */
						String bookPrice = this.getBookPrice(cell);
						String priceText = (null == bookPrice) ? "" : bookPrice.trim();
						if (!"".equals(priceText)) {
							Double parsedPrice = Double.valueOf(priceText);
							price.setTsinghuaPrice(parsedPrice);
							book.setBookFixPrice(parsedPrice);
						}
					}
				}

				/* Keep the row only if a name was set on the book. */
				if (book.getBookName() != null) {
					book.setPrice(price);
					list.add(book);
				}
			}
		}
		return list;
	}
	/**
	 * Function: getDetailInfo 
	 * Description: reads the price and the book URL from the first record row
	 * of the first record table of a parsed page 
	 * Calls: getBookPrice(), getBookUrl() 
	 * Called By: no
	 * @param doc the parsed page
	 * @return a Price holding whatever was found; a fresh, unfilled Price when
	 *         the page has no record table or the table has no record row
	 */
	public Price getDetailInfo(Document doc) {

		Price price = new Price();
		/*
		 * NOTE(review): the shared result list is reset here although this
		 * method never fills it; kept as in the original - confirm whether
		 * any caller relies on it.
		 */
		list = new ArrayList<Book>();

		NodeList tables = doc.getElementsByTagName("table");
		for (int i = 0; i < tables.getLength(); i++) {

			Element table = (Element) tables.item(i);
			/* Only <table border="1" ... width="90%"> holds the records. */
			if (!"90%".equals(table.getAttribute("width"))
					|| !"1".equals(table.getAttribute("border"))) {
				continue;
			}

			NodeList rows = table.getChildNodes();
			/* As in mainService, the first two child nodes are skipped. */
			for (int j = 2; j < rows.getLength(); j++) {

				Node rowNode = rows.item(j);
				if (rowNode.getNodeType() != Node.ELEMENT_NODE) {
					continue;
				}

				price.setTsinghuaDiscount((float) 1.00);
				NodeList cells = rowNode.getChildNodes();
				for (int k = 0; k < cells.getLength(); k++) {

					Node cellNode = cells.item(k);
					if (cellNode.getNodeType() != Node.ELEMENT_NODE) {
						continue;
					}

					Element cell = (Element) cellNode;
					String width = cell.getAttribute("width");
					if ("40pt".equals(width)) {
						String bookPrice = this.getBookPrice(cell);
						if (null != bookPrice && !"".equals(bookPrice)) {
							/*
							 * Parsed as a double, exactly as mainService does;
							 * going through Float first lost precision.
							 */
							price.setTsinghuaPrice(Double.valueOf(bookPrice.trim()));
						}
					} else if ("450pt".equals(width)) {
						price.setTsinghuaUrl(this.getBookUrl(cell));
					}
				}
				/* Only the first record row is examined. */
				break;
			}
			/* Only the first record table is examined. */
			return price;
		}

		return price;
	}
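	/**
	 * Function: getBookName 
	 * Description: reads the book title from a table cell 
	 * Calls: no 
	 * Called By: mainService
	 * @param bookElement the cell whose first child element holds the title
	 * @return the node value of the first child of the cell's first child
	 *         element; "" when the cell is empty, its first child is not an
	 *         element, or that element has no children
	 */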
	public String getBookName(Element bookElement) {

		/* The title is expected inside the cell's leading element (the <a> node). */
		Node anchor = bookElement.getFirstChild();
		if (anchor == null || anchor.getNodeType() != Node.ELEMENT_NODE) {
			return "";
		}

		/* An anchor without content yields the empty name as well. */
		Node label = anchor.getFirstChild();
		if (label == null) {
			return "";
		}

		/* For a text node this is its text; other node kinds may give null. */
		return label.getNodeValue();
	}

	/**
	 * Function: getBookAuthor 
	 * Description: 获得图书作者 
	 * Calls: no 
	 * CalledBy:mainService
	 * @param bookElement as Element
	 * @return String
	 * @throws no
	 */
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -