📄 china_pubparser.java
字号:
package com.booksearch.service.htmlparser;
/************************************************************
FileName: China_pubparser.java
Author: lichao
Date:11/14/08
Description: 根据检索关键字到www.china-pub.com抽取匹配内容
Class List: China_pubparser
***********************************************************/
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.StringTokenizer;
import org.apache.log4j.Logger;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import com.booksearch.orm.Book;
import com.booksearch.orm.Price;
/**
* Class: China_pubparser
* Description: Extracts the content matching a search keyword from www.china-pub.com.
* extends: none
* implements: HtmlParser<String>
* @author li chao
* @since 11/14/08
*/
public class China_pubparser implements HtmlParser<String> {
//private String url = "http://www.china-pub.com/s/?key1=java" ;
/*存放本网站某一页的记录*/
private ArrayList<Book> list;
// private static final Logger logger;
//
// static
// {
// logger = Logger.getLogger(com.booksearch.service.htmlparser.China_pubparser.class);
// }
/**
 * Function: nekohtmlParser
 * Description: Fetches the page at the given URL and parses it into a DOM
 *              document with the NekoHTML parser.
 * Calls: no
 * Called By: no
 * @param url address of the page to download and parse
 * @return Document the DOM tree produced by the parser
 * @throws Exception if the URL is malformed, the connection or read fails,
 *                   or the parser rejects the input
 */
public Document nekohtmlParser(String url)throws Exception{
    DOMParser parser = new DOMParser();
    /* Parser-level default encoding. The response itself is decoded by the
       gb2312 reader below before the parser sees it. */
    parser.setProperty(
            "http://cyberneko.org/html/properties/default-encoding",
            "UTF-8");
    URL u = new URL(url);
    /* Open the connection to the source site; bound both the connect and the
       read phase so an unresponsive server cannot block the caller forever. */
    URLConnection urlConnection = u.openConnection();
    urlConnection.setConnectTimeout(30000);
    urlConnection.setReadTimeout(30000);
    /* Wrap the response byte stream in a character stream decoded as gb2312. */
    BufferedReader inputStream = new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"gb2312"));
    try {
        parser.parse(new InputSource(inputStream));
    } finally {
        /* Always release the underlying connection stream, even when parsing fails. */
        inputStream.close();
    }
    return parser.getDocument();
}
/**
 * Function: mainService
 * Description: Walks the script elements of a parsed result page. For each
 *              script whose first child text contains "dt", it takes the text
 *              between the first "[" and the first "]" after that marker,
 *              splits it into records on "}", and builds a Book (with its
 *              Price) from the comma-separated fields of each record.
 *              At most 20 records are read per script, and the final
 *              "}"-delimited token is never treated as a record.
 * Calls: getBookName(),getBookFixPrice(),getBookDiscount(),getBookISBN(),
 *        getBookAuthor(),getBookPublishTime(),getBookPublisher(),
 *        getBookImage(),getBookUrl()
 * Called By: no
 * @param doc parsed page to extract the records from
 * @param flag not used by this method
 * @return ArrayList<Book> the extracted books (also stored in the list field);
 *         empty when no script carries a data array
 * @throws NumberFormatException if a non-blank list price or discount value
 *         cannot be parsed as a number
 */
public ArrayList<Book> mainService(Document doc,boolean flag) {
    list = new ArrayList<Book>();
    /* Renders the computed sale price with exactly two decimal places. */
    DecimalFormat df = new DecimalFormat("####.00");
    /* Select the <script></script> nodes. */
    NodeList servers = doc.getElementsByTagName("script");
    for (int i = 0; i < servers.getLength(); i++) {
        NodeList childNode = servers.item(i).getChildNodes();
        if (childNode.getLength() == 0) {
            continue;
        }
        String tem = childNode.item(0).getNodeValue();
        if (tem == null || tem.indexOf("dt") == -1) {
            continue;
        }
        /* The script holds more than one [] pair; jumping to "dt" skips the earlier ones. */
        tem = tem.substring(tem.indexOf("dt"));
        int open = tem.indexOf("[");
        int close = tem.indexOf("]");
        /* Skip scripts with no bracket pair, or with "]" ahead of "[". */
        if (open == -1 || close < open) {
            continue;
        }
        /* tem starts with "dt", so open is at least 2 and open - 1 is a valid index. */
        String bookStr = tem.substring(open - 1, close);
        /* Split on "}" so that each token is one record. */
        StringTokenizer st = new StringTokenizer(bookStr, "}");
        /* The last token is not a record (original note: tokenizing leaves one
           empty entry), and no more than 20 records are taken from a script.
           Bounding the loop here keeps it from ever touching an unread token. */
        int recordCount = Math.min(st.countTokens() - 1, 20);
        for (int k = 0; k < recordCount; k++) {
            Book book = new Book();
            Price price = new Price();
            /* Split the record on "," into its individual fields.
               NOTE(review): a field value that itself contains "," is split too. */
            StringTokenizer temp = new StringTokenizer(st.nextToken(), ",");
            while (temp.hasMoreTokens()) {
                String temStr = temp.nextToken();
                if (temStr.indexOf("sm:") != -1) {
                    /* Book title. */
                    book.setBookName(getBookName(temStr));
                } else if (temStr.indexOf("dj:") != -1) {
                    /* List (market) price. */
                    String bookFixPrice = getBookFixPrice(temStr);
                    if (null != bookFixPrice && bookFixPrice.trim().length() > 0) {
                        book.setBookFixPrice(Double.valueOf(bookFixPrice.trim()));
                    }
                } else if (temStr.indexOf("zk:") != -1) {
                    /* Discount, and the discounted price derived from it. The
                       whole computation is guarded: a missing or blank discount
                       leaves both price fields untouched instead of failing. */
                    String bookDiscount = this.getBookDiscount(temStr);
                    if (null != bookDiscount && bookDiscount.trim().length() > 0) {
                        Float discount = Float.valueOf(bookDiscount.trim());
                        price.setChina_pubDiscount(discount);
                        /* Price an ordinary member pays on this site.
                           NOTE(review): relies on the "dj:" field having been
                           seen earlier in the record - confirm field order. */
                        double bookPrice = book.getBookFixPrice() * discount;
                        price.setChina_pubPrice(Double.valueOf(df.format(bookPrice)));
                    }
                } else if (temStr.indexOf("sh:") != -1) {
                    /* ISBN. */
                    book.setBookISBN(getBookISBN(temStr));
                } else if (temStr.indexOf("zz:") != -1) {
                    /* Author. */
                    book.setBookAuthor(getBookAuthor(temStr));
                } else if (temStr.indexOf("cq:") != -1) {
                    /* Publication date. */
                    String bookPublishTime = getBookPublishTime(temStr);
                    if (null != bookPublishTime && !"".equals(bookPublishTime)) {
                        book.setBookPublishTime(bookPublishTime);
                    }
                } else if (temStr.indexOf("cs:") != -1) {
                    /* Publisher. */
                    book.setBookPublisher(getBookPublisher(temStr));
                } else if (temStr.indexOf("pd:") != -1) {
                    /* Cover image address. */
                    book.setBookImage(getBookImage(temStr));
                } else if (temStr.indexOf("th:") != -1) {
                    /* Address of the book's detail page; kept on the Price. */
                    price.setChina_pubUrl(getBookUrl(temStr));
                }
            }
            book.setPrice(price);
            list.add(book);
        }
    }
    return list;
}
public Price getDetailInfo(Document doc) {
Price price = new Price();
String bookFixPrice = "";
/*过滤出<script></script>结点*/
NodeList servers = doc.getElementsByTagName("script");
for (int i = 0; i < servers.getLength(); i++) {
Node node = servers.item(i);
NodeList childNode = node.getChildNodes();
if (childNode.getLength() > 0
&& (childNode.item(0).getNodeValue().indexOf("dt") != -1)) {
String tem = childNode.item(0).getNodeValue();
/*因为结点中不止有一个[],所以要先把第一二个[]过滤掉*/
tem = tem.substring(tem.indexOf("dt"));
/*过滤出数据组中的元素*/
if(tem.indexOf("[") != -1&& tem.indexOf("]")!= -1){
String bookStr = tem.substring(tem.indexOf("[")-1, tem
.indexOf("]"));
if(bookStr.indexOf("{")!= -1&&bookStr.indexOf("}")!= -1){
String priceStr = bookStr.substring(bookStr.indexOf("{") + 1, bookStr.indexOf("}"));
/*按","进行分词*/
StringTokenizer temp =new StringTokenizer(priceStr,",");
while(temp.hasMoreElements()){
String temStr = temp.nextToken();
if(temStr.indexOf("dj:")!=-1){
bookFixPrice = getBookFixPrice(temStr);
/*取出图书折扣和打折后的价格*/
}else if(temStr.indexOf("zk:")!=-1){
String bookDiscount = this.getBookDiscount(temStr);
//System.out.println(bookDiscount);
price.setChina_pubDiscount(Float.valueOf(bookDiscount.trim()));
/*求出本网站普通会员的买书价格*/
double bookPrice = Double.valueOf(bookFixPrice)*Double.valueOf(bookDiscount.trim());
// DecimalFormat df=(DecimalFormat)DecimalFormat.getInstance();
// df.setMaximumFractionDigits(2);
DecimalFormat df = new DecimalFormat("####.00");
price.setChina_pubPrice(Double.valueOf(df.format(bookPrice)));
//price.setChina_pubPrice(bookPrice);
//System.out.println(bookPrice);
}else if(temStr.indexOf("th:")!=-1){
String bookUrl = getBookUrl(temStr);
price.setChina_pubUrl(bookUrl);
//System.out.println(bookUrl);
}
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -