📄 zhuoyueparser.java
字号:
NodeList childList=bookElement.getChildNodes();
for(int i=0;i<childList.getLength();i++){
Node childNode=childList.item(i);
if(childNode.getNodeType()==Node.ELEMENT_NODE){
Element childElement=(Element)childNode;
if("OurPrice".equals(childElement.getAttribute("class"))){
if(childElement.hasChildNodes()
&&null != childElement.getFirstChild().getNodeValue())
bookPrice = childElement.getFirstChild().getNodeValue().trim();
break;
}
}
}
//System.out.println("<<" + bookPrice + ">>");
return bookPrice;
}
/**
* Function: getBookPublisher
* Description: Gets the book's publisher
* Calls: no
* Called By: mainService
* @param bookElement as Element
* @return String
* @throws no
*/
public String getBookPublisher(Element bookElement) {
    /*
     * Looks through the direct children of bookElement for the first element
     * whose class attribute is "ProductCom" and takes the value of that
     * element's first child node. The result has spaces removed and is capped
     * at 64 characters; "" is returned when nothing usable is found.
     */
    String bookPublisher = "";
    NodeList childList = bookElement.getChildNodes();
    for (int i = 0; i < childList.getLength(); i++) {
        Node childNode = childList.item(i);
        if (childNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element childElement = (Element) childNode;
        if (!"ProductCom".equals(childElement.getAttribute("class"))) {
            continue;
        }
        Node firstChild = childElement.getFirstChild();
        /*
         * getNodeValue() is null when the first child is itself an element
         * (e.g. a nested link); keep the "" default instead of letting a null
         * reach the string handling below.
         */
        if (firstChild != null && firstChild.getNodeValue() != null) {
            bookPublisher = firstChild.getNodeValue();
        }
        /* Only the first matching element is considered. */
        break;
    }
    bookPublisher = bookPublisher.replace(" ", "");
    /* Cap at 64 characters -- presumably a storage column limit; confirm with the caller. */
    if (bookPublisher.length() > 64) {
        bookPublisher = bookPublisher.substring(0, 64);
    }
    return bookPublisher;
}
/**
 * Gets the book's publication time.
 *
 * Reads the value of the first child node of the first direct child element
 * whose class attribute is "PubDate", strips parentheses, the text "出版" and
 * spaces, and appends "-00" when more than one character remains.
 *
 * @param bookElement element whose direct children are scanned
 * @return the cleaned publication time, or "" when none is found
 */
public String getBookPublishTime(Element bookElement) {
    String bookPublishTime = "";
    NodeList childList = bookElement.getChildNodes();
    for (int i = 0; i < childList.getLength(); i++) {
        Node childNode = childList.item(i);
        if (childNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element childElement = (Element) childNode;
        if (!"PubDate".equals(childElement.getAttribute("class"))) {
            continue;
        }
        Node firstChild = childElement.getFirstChild();
        /*
         * A non-text first child has a null node value; keep the "" default
         * so the clean-up below cannot hit a NullPointerException.
         */
        if (firstChild != null && firstChild.getNodeValue() != null) {
            bookPublishTime = firstChild.getNodeValue();
        }
        /* Only the first matching element is considered. */
        break;
    }
    /* replace() is a no-op when the target is absent, so no indexOf pre-checks are needed. */
    bookPublishTime = bookPublishTime
            .replace("(", "")
            .replace(")", "")
            .replace("出版", "")
            .replace(" ", "");
    /* "-00" looks like a placeholder day-of-month for year-month values -- TODO confirm with the consumer. */
    if (bookPublishTime.length() > 1) {
        bookPublishTime = bookPublishTime + "-00";
    }
    return bookPublishTime;
}
/**
* Function: getBookFixPrice
* Description: Gets the book's market (list) price
* Calls: no
* Called By: mainService
* @param bookElement as Element
* @return String
* @throws no
*/
public String getBookFixPrice(Element bookElement) {
    /*
     * Finds the first direct child named "STRIKE", takes the value of its
     * first child node and drops the leading character. Returns "" when no
     * such child exists or it carries no text.
     */
    String bookFixPrice = "";
    NodeList childList = bookElement.getChildNodes();
    for (int i = 0; i < childList.getLength(); i++) {
        Node childNode = childList.item(i);
        if (!"STRIKE".equals(childNode.getNodeName())) {
            continue;
        }
        if (childNode.getNodeType() == Node.ELEMENT_NODE) {
            Node firstChild = childNode.getFirstChild();
            /*
             * The first child may be another element, whose node value is
             * null; keep the "" default rather than failing on length() below.
             */
            if (firstChild != null && firstChild.getNodeValue() != null) {
                bookFixPrice = firstChild.getNodeValue();
            }
        }
        /* Stop at the first STRIKE node whether or not it yielded a value. */
        break;
    }
    /* Drop the first character -- presumably the currency symbol; verify against the page markup. */
    if (bookFixPrice.length() >= 1) {
        bookFixPrice = bookFixPrice.substring(1);
    }
    return bookFixPrice;
}
/**
* Function: getBookUrl
* Description: Gets the URL of the book's detail page
* Calls: no
* Called By: mainService
* @param bookElement as Element
* @return String
* @throws no
**/
public String getBookUrl(Element bookElement) {
    /*
     * Walk the direct children and return the trimmed href of the first
     * element whose class attribute is "medium"; "" when there is none.
     */
    NodeList children = bookElement.getChildNodes();
    for (int idx = 0; idx < children.getLength(); idx++) {
        Node candidate = children.item(idx);
        if (candidate.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element link = (Element) candidate;
        if (!"medium".equals(link.getAttribute("class"))) {
            continue;
        }
        /* First match decides the result, with or without an href. */
        String href = link.getAttribute("href");
        return (href == null) ? "" : href.trim();
    }
    return "";
}
/**
 * Placeholder for discount extraction; the argument is not inspected.
 *
 * @param bookElement unused
 * @return always null
 */
public String getBookDiscount(Element bookElement)
{
    return null;
}
/**
 * Placeholder for ISBN extraction; the argument is not inspected.
 *
 * @param bookElement unused
 * @return always null
 */
public String getBookISBN(Element bookElement)
{
    return null;
}
/**
* Function: getNextPageUrl
* Description: Gets the hyperlink address of the next page
* Calls: no
* Called By: no
* @param doc as Document
* @return String
* @throws no
*/
public String getNextPageUrl(Document doc) {
    /* Starts as "no", which means there is no next page. */
    String nextpageUrl = "no";
    NodeList divList = doc.getElementsByTagName("div");
    for (int i = 0; i < divList.getLength(); i++) {
        Node temNode = divList.item(i);
        if (temNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        Element temElement = (Element) temNode;
        /*
         * Keep scanning until <div id="result-page"> is found. Previously the
         * loop stopped after the first <div> of the document regardless of
         * its id, so the pager was only found when it happened to be first.
         */
        if (!"result-page".equals(temElement.getAttribute("id"))) {
            continue;
        }
        NodeList aList = temElement.getChildNodes();
        for (int j = 0; j < aList.getLength(); j++) {
            Node aNode = aList.item(j);
            if (aNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element aElement = (Element) aNode;
            /* A child like <a href="javascript:(2)" class="page-out1">下一页</a> means another page exists. */
            if (!"page-out1".equals(aElement.getAttribute("class"))) {
                continue;
            }
            Node label = aElement.getFirstChild();
            /* An empty <a> has no first child; skip it instead of dereferencing null. */
            if (label == null || !"下一页".equals(label.getNodeValue())) {
                continue;
            }
            String javas = aElement.getAttribute("href");
            /*
             * Take everything between the last "(" and ")" so multi-digit
             * page numbers survive; fall back to the single character before
             * the final one when the href has no parentheses.
             */
            String pageNumber;
            int open = javas.lastIndexOf('(');
            int close = javas.lastIndexOf(')');
            if (open != -1 && close > open) {
                pageNumber = javas.substring(open + 1, close);
            } else if (javas.length() >= 2) {
                pageNumber = javas.substring(javas.length() - 2, javas.length() - 1);
            } else {
                /* Nothing usable in this href; try the next child. */
                continue;
            }
            /* Replace an existing "&page=" parameter on the current url, or append one. */
            int pageParam = url.indexOf("&page=");
            String baseUrl = (pageParam != -1) ? url.substring(0, pageParam) : url;
            nextpageUrl = baseUrl + "&page=" + pageNumber;
            break;
        }
        /* The pager div has been handled; no need to look at further divs. */
        break;
    }
    return nextpageUrl;
}
/**
 * Reads the total number of search results from the page.
 *
 * Looks for the first <div id="product_td-bg">, then for its first direct
 * child element with class "result" that has child nodes, and parses the
 * number found between "共" and the last "条" in that child's first node
 * value. As a side effect the list field is replaced with a new empty
 * ArrayList on every call.
 *
 * @param doc parsed result page
 * @return the record count, or 0 when it cannot be located
 * @throws NumberFormatException if the text between the markers is not a number
 */
public long getRecordNum(Document doc) {
    long recordNum = 0;
    list = new ArrayList<Book>();
    /* Fetch every <div> node. */
    NodeList servers = doc.getElementsByTagName("div");
    for (int i = 0; i < servers.getLength(); i++) {
        Element serveritem = (Element) servers.item(i);
        /*
         * Keep scanning until <div id="product_td-bg"> (the node holding the
         * records) is found. Previously the loop ended unconditionally after
         * the first <div>, so the count was only read when that div came first.
         */
        if (!"product_td-bg".equals(serveritem.getAttribute("id"))) {
            continue;
        }
        NodeList childList1 = serveritem.getChildNodes();
        for (int j = 0; j < childList1.getLength(); j++) {
            Node childNode = childList1.item(j);
            if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element childElement = (Element) childNode;
            if (!"result".equals(childElement.getAttribute("class"))
                    || !childElement.hasChildNodes()) {
                continue;
            }
            String recordStr = childElement.getFirstChild().getNodeValue();
            /* The node value is null when the first child is an element. */
            if (recordStr != null) {
                int from = recordStr.indexOf("共");
                int to = recordStr.lastIndexOf("条");
                /* Require the markers in order so substring() cannot be handed a reversed range. */
                if (from != -1 && to > from) {
                    recordNum = Long.parseLong(recordStr.substring(from + 1, to).trim());
                }
            }
            break;
        }
        /* The record container has been handled; stop scanning divs. */
        break;
    }
    return recordNum;
}
/**
 * Second-stage parser for a book's detail page: downloads the page and pulls
 * the ISBN and the content summary out of the resulting DOM.
 */
public class ZhuoyueparserSec {
    /**
     * Downloads the page at the given address and parses it into a DOM.
     *
     * @param url address of the page to fetch
     * @return the parsed document
     * @throws Exception on any connection, I/O or parsing failure
     */
    public Document nekohtmlParserSec(String url) throws Exception {
        /* Create the HTML parser and set the default page encoding. */
        DOMParser parser = new DOMParser();
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "UTF-8");
        /* Open the connection to the source site. */
        URL u = new URL(url);
        URLConnection urlconn = u.openConnection();
        urlconn.setReadTimeout(30000);
        /* Read the response bytes as UTF-8 characters. */
        BufferedReader in = new BufferedReader(new InputStreamReader(
                urlconn.getInputStream(), "UTF-8"));
        try {
            parser.parse(new InputSource(in));
        } finally {
            /* Always release the stream, even when parsing fails. */
            in.close();
        }
        return parser.getDocument();
    }

    /**
     * Collects the ISBN text from the page.
     *
     * Searches every <div id="Preferences"> for direct children with class
     * "Left"; inside those, for each child element with class "dark" that
     * contains a node whose value is "ISBN:", the value of the node following
     * the "dark" element is appended to the result.
     *
     * @param doc parsed detail page
     * @return the concatenated ISBN text, or "" when none is found
     */
    public String getBookISBNSec(Document doc) {
        StringBuilder bookISBN = new StringBuilder();
        /* Fetch every <div> node. */
        NodeList list = doc.getElementsByTagName("div");
        for (int i = 0; i < list.getLength(); i++) {
            Element childElement = (Element) list.item(i);
            if (!"Preferences".equals(childElement.getAttribute("id"))) {
                continue;
            }
            /* Walk the children of <div id="Preferences">. */
            NodeList list1 = childElement.getChildNodes();
            for (int j = 0; j < list1.getLength(); j++) {
                Node childNode = list1.item(j);
                if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element childElement1 = (Element) childNode;
                if (!"Left".equals(childElement1.getAttribute("class"))) {
                    continue;
                }
                NodeList list2 = childElement1.getChildNodes();
                for (int r = 0; r < list2.getLength(); r++) {
                    Node childNode1 = list2.item(r);
                    if (childNode1.getNodeType() != Node.ELEMENT_NODE) {
                        continue;
                    }
                    Element childElement2 = (Element) childNode1;
                    if (!"dark".equals(childElement2.getAttribute("class"))) {
                        continue;
                    }
                    NodeList list3 = childElement2.getChildNodes();
                    for (int p = 0; p < list3.getLength(); p++) {
                        /*
                         * Constant-first equals: element children have a null
                         * node value and must not cause a NullPointerException.
                         */
                        if (!"ISBN:".equals(list3.item(p).getNodeValue())) {
                            continue;
                        }
                        if (r + 1 < list2.getLength()) {
                            String value = list2.item(r + 1).getNodeValue();
                            /* Skip a null value rather than appending the text "null". */
                            if (value != null) {
                                bookISBN.append(value);
                            }
                        }
                    }
                }
            }
        }
        return bookISBN.toString();
    }

    /**
     * Builds the book's content summary.
     *
     * Uses the first <div class="ContentText">: the text nodes directly under
     * its child elements with class "bbcontent" are concatenated; when that
     * yields nothing, the value of the div's own first child is used. The
     * characters ">", "<" and spaces are removed, the text is trimmed and
     * "...." is appended.
     *
     * @param doc parsed detail page
     * @return the cleaned summary followed by "...."
     */
    public String getBookContentSec(Document doc) {
        StringBuilder content = new StringBuilder();
        /* Fetch every <div> node. */
        NodeList list = doc.getElementsByTagName("div");
        for (int i = 0; i < list.getLength(); i++) {
            Element childElement = (Element) list.item(i);
            if (!"ContentText".equals(childElement.getAttribute("class"))) {
                continue;
            }
            /* Walk the children of <div class="ContentText">. */
            NodeList divList = childElement.getChildNodes();
            for (int j = 0; j < divList.getLength(); j++) {
                Node childNode = divList.item(j);
                if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element childElement2 = (Element) childNode;
                if (!"bbcontent".equals(childElement2.getAttribute("class"))) {
                    continue;
                }
                /*
                 * Append each direct text node exactly once. The first child
                 * used to be appended separately beforehand, which duplicated
                 * leading text, added "null" for an element first child, and
                 * failed on an empty element.
                 */
                NodeList list2 = childElement2.getChildNodes();
                for (int r = 0; r < list2.getLength(); r++) {
                    Node childNode1 = list2.item(r);
                    if (childNode1.getNodeType() == Node.TEXT_NODE) {
                        content.append(childNode1.getNodeValue());
                    }
                }
            }
            /* Fall back to the div's own first child when no bbcontent text was found. */
            if (content.length() == 0) {
                Node first = childElement.getFirstChild();
                if (first != null && first.getNodeValue() != null) {
                    content.append(first.getNodeValue());
                }
            }
            /* Only the first ContentText div is used. */
            break;
        }
        String bookContent = content.toString()
                .replace(">", "")
                .replace("<", "")
                .replace(" ", "")
                .trim();
        return bookContent + "....";
    }
}
/**
 * Manual smoke test: fetches one search-result page, prints the record count,
 * the next-page URL and every parsed book, then the elapsed milliseconds.
 *
 * @param args unused
 * @throws Exception on any failure while fetching or parsing
 */
public static void main(String args[]) throws Exception {
    Zhuoyueparser tem = new Zhuoyueparser();
    long beginTime = System.currentTimeMillis();

    /*
     * Alternative queries kept for manual testing:
     *   tem.nekohtmlParser("http://www.amazon.cn/mn/advancedSearchApp?type=book&product_name=&author=&publisher=&isbn=9787121000522");
     *   tem.nekohtmlParser("http://www.amazon.cn/mn/searchApp?keywords=" + URLEncoder.encode("编程思想", "utf-8") + "&searchKind=keyword");
     */
    Document doc = tem.nekohtmlParser("http://www.amazon.cn/mn/searchApp?ix=sunray&pageletid=headsearch&searchType=&keywords=java%E7%BC%96%E7%A8%8B%E6%80%9D%E6%83%B3&searchKind=keyword&bestSaleNum=3");

    System.out.println("记录总数:" + tem.getRecordNum(doc));
    System.out.println("下一页:" + tem.getNextPageUrl(doc));

    ArrayList<Book> list = tem.mainService(doc, false);
    for (Book temp : list) {
        System.out.println(temp.getBookName()
                + ">>" + temp.getBookAuthor()
                + ">>" + temp.getBookFixPrice()
                + ">>" + temp.getBookImage()
                + ">>" + temp.getBookISBN()
                + ">>" + temp.getBookProspectus()
                + ">>" + temp.getBookPublisher()
                + ">>" + temp.getBookPublishTime()
                + ">>" + temp.getPrice().getZhuoyueDiscount()
                + ">>" + temp.getPrice().getZhuoyueUrl()
                + ">>" + temp.getPrice().getZhuoyuePrice());
    }

    System.out.println(System.currentTimeMillis() - beginTime);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -