📄 parsecnkireference.java
字号:
package cn.ac.cintcm.spider.cnki;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Node;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
//import cn.ac.cintcm.spider.GetUrlContent;
//import cn.ac.cintcm.spider.GetFormContent;
// Parses the record (result list) pages.
/**
 * Parses result-list pages into {@link CnkiReference} records and follows the
 * "next page" links to collect the records of every page.
 *
 * <p>A record is read from five consecutive {@code TD} cells whose
 * {@code bgcolor} is {@code #ffffff} or {@code #f1f7fe}: (1) a cell holding an
 * {@code INPUT} whose value identifies the detail page, (2) cited title,
 * (3) authors, (4) source literature, (5) citation count.
 *
 * <p>Failures are logged through {@code LogEntity.logFile}; the public methods
 * return whatever was collected so far and never return {@code null}.
 */
public class ParseCnkiReference {

    /** Prefix of the detail-page URL; the file name and database name are appended. */
    private static final String DETAIL_URL_PREFIX = "http://lsg.cnki.net/grid20/detail.aspx?filename=";

    /** Prefix prepended to the {@code href} of the next-page link. */
    private static final String BRIEF_URL_PREFIX = "http://lsg.cnki.net/grid20/Brief.aspx";

    /** Text of the anchor that leads to the next result page. */
    private static final String NEXT_PAGE_LABEL = "下页";

    /**
     * Parses the records contained in a single result page.
     *
     * @param resource HTML source of one result page
     * @return the records found on the page, in document order; empty when the
     *         page contains no table cells or parsing fails before the first
     *         complete record
     */
    public List<CnkiReference> getOnePageRecordContent(String resource) {
        List<CnkiReference> references = new ArrayList<CnkiReference>();
        try {
            NodeList list = NodeFilters.getNodeList(resource, "TD"); // all table cells
            // Null must be tested before size(), otherwise the check itself throws.
            if (list == null || list.size() == 0) {
                LogEntity.logFile.log("没有得到解析的结果,可能原因:解析页面内容时出错;没有得到正确的页面\n");
                return references;
            }
            int pos = 1; // 1-based column index inside the current record
            CnkiReference reference = new CnkiReference();
            for (Node node : list.toNodeArray()) {
                if (!isRecordCell(node)) {
                    continue;
                }
                switch (pos) {
                case 1: { // link to the citing document
                    String title = fetchCitingTitle(node);
                    if (title == null) {
                        LogEntity.logFile.log("没有得到引证文献的内容");
                    } else {
                        reference.setTitle(title);
                    }
                    // Advance even without a title so the remaining columns stay aligned.
                    pos++;
                    break;
                }
                case 2: { // cited title
                    String citedTitle = node.toPlainTextString().trim();
                    reference.setReferenceTitle(citedTitle);
                    LogEntity.logFile.log(citedTitle);
                    pos++;
                    break;
                }
                case 3: // cited authors
                    reference.setAuthors(node.toPlainTextString().trim());
                    pos++;
                    break;
                case 4: // cited source literature
                    reference.setLiterature(node.toPlainTextString().trim());
                    pos++;
                    break;
                default: // 5: citation count, which completes the record
                    reference.setReferenceNum(node.toPlainTextString().trim());
                    references.add(reference);
                    pos = 1;
                    reference = new CnkiReference();
                    break;
                }
            }
        } catch (Exception e) {
            LogEntity.logFile.log(e, "happened in getOnePageRecordContent");
            e.printStackTrace();
        }
        return references;
    }

    /**
     * Parses the given first page and every following page reachable through
     * the "next page" link.
     *
     * <p>The current and last page numbers are read from the hidden inputs
     * named {@code curpage} and {@code lastpage} and compared numerically.
     * Pagination stops early (returning what was collected) when the next-page
     * link cannot be found or the next page cannot be downloaded, so that no
     * page is parsed twice.
     *
     * @param resource HTML source of the first result page
     * @return all records collected; empty when the page has no hidden inputs
     */
    public List<CnkiReference> getPagesContent(String resource) {
        List<CnkiReference> referencesAll = new ArrayList<CnkiReference>();
        String source = resource;

        // Hidden inputs carry the current and the last page number.
        NodeList list = NodeFilters.getNodeList(source, "INPUT", "type", "hidden");
        if (list == null || list.size() == 0) {
            LogEntity.logFile.log("没有得包含curpage和lastpage的信息");
            return referencesAll;
        }
        String curPageValue = "1";
        String lastPageValue = "1";
        for (Node inputNode : list.toNodeArray()) {
            InputTag inputTag = new InputTag();
            inputTag.setText(inputNode.toHtml());
            String attributeName = inputTag.getAttribute("name");
            // Constant-first equals: hidden inputs without a name are simply skipped.
            if ("curpage".equals(attributeName)) {
                curPageValue = inputTag.getAttribute("value");
            }
            if ("lastpage".equals(attributeName)) {
                lastPageValue = inputTag.getAttribute("value");
            }
        }
        // Compare as numbers: as strings "2" sorts after "10" and paging would stop early.
        int curPage = parsePageNumber(curPageValue, 1);
        int lastPage = parsePageNumber(lastPageValue, 1);

        while (curPage < lastPage) {
            referencesAll.addAll(getOnePageRecordContent(source));
            String nextPageUrl = findNextPageUrl(source, referencesAll.size());
            if (nextPageUrl == null) {
                return referencesAll;
            }
            String nextSource = fetchPage(nextPageUrl);
            if (nextSource == null) {
                // The current page is already collected; going on would duplicate it.
                return referencesAll;
            }
            source = nextSource;
            curPage++;
        }
        referencesAll.addAll(getOnePageRecordContent(source));
        return referencesAll;
    }

    /** Tells whether a table cell carries one of the two background colours used by record rows. */
    private static boolean isRecordCell(Node node) {
        TableColumn cell = new TableColumn();
        cell.setText(node.toHtml()); // re-parse the cell markup to read its attributes
        String bgcolor = cell.getAttribute("bgcolor");
        return "#ffffff".equals(bgcolor) || "#f1f7fe".equals(bgcolor);
    }

    /**
     * Builds the detail-page URL from the INPUT inside the first record cell and
     * asks {@link ParseCnkiDetail} for the title; returns {@code null} when the
     * URL cannot be built or no title is obtained.
     */
    private static String fetchCitingTitle(Node cell) {
        String detailUrl = buildDetailUrl(cell);
        if (detailUrl == null) {
            return null;
        }
        ParseCnkiDetail psd = new ParseCnkiDetail();
        return psd.getTitleDetail(detailUrl);
    }

    /**
     * Derives the detail-page URL from the {@code value} of the first INPUT
     * child, expected to look like {@code <db...>!<fileName>}; returns
     * {@code null} (after logging) when the cell does not have that shape.
     */
    private static String buildDetailUrl(Node cell) {
        NodeList children = cell.getChildren();
        if (children == null) {
            LogEntity.logFile.log("record cell has no child nodes, cannot build detail url");
            return null;
        }
        NodeList inputs = children.extractAllNodesThatMatch(NodeFilters.getSingleFilter("INPUT"));
        if (inputs == null || inputs.size() == 0) {
            LogEntity.logFile.log("record cell has no INPUT element, cannot build detail url");
            return null;
        }
        InputTag inputTag = new InputTag();
        inputTag.setText(inputs.elementAt(0).toHtml());
        String value = inputTag.getAttribute("value");
        if (value == null) {
            LogEntity.logFile.log("INPUT element has no value attribute, cannot build detail url");
            return null;
        }
        String[] parts = value.split("!");
        // The database name is made of parts[0][0..4) followed by fileName[4..8).
        if (parts.length < 2 || parts[0].length() < 4 || parts[1].length() < 8) {
            LogEntity.logFile.log("unexpected INPUT value, cannot build detail url: " + value);
            return null;
        }
        String fileName = parts[1];
        String dbName = parts[0].substring(0, 4) + fileName.substring(4, 8);
        return DETAIL_URL_PREFIX + fileName + "&dbname=" + dbName;
    }

    /** Parses a page number, logging and returning {@code fallback} when it is missing or not numeric. */
    private static int parsePageNumber(String value, int fallback) {
        if (value == null) {
            LogEntity.logFile.log("page number is missing, using " + fallback);
            return fallback;
        }
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            LogEntity.logFile.log(e, "happened in parsing page number: " + value);
            return fallback;
        }
    }

    /**
     * Finds the first anchor whose last child has the next-page label and
     * returns the absolute URL built from its {@code href}; returns
     * {@code null} (after logging) when no usable link exists.
     *
     * @param source         HTML source of the current page
     * @param collectedCount number of records collected so far, used only in
     *                       the log message (divided by 20, presumably the page
     *                       size; confirm against the site)
     */
    private static String findNextPageUrl(String source, int collectedCount) {
        NodeList linkList = NodeFilters.getNodeList(source, "A");
        if (linkList == null || linkList.size() == 0) {
            LogEntity.logFile.log("没有得到下一页链接");
            return null;
        }
        for (Node linkNode : linkList.toNodeArray()) {
            Node lastChild = linkNode.getLastChild(); // null for an empty anchor
            if (lastChild == null || !NEXT_PAGE_LABEL.equals(lastChild.getText())) {
                continue;
            }
            LinkTag linkTag = new LinkTag();
            linkTag.setText(linkNode.toHtml());
            String href = linkTag.getAttribute("href");
            if (href == null) {
                LogEntity.logFile.log("没有得到" + collectedCount / 20 + "的链接");
                return null;
            }
            return BRIEF_URL_PREFIX + href;
        }
        LogEntity.logFile.log("没有得到下一页链接");
        return null;
    }

    /**
     * Downloads a page with the cookies held in {@code GetFormContent.cookies}
     * and decodes the body as UTF-8; returns {@code null} (after logging) on a
     * non-200 status, an empty body or any exception.
     */
    private static String fetchPage(String url) {
        HttpClient client = new HttpClient();
        HttpState initialState = new HttpState(); // carries the session cookies
        initialState.addCookies(GetFormContent.cookies);
        client.setState(initialState);
        GetMethod method = null;
        try {
            // Constructed inside the try block: a malformed URL is reported like any other failure.
            method = new GetMethod(url);
            method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                    new DefaultHttpMethodRetryHandler(3, false));
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                LogEntity.logFile.log("Method failed: " + method.getStatusLine());
                System.err.println("Method failed: " + method.getStatusLine());
                return null;
            }
            byte[] responseBody = method.getResponseBody();
            if (responseBody == null) {
                LogEntity.logFile.log("Method failed: empty response body for " + url);
                return null;
            }
            return new String(responseBody, "UTF-8");
        } catch (Exception e) {
            LogEntity.logFile.log(e, "happened in get next page content");
            e.printStackTrace();
            return null;
        } finally {
            if (method != null) {
                method.releaseConnection();
            }
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -