⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parsecnkireference.java

📁 本程序是专门用于从网页上自动收集cmi,cnki上的被引文献的数据
💻 JAVA
字号:
package cn.ac.cintcm.spider.cnki;

import java.util.ArrayList;
import java.util.List;


import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.Node;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;


//import cn.ac.cintcm.spider.GetUrlContent;
//import cn.ac.cintcm.spider.GetFormContent;

//解析记录页面
/**
 * Parses CNKI citation result pages into {@link CnkiReference} records.
 *
 * <p>{@link #getOnePageRecordContent(String)} extracts the records of a single
 * result page; {@link #getPagesContent(String)} follows the "next page" links
 * starting from a given page and concatenates the records of every page it
 * manages to download.
 */
public class ParseCnkiReference {

	/** Prefix of the detail-page URL; the file name and database name are appended. */
	private static final String DETAIL_URL_PREFIX = "http://lsg.cnki.net/grid20/detail.aspx?filename=";

	/** Prefix that the "next page" href found in the page is appended to. */
	private static final String BRIEF_URL_PREFIX = "http://lsg.cnki.net/grid20/Brief.aspx";

	/** Text of the anchor child that identifies the "next page" link. */
	private static final String NEXT_PAGE_LABEL = "下页";

	/**
	 * Parses the records contained in one result page.
	 *
	 * <p>Only {@code TD} cells whose {@code bgcolor} is {@code #ffffff} or
	 * {@code #f1f7fe} are considered. Such cells are consumed in groups of five:
	 * link cell (used to look up the citing document's title), cited title,
	 * authors, source literature, and citation count. A record is added to the
	 * result once its fifth cell has been read.
	 *
	 * <p>Any exception raised while parsing is logged and ends the parsing of
	 * this page; the records completed up to that point are still returned.
	 *
	 * @param resource HTML of the result page
	 * @return the records parsed from the page; never {@code null}, possibly empty
	 */
	public List<CnkiReference> getOnePageRecordContent(String resource) {
		List<CnkiReference> references = new ArrayList<CnkiReference>();
		try {
			NodeList list = NodeFilters.getNodeList(resource, "TD"); // all table cells
			// Test for null BEFORE calling size(); the reverse order cannot guard anything.
			if (list == null || list.size() == 0) {
				LogEntity.logFile.log("没有得到解析的结果,可能原因:解析页面内容时出错;没有得到正确的页面\n");
				return references;
			}
			int pos = 1; // index (1..5) of the next column expected within the current record
			CnkiReference reference = new CnkiReference();
			for (Node node : list.toNodeArray()) {
				if (!isRecordCell(node)) {
					continue;
				}
				switch (pos) {
				case 1: { // link to the citing document
					String title = fetchCitingTitle(node);
					if (title == null) {
						// The record is still completed from the remaining columns, just without a title.
						LogEntity.logFile.log("没有得到引证文献的内容");
					} else {
						reference.setTitle(title);
					}
					pos++;
					break;
				}
				case 2: { // title of the cited document
					String citedTitle = node.toPlainTextString().trim();
					reference.setReferenceTitle(citedTitle);
					LogEntity.logFile.log(citedTitle);
					pos++;
					break;
				}
				case 3: // authors of the cited document
					reference.setAuthors(node.toPlainTextString().trim());
					pos++;
					break;
				case 4: // source literature of the cited document
					reference.setLiterature(node.toPlainTextString().trim());
					pos++;
					break;
				case 5: // citation count; last column, so the record is complete
					reference.setReferenceNum(node.toPlainTextString().trim());
					references.add(reference);
					pos = 1;
					reference = new CnkiReference();
					break;
				default:
					break;
				}
			}
		} catch (Exception e) {
			LogEntity.logFile.log(e, "happened in getOnePageRecordContent");
			e.printStackTrace();
		}
		return references;
	}

	/**
	 * Parses the given result page and every following page reachable through
	 * the "next page" link.
	 *
	 * <p>The current and last page numbers are read from the hidden inputs named
	 * {@code curpage} and {@code lastpage}. They are compared numerically; a
	 * string comparison would rank "10" before "2" and stop paging too early.
	 * If the next page cannot be located or downloaded, paging stops and the
	 * records collected so far are returned, so that a failed download never
	 * causes the same page to be parsed twice.
	 *
	 * @param resource HTML of the first result page to parse
	 * @return the records of all parsed pages; never {@code null}, possibly empty
	 */
	public List<CnkiReference> getPagesContent(String resource) {
		List<CnkiReference> referencesAll = new ArrayList<CnkiReference>();
		String source = resource;

		// The hidden inputs carry the current and last page numbers.
		NodeList list = NodeFilters.getNodeList(source, "INPUT", "type", "hidden");
		if (list == null || list.size() == 0) {
			LogEntity.logFile.log("没有得包含curpage和lastpage的信息");
			return referencesAll;
		}
		String curPageValue = "1";
		String lastPageValue = "1";
		for (Node inputNode : list.toNodeArray()) {
			InputTag inputTag = new InputTag();
			inputTag.setText(inputNode.toHtml());
			String attributeName = inputTag.getAttribute("name");
			// Constant-first equals: a hidden input without a name attribute must not throw.
			if ("curpage".equals(attributeName)) {
				curPageValue = inputTag.getAttribute("value");
			} else if ("lastpage".equals(attributeName)) {
				lastPageValue = inputTag.getAttribute("value");
			}
		}
		int curPage = parsePageNumber(curPageValue);
		int lastPage = parsePageNumber(lastPageValue);

		while (curPage < lastPage) {
			referencesAll.addAll(getOnePageRecordContent(source));

			NodeList linkList = NodeFilters.getNodeList(source, "A"); // candidates for the next-page link
			if (linkList == null || linkList.size() == 0) {
				LogEntity.logFile.log("没有得到下一页链接");
				return referencesAll;
			}
			String nextPageUrl = findNextPageUrl(linkList, referencesAll.size());
			if (nextPageUrl == null) {
				return referencesAll;
			}
			String nextSource = fetchPage(nextPageUrl);
			if (nextSource == null) {
				// The current page is already in the result; going on would re-add it.
				return referencesAll;
			}
			source = nextSource;
			curPage++;
		}

		// The loop adds a page before moving on, so the last fetched page is still pending.
		referencesAll.addAll(getOnePageRecordContent(source));
		return referencesAll;
	}

	/**
	 * Tells whether a table cell belongs to a record row, i.e. whether its
	 * {@code bgcolor} attribute is {@code #ffffff} or {@code #f1f7fe}.
	 */
	private static boolean isRecordCell(Node node) {
		TableColumn tag = new TableColumn();
		tag.setText(node.toHtml());
		String bgcolor = tag.getAttribute("bgcolor");
		return "#ffffff".equals(bgcolor) || "#f1f7fe".equals(bgcolor);
	}

	/**
	 * Builds the detail-page URL from the first {@code INPUT} child of the link
	 * cell and asks {@link ParseCnkiDetail} for the title found there.
	 *
	 * <p>The input's {@code value} is split on {@code '!'}: the second part is the
	 * file name, and the database name is the first four characters of the first
	 * part followed by characters 4..8 of the file name.
	 *
	 * @param node the link cell of a record
	 * @return the title, or {@code null} when the detail parser returns none
	 * @throws Exception if the cell does not have the expected structure; the
	 *         caller logs it and stops parsing the page
	 */
	private static String fetchCitingTitle(Node node) throws Exception {
		NodeList inputs = node.getChildren()
				.extractAllNodesThatMatch(NodeFilters.getSingleFilter("INPUT"));
		InputTag inputTag = new InputTag();
		inputTag.setText(inputs.elementAt(0).toHtml());
		String[] fileNamesValue = inputTag.getAttribute("value").split("!");
		String fileName = fileNamesValue[1];
		String dbName = fileNamesValue[0].substring(0, 4) + fileName.substring(4, 8);
		String detailUrl = DETAIL_URL_PREFIX + fileName + "&dbname=" + dbName;
		return new ParseCnkiDetail().getTitleDetail(detailUrl);
	}

	/**
	 * Finds the "next page" anchor among the given links and returns its
	 * absolute URL.
	 *
	 * @param linkList       anchors of the current page
	 * @param collectedCount number of records collected so far; only used in the
	 *                       log message written when the anchor has no href
	 * @return the URL of the next page, or {@code null} (after logging) when no
	 *         usable link exists
	 */
	private static String findNextPageUrl(NodeList linkList, int collectedCount) {
		for (Node linkNode : linkList.toNodeArray()) {
			Node lastChild = linkNode.getLastChild();
			// An anchor without children has no last child; skip it instead of throwing.
			if (lastChild == null || !NEXT_PAGE_LABEL.equals(lastChild.getText())) {
				continue;
			}
			LinkTag linkTag = new LinkTag();
			linkTag.setText(linkNode.toHtml());
			String href = linkTag.getAttribute("href");
			if (href == null) {
				LogEntity.logFile.log("没有得到" + collectedCount / 20 + "的链接");
				return null; // never build "...Brief.aspxnull"
			}
			return BRIEF_URL_PREFIX + href;
		}
		LogEntity.logFile.log("没有得到下一页链接");
		return null;
	}

	/**
	 * Downloads a page with the cookies held in {@code GetFormContent.cookies},
	 * retrying up to three times, and decodes the body as UTF-8.
	 *
	 * @param url absolute URL of the page
	 * @return the page content, or {@code null} (after logging) when the request
	 *         fails, does not answer 200 OK, or has no body
	 */
	private static String fetchPage(String url) {
		HttpClient client = new HttpClient();
		HttpState initialState = new HttpState(); // carries the session cookies
		initialState.addCookies(GetFormContent.cookies);
		client.setState(initialState);

		GetMethod method = null;
		try {
			// Constructed inside the try so that a malformed URL is logged rather than propagated.
			method = new GetMethod(url);
			method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
					new DefaultHttpMethodRetryHandler(3, false));
			int statusCode = client.executeMethod(method);
			if (statusCode != HttpStatus.SC_OK) {
				LogEntity.logFile.log("Method failed: " + method.getStatusLine());
				System.err.println("Method failed: " + method.getStatusLine());
				return null;
			}
			byte[] responseBody = method.getResponseBody();
			if (responseBody == null) {
				LogEntity.logFile.log("Method failed: empty response body from " + url);
				return null;
			}
			return new String(responseBody, "UTF-8");
		} catch (Exception e) {
			LogEntity.logFile.log(e, "happened in get next page content");
			e.printStackTrace();
			return null;
		} finally {
			if (method != null) {
				method.releaseConnection();
			}
		}
	}

	/**
	 * Converts a page-number attribute to an int.
	 *
	 * @param value raw attribute value, possibly {@code null}
	 * @return the parsed number, or 1 when the value is missing or not numeric
	 */
	private static int parsePageNumber(String value) {
		if (value == null) {
			return 1;
		}
		try {
			return Integer.parseInt(value.trim());
		} catch (NumberFormatException ignored) {
			// Same default as when the hidden input is absent: treat as page 1.
			return 1;
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -