⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 getcnkireference.java

📁 本程序是专门用于从网页上自动收集cmi,cnki上的被引文献的数据
💻 JAVA
字号:
package cn.ac.cintcm.spider.cnki;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.NameValuePair;
import cn.ac.cintcm.spider.FormParameter;

/**
 * Submits the CNKI search form once per configured journal name, parses the returned
 * page into {@code CnkiReference} records and hands them to {@link MSCnkiReference}
 * for storage.
 *
 * <p>The query settings ({@code queryField}, year range, {@code url}, {@code url2}) are
 * static and are written by the config-file constructor, so they are shared by every
 * instance in the JVM. The class is not designed for concurrent use.
 */
public class GetCnkiReference extends Get2FormContent{
	/** Search form URL; used unless the config file supplies a "url" entry. */
	private static String url = "http://lsg.cnki.net/grid20/Navigator.aspx?id=6";
	/** Result page URL; used unless the config file supplies a "url2" entry. */
	private static String url2 = "http://lsg.cnki.net/grid20/Brief.aspx?ID=6&classtype=&systemno=&NaviDatabaseName=&NaviField=";
	/** Names of the hidden form fields passed to {@code FormParameter.setHiddenParameters}. */
	private static final List<String> hidden =getHiddenParams();
	/** Value sent as the "advancedfield1" form parameter. */
	private static String queryField;
	/** Values sent, one request each, as the "advancedvalue1" form parameter. */
	private List<String> queryValue = new ArrayList<String>();
	private static String yearStart;
	private static String yearEnd;
	private MSCnkiReference mr;
	private ParseCnkiReference pc;

	/**
	 * Creates a collector configured from a job config file.
	 *
	 * <p>If {@code ConfigUtil.loadJobConfig} returns {@code null} nothing is configured
	 * and {@link #process(String)} will issue no queries.
	 *
	 * @param configFile name of the config file passed to {@code ConfigUtil.loadJobConfig}
	 * @throws IOException if the config is loaded but has no "advancedfield1" entry
	 *         (every query needs it), or if loading the config fails with an I/O error
	 */
	public GetCnkiReference(String configFile) throws IOException{
		super("");
		Map<?, ?> config = ConfigUtil.loadJobConfig(configFile);
		if (config == null) {
			return;
		}

		// Fail here with a clear message instead of a NullPointerException later,
		// after the output table has already been dropped.
		String field = (String) config.get("advancedfield1");
		if (field == null) {
			throw new IOException("Missing required setting 'advancedfield1' in " + configFile);
		}
		queryField = field;

		String journals = (String) config.get("advancedvalue1");
		if (journals != null) {
			queryValue.addAll(Arrays.asList(journals.split(",")));
		} else {
			queryValue.add("中医文献杂志");
		}

		yearStart = (String) config.get("RealYearStart");
		yearEnd = (String) config.get("RealYearEnd");

		// Keep the built-in default URLs when the config does not override them.
		String configuredUrl = (String) config.get("url");
		if (configuredUrl != null) {
			url = configuredUrl;
		}
		String configuredUrl2 = (String) config.get("url2");
		if (configuredUrl2 != null) {
			url2 = configuredUrl2;
		}
	}

	/**
	 * Creates an instance for a single prepared form submission.
	 *
	 * @param formParameters the form description handed to the superclass
	 */
	public GetCnkiReference(FormParameter formParameters) {
		super(formParameters);
	}

	/**
	 * Command-line entry point: asks for the output file name on the console
	 * (falling back to "web.mdb" when the answer is blank) and runs the job
	 * described by "cnki.txt".
	 *
	 * @param args ignored
	 */
	public static void main(String[] args){
		try {
			String outDb = "web.mdb";
			String userInput = getInputParameter();
			if (userInput != null && userInput.trim().length() > 0) {
				outDb = userInput;
			}
			(new GetCnkiReference("cnki.txt")).process(outDb);
		} catch (IOException e) {
			LogEntity.logFile.log(e, "IOException");
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			LogEntity.logFile.log(e, "ClassNotFoundException");
			e.printStackTrace();
		}
	}

	/**
	 * Prompts on standard output and reads one line from standard input.
	 *
	 * @return the line entered, or {@code null} on end of input or read failure
	 */
	private static String getInputParameter() {
		System.out.println("请先准备好一个空的保存抓取结果的Access文件后输入该文件名:\n");
		String parameter = null;
		try {
			// System.in is deliberately left open; closing the reader would close it.
			BufferedReader inStream = new BufferedReader(new InputStreamReader(System.in));
			parameter = inStream.readLine();
		} catch (IOException e) {
			LogEntity.logFile.log(e, "happened in reading the access file");
			System.out.println("IOException: " + e);
		}
		return parameter;
	}

	/**
	 * Recreates the output table, then queries each configured journal and stores
	 * the parsed records. A journal whose request returns no content, or whose page
	 * yields no records, is logged and skipped.
	 *
	 * @param msaccess name of the output database file passed to {@link MSCnkiReference}
	 * @throws IOException if a request fails with an I/O error
	 * @throws ClassNotFoundException propagated from the storage layer
	 */
	public void process(String msaccess)throws IOException, ClassNotFoundException{
		mr = new MSCnkiReference(msaccess);
		pc = new ParseCnkiReference();
		try{
			// NOTE(review): if dropTable() throws (e.g. the table does not exist yet),
			// createTable() is skipped - confirm dropTable() tolerates a missing table.
			mr.dropTable();
			mr.createTable();
		}catch(SQLException se){
			LogEntity.logFile.log(se,"happened in drop  or create table");
			se.printStackTrace();
		}

		System.out.println("开始抓取...");
		for (String journal : queryValue) {
			FormParameter params = newQuery(journal);

			System.out.println("开始抓取 "+ journal + " "+yearStart+"-"+yearEnd+" 年的记录...");
			LogEntity.logFile.log("开始抓取 "+ journal + " "+yearStart+"-"+yearEnd+" 年的记录...");

			InputStream result = (new GetCnkiReference(params)).getContent();
			if (result == null) {
				LogEntity.logFile.log("没有得到 "+journal+" 的查询结果。您输入的期刊不存在,或者网络忙!\n");
				continue;
			}

			String page;
			try {
				page = GetFormContent.slurp(result);
			} finally {
				// Always release the response stream, even when reading fails.
				closeQuietly(result);
			}

			List<CnkiReference> list = pc.getPagesContent(page);
			// Null must be tested first; the original order dereferenced a null list.
			if (list == null || list.isEmpty()) {
				System.out.println("没有得到解析的结果,可能原因:解析 "+journal+" 时出错;输入的期刊不存在\n");
				LogEntity.logFile.log("没有得到解析的结果,可能原因:解析 "+journal+" 时出错;输入的期刊不存在\n");
				continue;
			}

			mr.addRecords(list);
			System.out.println(journal + " "+yearStart+"-"+yearEnd+" 年完成 , 共 "+list.size()+" 条记录 \n");
			LogEntity.logFile.log(journal + " "+yearStart+"-"+yearEnd+" 年完成 , 共 "+list.size()+" 条记录 \n");
		}
		System.out.println("全部抓取完成.");
		LogEntity.logFile.log("全部抓取完成.\n\n");
	}

	/** Builds the complete form description for one journal query. */
	private static FormParameter newQuery(String journal) throws UnsupportedEncodingException {
		List<NameValuePair> values = new ArrayList<NameValuePair>();
		values.add(new NameValuePair("advancedvalue1", toWireForm(journal)));
		values.addAll(getSomeFormParams());

		FormParameter params = newFormParameter();
		params.setNameValues(values);
		return params;
	}

	/**
	 * Re-decodes the UTF-8 bytes of {@code text} as ISO-8859-1, so every char of the
	 * result carries exactly one UTF-8 byte.
	 * NOTE(review): presumably the HTTP layer encodes parameters as ISO-8859-1 and this
	 * makes the server receive UTF-8 - confirm against Get2FormContent.
	 */
	private static String toWireForm(String text) throws UnsupportedEncodingException {
		return new String(text.getBytes("utf-8"), "ISO-8859-1");
	}

	/** Closes the stream, logging rather than propagating a failure to close. */
	private static void closeQuietly(InputStream stream) {
		try {
			stream.close();
		} catch (IOException e) {
			LogEntity.logFile.log(e, "happened in closing the response stream");
		}
	}

	/** Creates a form description pre-filled with the two URLs and the hidden field names. */
	private static FormParameter newFormParameter() {
		FormParameter params = new FormParameter();
		params.setUrl(url);
		params.setUrl2(url2);
		params.setHiddenParameters(hidden);
		return params;
	}

	/**
	 * Builds the form parameters that are identical for every journal: the search
	 * field, the year range, the "selectbox" values A through J, "hdnIsAll" and "order".
	 *
	 * @return a new list, in the order the parameters are listed above
	 * @throws UnsupportedEncodingException if a required charset is unavailable
	 */
	private  static List<NameValuePair> getSomeFormParams() throws UnsupportedEncodingException{
		List<NameValuePair> list = new ArrayList<NameValuePair>();
		list.add(new NameValuePair("advancedfield1", toWireForm(queryField)));
		list.add(new NameValuePair("RealYearStart", yearStart));
		list.add(new NameValuePair("RealYearEnd", yearEnd));
		for (char box = 'A'; box <= 'J'; box++) {
			list.add(new NameValuePair("selectbox", String.valueOf(box)));
		}
		list.add(new NameValuePair("hdnIsAll", "true"));
		list.add(new NameValuePair("order", "dec"));
		return list;
	}

	/**
	 * Lists the names of the hidden form fields.
	 *
	 * @return a new mutable list of field names
	 */
	private static  List<String>  getHiddenParams(){
		return new ArrayList<String>(Arrays.asList(
				"ID",
				"NaviField",
				"NaviDatabaseName",
				"hdnFathorCode",
				"strNavigatorValue",
				"SearchFieldRelationDirectory",
				"strNavigatorName",
				"bCurYearTempDB",
				"fieldnowordfrequency",
				"searchmatch",
				"RecordsPerPage",
				"TableType",
				"display",
				"encode",
				"TablePrefix",
				"View",
				"yearFieldName",
				"VarNum"));
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -