// File: getcnkireference.java
package cn.ac.cintcm.spider.cnki;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.httpclient.NameValuePair;
import cn.ac.cintcm.spider.FormParameter;
/**
 * Job that submits the CNKI search form once per configured query value, parses
 * the returned page(s) with {@link ParseCnkiReference} and stores the resulting
 * records through {@link MSCnkiReference}.
 *
 * <p>The search settings ({@code queryField}, {@code yearStart}, {@code yearEnd},
 * {@code url}, {@code url2}) are <em>static</em> and are assigned by the
 * config-file constructor, so the most recently constructed config-based
 * instance determines the values used by every instance.
 */
public class GetCnkiReference extends Get2FormContent {

    /** Charset the form values are encoded in before being sent. */
    private static final String FORM_CHARSET = "utf-8";

    /** Single-byte charset used to carry the encoded bytes inside a String. */
    private static final String CARRIER_CHARSET = "ISO-8859-1";

    /** Query value used when the config has no {@code advancedvalue1} entry. */
    private static final String DEFAULT_QUERY_VALUE = "中医文献杂志";

    /** Default search-form URL; replaced only when the config supplies {@code url}. */
    private static String url = "http://lsg.cnki.net/grid20/Navigator.aspx?id=6";

    /** Default second URL; replaced only when the config supplies {@code url2}. */
    private static String url2 = "http://lsg.cnki.net/grid20/Brief.aspx?ID=6&classtype=&systemno=&NaviDatabaseName=&NaviField=";

    /** Names of the hidden form parameters handed to every {@link FormParameter}. */
    private static final List<String> hidden = getHiddenParams();

    /** Value sent as {@code advancedfield1}; read from the config. */
    private static String queryField;

    /** Values sent one at a time as {@code advancedvalue1} (journal names). */
    private List<String> queryValue = new ArrayList<String>();

    /** Value sent as {@code RealYearStart}; read from the config. */
    private static String yearStart;

    /** Value sent as {@code RealYearEnd}; read from the config. */
    private static String yearEnd;

    private MSCnkiReference mr;
    private ParseCnkiReference pc;

    /**
     * Creates a job from a config file loaded with {@code ConfigUtil.loadJobConfig}.
     *
     * <p>Keys read: {@code advancedfield1}, {@code advancedvalue1} (comma-separated;
     * falls back to a single built-in journal name when absent),
     * {@code RealYearStart}, {@code RealYearEnd}, {@code url}, {@code url2}.
     * When the loader returns {@code null} nothing is configured and
     * {@link #process(String)} has no query values to iterate over.
     *
     * @param configFile name of the job config file
     * @throws IOException declared for the config loader / superclass
     */
    public GetCnkiReference(String configFile) throws IOException {
        super("");
        Map<?, ?> config = ConfigUtil.loadJobConfig(configFile);
        if (config == null) {
            return;
        }

        queryField = (String) config.get("advancedfield1");

        String advancedvalue1 = (String) config.get("advancedvalue1");
        if (advancedvalue1 != null) {
            queryValue.addAll(Arrays.asList(advancedvalue1.split(",")));
        } else {
            queryValue.add(DEFAULT_QUERY_VALUE);
        }

        yearStart = (String) config.get("RealYearStart");
        yearEnd = (String) config.get("RealYearEnd");

        // Keep the built-in defaults when the config does not override them;
        // assigning unconditionally would replace a usable URL with null.
        String configuredUrl = (String) config.get("url");
        if (configuredUrl != null) {
            url = configuredUrl;
        }
        String configuredUrl2 = (String) config.get("url2");
        if (configuredUrl2 != null) {
            url2 = configuredUrl2;
        }
    }

    /**
     * Creates an instance bound to already-built form parameters; used by
     * {@link #process(String)} to fetch the content for one query value.
     *
     * @param formParameters form parameters passed to the superclass
     */
    public GetCnkiReference(FormParameter formParameters) {
        super(formParameters);
    }

    /**
     * Command-line entry point: asks on standard input for the output database
     * file name (default {@code web.mdb} when the answer is blank), then runs
     * the job configured by {@code cnki.txt}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String outDb = "web.mdb";
            String userInput = getInputParameter();
            if (userInput != null && userInput.trim().length() > 0) {
                outDb = userInput;
            }
            new GetCnkiReference("cnki.txt").process(outDb);
        } catch (IOException e) {
            LogEntity.logFile.log(e, "IOException");
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            LogEntity.logFile.log(e, "ClassNotFoundException");
            e.printStackTrace();
        }
    }

    /**
     * Prompts for and reads the name of the Access file that receives the results.
     *
     * @return the line typed by the user, or {@code null} when standard input is
     *         exhausted or reading fails (the failure is logged and printed)
     */
    private static String getInputParameter() {
        System.out.println("请先准备好一个空的保存抓取结果的Access文件后输入该文件名:\n");
        // The reader is intentionally not closed: closing it would close System.in.
        BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
        try {
            return stdin.readLine();
        } catch (IOException e) {
            LogEntity.logFile.log(e, "happened in reading the access file");
            System.out.println("IOException: " + e);
            return null;
        }
    }

    /**
     * Runs the whole job: recreates the output table, then fetches, parses and
     * stores the records for every configured query value. Progress is written
     * to standard output and to the log file.
     *
     * @param msaccess name of the output database file
     * @throws IOException            if fetching or reading a result page fails
     * @throws ClassNotFoundException declared for the database helper
     */
    public void process(String msaccess) throws IOException, ClassNotFoundException {
        mr = new MSCnkiReference(msaccess);
        pc = new ParseCnkiReference();
        recreateTable();

        System.out.println("开始抓取...");
        for (String journal : queryValue) { // one search per journal name
            fetchAndStore(journal);
        }
        System.out.println("全部抓取完成.");
        LogEntity.logFile.log("全部抓取完成.\n\n");
    }

    /**
     * Drops and re-creates the output table. A failure is logged and the job
     * continues (best effort, as before).
     */
    private void recreateTable() {
        try {
            try {
                mr.dropTable();
            } finally {
                // Always attempt the create, even when the drop failed. On a fresh,
                // empty database the drop presumably fails because the table does
                // not exist yet (dropTable is not visible here - confirm); sharing
                // one plain try block would then skip createTable entirely.
                mr.createTable();
            }
        } catch (SQLException se) {
            LogEntity.logFile.log(se, "happened in drop or create table");
            se.printStackTrace();
        }
    }

    /**
     * Submits the search form for one journal, parses the answer and stores the
     * records. Missing content or an empty parse result is reported and skipped.
     *
     * @param journal the query value sent as {@code advancedvalue1}
     */
    private void fetchAndStore(String journal) throws IOException, ClassNotFoundException {
        FormParameter params = newFormParameter();
        List<NameValuePair> values = new ArrayList<NameValuePair>();
        values.add(new NameValuePair("advancedvalue1", toCarrierString(journal)));
        values.addAll(getSomeFormParams());
        params.setNameValues(values);

        String scope = journal + " " + yearStart + "-" + yearEnd;
        System.out.println("开始抓取 " + scope + " 年的记录...");
        LogEntity.logFile.log("开始抓取 " + scope + " 年的记录...");

        // Content of the first result page.
        InputStream result = new GetCnkiReference(params).getContent();
        if (result == null) {
            LogEntity.logFile.log("没有得到 " + journal + " 的查询结果。您输入的期刊不存在,或者网络忙!\n");
            return;
        }

        String page;
        try {
            page = GetFormContent.slurp(result);
        } finally {
            // Whether slurp closes the stream is not visible here; close() is
            // specified to be a no-op on an already closed stream.
            result.close();
        }

        List<CnkiReference> records = pc.getPagesContent(page);
        // Null must be tested first: the reverse order dereferences a null list.
        if (records == null || records.isEmpty()) {
            System.out.println("没有得到解析的结果,可能原因:解析 " + journal + " 时出错;输入的期刊不存在\n");
            LogEntity.logFile.log("没有得到解析的结果,可能原因:解析 " + journal + " 时出错;输入的期刊不存在\n");
            return;
        }

        mr.addRecords(records);
        System.out.println(scope + " 年完成 , 共 " + records.size() + " 条记录 \n");
        LogEntity.logFile.log(scope + " 年完成 , 共 " + records.size() + " 条记录 \n");
    }

    /**
     * Re-interprets the UTF-8 bytes of {@code text} as ISO-8859-1 characters.
     * Presumably the HTTP layer encodes parameters as ISO-8859-1, so this makes
     * the original UTF-8 bytes go out on the wire unchanged - confirm against
     * {@code Get2FormContent}.
     */
    private static String toCarrierString(String text) throws UnsupportedEncodingException {
        return new String(text.getBytes(FORM_CHARSET), CARRIER_CHARSET);
    }

    /** Builds a {@link FormParameter} preloaded with both URLs and the hidden parameter names. */
    private static FormParameter newFormParameter() {
        FormParameter params = new FormParameter();
        params.setUrl(url);
        params.setUrl2(url2);
        params.setHiddenParameters(hidden);
        return params;
    }

    /**
     * Builds the form parameters shared by every search, in this order:
     * {@code advancedfield1}, {@code RealYearStart}, {@code RealYearEnd}, ten
     * {@code selectbox} entries ("A" through "J"), {@code hdnIsAll=true} and
     * {@code order=dec}.
     *
     * @throws IllegalStateException if {@code advancedfield1} was never configured
     */
    private static List<NameValuePair> getSomeFormParams() throws UnsupportedEncodingException {
        if (queryField == null) {
            // Fail with a readable message instead of a bare NullPointerException.
            throw new IllegalStateException("advancedfield1 is not configured; cannot build the search form");
        }

        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new NameValuePair("advancedfield1", toCarrierString(queryField)));
        list.add(new NameValuePair("RealYearStart", yearStart));
        list.add(new NameValuePair("RealYearEnd", yearEnd));
        for (char option = 'A'; option <= 'J'; option++) {
            list.add(new NameValuePair("selectbox", String.valueOf(option)));
        }
        list.add(new NameValuePair("hdnIsAll", "true"));
        list.add(new NameValuePair("order", "dec"));
        return list;
    }

    /** Returns a fresh, mutable list of the hidden form parameter names. */
    private static List<String> getHiddenParams() {
        return new ArrayList<String>(Arrays.asList(
                "ID",
                "NaviField",
                "NaviDatabaseName",
                "hdnFathorCode",
                "strNavigatorValue",
                "SearchFieldRelationDirectory",
                "strNavigatorName",
                "bCurYearTempDB",
                "fieldnowordfrequency",
                "searchmatch",
                "RecordsPerPage",
                "TableType",
                "display",
                "encode",
                "TablePrefix",
                "View",
                "yearFieldName",
                "VarNum"));
    }
}
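/*
 * Code-viewer page residue (not part of the program) - keyboard shortcuts:
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */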