⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 login.java

📁 用来为垂直搜索引擎抓取数据的采集系统
💻 JAVA
字号:
package org.indigo.parser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
import org.indigo.parser.Parser;

/**
 * Static HTTP helpers for fetching pages that sit behind a login form or that
 * are reached through a form POST.
 *
 * <p>Every call creates its own {@link HttpClient}. I/O and protocol failures
 * are reported on {@code System.err} and signalled to the caller by returning
 * {@code null}; they are never propagated as exceptions.
 */
public class Login {

	/**
	 * Fetches a page that requires a prior login.
	 *
	 * <p>The login form is submitted first; the cookies stored by that
	 * exchange are then copied into a second client which requests
	 * {@code url}. The login response body itself is not needed and is
	 * discarded. Both requests use {@code gb2312} as the content charset,
	 * and the second request is sent as a POST without a request body.
	 *
	 * @param formUrl  value of the {@code action} attribute of the login form
	 * @param data     the form fields to submit for logging in
	 * @param url      URL of the page to fetch once logged in
	 * @return the body of the page at {@code url} as returned by
	 *         {@code getResponseBodyAsString()}, or {@code null} if either
	 *         request failed
	 */
	public static String getHtmlByLogin(String formUrl,NameValuePair[] data,String url)
	{
		HttpClient loginClient = new HttpClient();
		loginClient.getParams().setContentCharset("gb2312");
		PostMethod loginPost = new PostMethod(formUrl);
		loginPost.setRequestBody(data);

		Cookie[] sessionCookies;
		try {
			int statusCode = loginClient.executeMethod(loginPost);
			// HttpClient does not follow redirects automatically for
			// entity-enclosing requests such as POST and PUT, so a 301/302
			// is only reported here.
			reportRedirect(loginPost, statusCode);
			// After logging in, take the cookies so that the target page
			// can be requested with the same session.
			sessionCookies = loginClient.getState().getCookies();
		} catch (HttpException e) {
			reportFailure(formUrl, e);
			return null;
		} catch (IOException e) {
			reportFailure(formUrl, e);
			return null;
		} finally {
			// Always hand the connection back, including on failure.
			loginPost.releaseConnection();
		}

		HttpState pageState = new HttpState();
		for (Cookie sessionCookie : sessionCookies) {
			pageState.addCookie(sessionCookie);
		}
		HttpClient pageClient = new HttpClient();
		pageClient.setState(pageState);
		pageClient.getParams().setContentCharset("gb2312");

		PostMethod pagePost = new PostMethod(url);
		try {
			pageClient.executeMethod(pagePost);
			return pagePost.getResponseBodyAsString();
		} catch (HttpException e) {
			reportFailure(url, e);
			return null;
		} catch (IOException e) {
			reportFailure(url, e);
			return null;
		} finally {
			pagePost.releaseConnection();
		}
	}

	/**
	 * Fetches a page by submitting a form POST; intended for pages whose
	 * paging is driven by JavaScript or a form.
	 *
	 * <p>The request is sent with the header
	 * {@code Content-Type: application/x-www-form-urlencoded; charset=UTF-8}.
	 * The response is decoded with the charset reported by the response and
	 * returned as the concatenation of its lines, each trimmed, with the
	 * line terminators removed.
	 *
	 * @param url   URL of the page to request
	 * @param data  the form fields required by that page
	 * @return the concatenated, trimmed response lines (possibly empty), or
	 *         {@code null} if the request failed or the response carried no
	 *         body
	 */
	public static String getHtmlByPost(String url,NameValuePair[] data) {
		HttpClient httpClient = new HttpClient();
		PostMethod postMethod = new PostMethod(url);
		postMethod.addRequestHeader("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");
		postMethod.setRequestBody(data);
		try {
			int statusCode = httpClient.executeMethod(postMethod);
			String sourceCode = readTrimmedLines(postMethod);
			// HttpClient does not follow redirects automatically for
			// entity-enclosing requests such as POST and PUT, so a 301/302
			// is only reported here.
			reportRedirect(postMethod, statusCode);
			return sourceCode;
		} catch (HttpException e) {
			reportFailure(url, e);
			return null;
		} catch (IOException e) {
			reportFailure(url, e);
			return null;
		} finally {
			// Always hand the connection back, including on failure.
			postMethod.releaseConnection();
		}
	}

	/**
	 * Reads the response body of an executed method line by line and joins
	 * the trimmed lines without separators.
	 *
	 * @return the joined text, or {@code null} when the response has no body
	 *         stream
	 */
	private static String readTrimmedLines(PostMethod method) throws IOException {
		InputStream bodyStream = method.getResponseBodyAsStream();
		if (bodyStream == null) {
			// No body stream is available; avoid a NullPointerException.
			return null;
		}
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(bodyStream, method.getResponseCharSet()));
		try {
			StringBuilder text = new StringBuilder();
			for (String line = reader.readLine(); line != null; line = reader.readLine()) {
				text.append(line.trim());
			}
			return text.toString();
		} finally {
			reader.close();
		}
	}

	/**
	 * Prints the redirect target of a 301/302 response, or a notice on
	 * {@code System.err} when the response has no {@code location} header.
	 * Other status codes produce no output.
	 */
	private static void reportRedirect(PostMethod method, int statusCode) {
		if (statusCode != HttpStatus.SC_MOVED_PERMANENTLY
				&& statusCode != HttpStatus.SC_MOVED_TEMPORARILY) {
			return;
		}
		// Take the redirect target from the response headers.
		Header locationHeader = method.getResponseHeader("location");
		if (locationHeader != null) {
			System.out.println("The page was redirected to:" + locationHeader.getValue());
		} else {
			System.err.println("Location field value is null.");
		}
	}

	/** Reports a failed request on {@code System.err} instead of dropping it silently. */
	private static void reportFailure(String target, IOException cause) {
		System.err.println("Request to " + target + " failed: " + cause);
	}
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -