⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 urlreader.java

📁 是个Java写的spider,非常不错!能承受很大的压力,每天采集的数量在10000万
💻 JAVA
字号:
package cn.yicha.subject.spider.fecther;

import java.io.*;
import java.net.*;

/**
 * Static helpers that fetch the content of a URL over HTTP, optionally
 * through a proxy, while presenting mobile-handset (WAP) request headers.
 *
 * <p>Proxy selection is done by writing the {@code proxySet},
 * {@code proxyHost} and {@code proxyPort} system properties, so it affects
 * the whole JVM rather than a single request. The Content-Type of the most
 * recent {@code readUrlContentByProxy} response is kept in a shared static
 * field without synchronization; when several threads fetch concurrently the
 * last writer wins.
 */
public class URLReader 
{
	/** Content-Type header of the last response fetched by readUrlContentByProxy. */
	private static String _content_type = "";

	/** Charset name used when a response is (or is forced to be) decoded as UTF-8. */
	private static final String UTF_CHARSET = "utf-8";

	/** User-Agent sent when the caller does not supply one. */
	private static final String DEFAULT_USER_AGENT = "Nokia6108/1.0 (05.04) Profile/MIDP-1.0 Configuration/CLDC-1.0";

	/**
	 * Reads the content of the given URL and returns it as text.
	 *
	 * @param url     the URL to fetch, e.g. http://www.1233.net
	 * @param withUTF {@code true} to decode the response as UTF-8,
	 *                {@code false} to decode it with the platform default charset
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUrlContent(String url, boolean withUTF) throws IOException
	{
		System.getProperties().put( "proxySet", "false" );

		StringBuffer result = new StringBuffer();
		BufferedReader in = null;

		try {
			HttpURLConnection conn = openConnection(url);
			conn.setRequestProperty("User-Agent", "Nokia7650/1.0 symbianOS/6.1 series60/0.9 Profile/MIDP-1.0 Configuration/CLDC-1.0");

			in = openReader(conn, withUTF);
			readLines(in, result);
		}
		finally {
			if (in != null) {
				in.close();
			}
		}
		return result.toString();
	}

	/**
	 * Sends an HTTP request to the given URL and discards the response.
	 *
	 * <p>This is best-effort: an I/O failure while connecting is printed to
	 * standard error and not propagated.
	 *
	 * @param url the URL to request, e.g. http://www.1233.net
	 * @throws IOException if closing the response stream fails
	 */
	public static void sendUrlRequest(String url) throws IOException
	{
		System.getProperties().put( "proxySet", "false" );

		InputStream response = null;

		try {
			// getInputStream() is what actually sends the request.
			response = openConnection(url).getInputStream();
		}
		catch (IOException ex) {
			ex.printStackTrace();
		}
		finally {
			// The stream must be closed even though its content is not read,
			// otherwise the underlying connection is leaked.
			if (response != null) {
				response.close();
			}
		}
	}

	/**
	 * Reads the content of the given URL, always decoding it as UTF-8.
	 *
	 * @param url the URL to fetch, e.g. http://www.1233.net
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUTFUrlContent(String url) throws IOException {
		return readUrlContent(url, true);
	}

	/**
	 * Reads the content of the given URL with the default user agent.
	 *
	 * <p>Delegates to {@code readUrlContentByProxy} with empty proxy host and
	 * port strings and without forcing UTF-8.
	 *
	 * @param url the URL to fetch, e.g. http://www.1233.net
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUrlContent(String url) throws IOException {
		return readUrlContentByProxy(url, "", "", false);
	}

	/**
	 * Reads the content of the given URL, sending the given user agent.
	 *
	 * <p>Delegates to {@code readUrlContentByProxy} with empty proxy host and
	 * port strings.
	 *
	 * @param url       the URL to fetch, e.g. http://www.1233.net
	 * @param userAgent value of the User-Agent request header
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUrlContent(String url, String userAgent) throws IOException {
		return readUrlContentByProxy(url, "", "", userAgent);
	}

	/**
	 * Requests the given URL and returns the session cookie from the response.
	 *
	 * <p>The response body is read to the end and discarded. The returned
	 * value is the first {@code Set-Cookie} header up to (not including) its
	 * first {@code ';'}, or the whole header value when it carries no
	 * attributes. Failures while fetching are printed to standard error and
	 * result in {@code null}.
	 *
	 * @param url the URL to fetch, e.g. http://www.1233.net
	 * @return the {@code name=value} part of the Set-Cookie header, or
	 *         {@code null} if there is no such header or the request failed
	 * @throws IOException if closing the response reader fails
	 */
	public static String readUrlSessionID(String url) throws IOException {
		BufferedReader in = null;
		String sessionId = null;

		try {
			HttpURLConnection conn = openConnection(url);

			in = openReader(conn, false);
			readLines(in, null);

			// Extract the session id from the response headers.
			String cookieValue = conn.getHeaderField("Set-Cookie");
			if (cookieValue != null) {
				// A cookie without attributes has no ';' - take it whole
				// instead of failing on substring(0, -1).
				int attributesStart = cookieValue.indexOf(';');
				sessionId = attributesStart >= 0
					? cookieValue.substring(0, attributesStart)
					: cookieValue;
				System.out.println("session id --> " + sessionId);
			}
		}
		catch (Exception ex) {
			// Deliberately broad: any failure means "no session id".
			ex.printStackTrace();
		}
		finally {
			if (in != null) {
				in.close();
			}
		}
		return sessionId;
	}

	/**
	 * Reads the content of the given URL, sending the given session cookie.
	 *
	 * <p>Failures while fetching are printed to standard error; whatever was
	 * read before the failure is still returned.
	 *
	 * @param url       the URL to fetch, e.g. http://www.1233.net
	 * @param sessionId value sent as the {@code Cookie} request header
	 * @return the response body read so far, with every line terminated by
	 *         {@code "\n"}; empty if nothing could be read
	 * @throws IOException if closing the response reader fails
	 */
	public static String getUrlContentBySessionID(String url, String sessionId) throws IOException {
		StringBuffer result = new StringBuffer();
		BufferedReader in = null;

		try {
			HttpURLConnection conn = openConnection(url);
			conn.setRequestProperty("Cookie", sessionId);
			System.out.println("sending session id --> " + sessionId);

			in = openReader(conn, false);
			readLines(in, result);
		}
		catch (Exception ex) {
			// Deliberately broad: the partial result is returned on any failure.
			ex.printStackTrace();
		}
		finally {
			if (in != null) {
				in.close();
			}
		}
		return result.toString();
	}

	/**
	 * Reads the content of the given URL through a proxy, using the default
	 * user agent.
	 *
	 * @param url       the URL to fetch, e.g. http://www.1233.net
	 * @param proxyHost proxy host name or address
	 * @param proxyPort proxy port
	 * @param bForceUTF {@code true} to decode the response as UTF-8 regardless
	 *                  of its Content-Type
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUrlContentByProxy(String url, String proxyHost, String proxyPort, boolean bForceUTF) throws IOException
	{
		return readUrlContentByProxy(url, proxyHost, proxyPort, DEFAULT_USER_AGENT, bForceUTF);
	}

	/**
	 * Reads the content of the given URL through a proxy, sending the given
	 * user agent and without forcing UTF-8 decoding.
	 *
	 * @param url       the URL to fetch, e.g. http://www.1233.net
	 * @param proxyHost proxy host name or address
	 * @param proxyPort proxy port
	 * @param userAgent value of the User-Agent request header
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUrlContentByProxy(String url, String proxyHost, String proxyPort, String userAgent) throws IOException
	{
		return readUrlContentByProxy(url, proxyHost, proxyPort, userAgent, false);
	}

	/**
	 * Reads the content of the given URL through a proxy.
	 *
	 * <p>When both {@code proxyHost} and {@code proxyPort} are non-null they
	 * are written to the {@code proxyHost}/{@code proxyPort} system properties
	 * and {@code proxySet} is set to {@code "true"}; otherwise only
	 * {@code proxySet} is set to {@code "false"}. The request carries a fixed
	 * set of WAP-gateway style headers. The response Content-Type is stored
	 * and can be retrieved with {@link #get_content_type()}.
	 *
	 * <p>The body is decoded as UTF-8 when {@code bForceUtf} is set or the
	 * Content-Type mentions "utf-8"; otherwise the platform default charset
	 * is used.
	 *
	 * @param url       the URL to fetch, e.g. http://www.1233.net
	 * @param proxyHost proxy host name or address
	 * @param proxyPort proxy port
	 * @param userAgent value of the User-Agent request header
	 * @param bForceUtf {@code true} to decode the response as UTF-8 regardless
	 *                  of its Content-Type
	 * @return the response body, with every line terminated by {@code "\n"}
	 * @throws IOException if the URL is malformed or the request fails
	 */
	public static String readUrlContentByProxy(String url, String proxyHost, String proxyPort, String userAgent, boolean bForceUtf) throws IOException
	{
		// Configure the proxy (JVM-wide system properties).
		if (proxyHost != null && proxyPort != null) {
			System.getProperties().put( "proxySet", "true" );
			System.getProperties().put( "proxyHost", proxyHost );
			System.getProperties().put( "proxyPort", proxyPort );
		}
		else {
			System.getProperties().put( "proxySet", "false" );
		}

		StringBuffer result = new StringBuffer();
		BufferedReader in = null;

		try {
			HttpURLConnection conn = openConnection(url);
			System.out.println("user agent --> " + userAgent);

			conn.setRequestProperty("User-Agent", userAgent);
			conn.setRequestProperty("connection", "Keep-Alive");
			conn.setRequestProperty("host", getHost(url));
			conn.setRequestProperty("accept", "text/html, application/xhtml+xml; profile=http://www.wapforum.org/xhtml, application/vnd.wap.xhtml+xml, text/vnd.wap.wml, application/vnd.wap.wmlc, application/vnd.wap.wbxml, application/vnd.wap.wmlscriptc, */*");
			conn.setRequestProperty("accept-charset", "utf-8, utf-16");
			conn.setRequestProperty("accept-language", "English, Chinese");
			conn.setRequestProperty("x-wap-profile", "http://gsm.lge.com/html/gsm/LG-U8138.xml");
			conn.setRequestProperty("via","(infoX WAP Gateway), HTTP/1.1, Huawei Technologies");
			conn.setRequestProperty("x-up-calling-line-id", "13910169234");
			conn.setRequestProperty("x-huawei-authmethod", "MSISDN");
			conn.setRequestProperty("x-forwarded-for", "10.77.15.185");
			conn.setRequestProperty("x-up-bear-type", "GPRS");
			conn.setRequestProperty("x-huawei-stacktype", "WAP2.0");
			conn.setRequestProperty("x-huawei-networktype", "GSM");
			conn.setRequestProperty("x-huawei-apn", "cmwap");
			conn.setRequestProperty("x-huawei-nasip", "211.137.197.73");

			String contentType = conn.getContentType();
			_content_type = contentType;

			// Pick the decoder from the caller's flag or the response Content-Type.
			boolean decodeAsUtf = bForceUtf
				|| (contentType != null && contentType.toLowerCase().indexOf(UTF_CHARSET) >= 0);
			in = openReader(conn, decodeAsUtf);

			readLines(in, result);
		}
		finally {
			if (in != null) {
				in.close();
			}
		}
		return result.toString();
	}

	/**
	 * Extracts the host name from a URL string.
	 *
	 * @param urlName the URL to parse; may be {@code null}
	 * @return the host part of the URL, or {@code null} if {@code urlName} is
	 *         {@code null} or not a well-formed URL
	 */
	public static String getHost(String urlName)
	{
		if (urlName == null) {
			return null;
		}
		try {
			return new URL(urlName).getHost();
		}
		catch (MalformedURLException notAUrl) {
			// An unparsable URL has no host; callers get null by contract.
			return null;
		}
	}

	/**
	 * Manual smoke test: fetches a fixed URL through a fixed proxy and prints
	 * the result.
	 */
	public static void main(String[] args)
	{
		String proxyHost = "10.0.0.172";
		String proxyPort = "80";
		String url = "http://cota.cn/video";

		try {
			String content = readUrlContentByProxy(url, proxyHost, proxyPort, false);
			System.out.println(content);
		} catch (Exception e){
			e.printStackTrace();
		}
	}

	/**
	 * Returns the Content-Type recorded by the last readUrlContentByProxy
	 * call (or set via {@link #set_content_type(String)}).
	 *
	 * @return the stored Content-Type; initially the empty string, and
	 *         possibly {@code null} if the last response had none
	 */
	public static String get_content_type() {
		return _content_type;
	}

	/**
	 * Overrides the stored Content-Type.
	 *
	 * @param _content_type the value to store
	 */
	public static void set_content_type(String _content_type) {
		URLReader._content_type = _content_type;
	}

	/** Opens (without connecting) an HTTP connection to the given URL. */
	private static HttpURLConnection openConnection(String url) throws IOException {
		URL requestedUrl = new URL(url);
		return (HttpURLConnection) requestedUrl.openConnection();
	}

	/** Wraps the response body in a reader decoding UTF-8 or the platform default charset. */
	private static BufferedReader openReader(HttpURLConnection conn, boolean decodeAsUtf) throws IOException {
		InputStream body = conn.getInputStream();
		InputStreamReader decoder = decodeAsUtf
			? new InputStreamReader(body, UTF_CHARSET)
			: new InputStreamReader(body);
		return new BufferedReader(decoder);
	}

	/** Reads every remaining line, appending each plus "\n" to sink (lines are discarded when sink is null). */
	private static void readLines(BufferedReader in, StringBuffer sink) throws IOException {
		String inputLine;
		while ( (inputLine = in.readLine()) != null) {
			if (sink != null) {
				sink.append(inputLine);
				sink.append("\n");
			}
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -