⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 webspider.java

📁 一个抓取程序
💻 JAVA
字号:
package com.phpcoo.utils;

import java.net.*; 
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*; 

import com.phpcoo.po.HttpHeaderPo;
/**
 * Minimal HTTP page scraper built directly on a {@link Socket}.
 *
 * <p>The request line, the request headers, the target server and the two
 * markers that delimit the interesting region of the page all come from a
 * {@link HttpHeaderPo}. The page is downloaded once per configuration and the
 * text is cached, so several extraction methods can be called on the same
 * instance without re-using an already shut-down socket.
 */
public class WebSpider
{
	/** Platform line separator; appended after every response line that is read back. */
	private static final String lineTerminator = System.getProperty("line.separator");

	/** Line terminator written after every request line (HTTP requires CRLF, not the platform separator). */
	private static final String CRLF = "\r\n";

	/** Matches a complete anchor element whose content contains no nested tag. */
	private static final Pattern ANCHOR_ELEMENT =
			Pattern.compile("<[a]\\s+?[^>]*?>[^<]+?</[a]\\s*?>", Pattern.CASE_INSENSITIVE);

	/** Matches an opening or closing anchor tag; used to strip the tags and keep only the link text. */
	private static final Pattern ANCHOR_TAG =
			Pattern.compile("</?\\s*?[a][\\s\\S]*?>", Pattern.CASE_INSENSITIVE);

	/**
	 * Matches {@code '>'}, an optionally signed number with an optional
	 * fraction, exactly one further character, then {@code '<'}.
	 * Because of that extra character a lone single-digit integer such as
	 * {@code >5<} is not matched.
	 * NOTE(review): the unescaped '.' may be unintended - confirm before changing it.
	 */
	private static final Pattern FUND_VALUE = Pattern.compile(">((-?\\d+)(\\.\\d+)?).<");

	/** Request configuration: server address, request line, headers and area markers. */
	private HttpHeaderPo https;

	/** Connection that has been opened but not yet used for a request; null when there is none. */
	private Socket socket;

	/** Text of the last downloaded response; null until the first successful download. */
	private String pageContent;

	/**
	 * Creates a spider without configuration. {@link #setHttps(HttpHeaderPo)}
	 * must be called before any extraction method; the connection is then
	 * opened on first use.
	 */
	public WebSpider()
	{
	}

	/**
	 * Creates a spider and immediately connects to the configured server.
	 *
	 * @param http request configuration; supplies the server IP and port
	 * @throws UnknownHostException if the server address cannot be resolved
	 * @throws IOException if the connection cannot be established
	 */
	public WebSpider(HttpHeaderPo http) throws UnknownHostException, IOException
	{
		https = http;
		// The Socket constructor either returns a connected socket or throws,
		// so reaching the next line means the connection succeeded.
		socket = new Socket(https.getServerIp(), https.getServerPort());
		System.out.println("连接成功");
	}

	/**
	 * Downloads the configured page and returns everything the server sent
	 * back, one response line per {@code line.separator}-terminated line.
	 *
	 * <p>The result is cached: only the first call after construction or after
	 * {@link #setHttps(HttpHeaderPo)} touches the network. The response is
	 * decoded with the platform default charset and is not decompressed.
	 *
	 * @return the response text
	 * @throws IOException if connecting, sending or receiving fails
	 * @throws IllegalStateException if no configuration has been set
	 */
	private String getContent() throws IOException
	{
		if (pageContent != null)
		{
			return pageContent;
		}
		if (https == null)
		{
			throw new IllegalStateException("No HttpHeaderPo configured; call setHttps() first");
		}

		// Use the connection opened by the constructor if it is still unused,
		// otherwise open a new one. Either way it is consumed by this request.
		Socket connection = socket;
		socket = null;
		if (connection == null)
		{
			connection = new Socket(https.getServerIp(), https.getServerPort());
		}

		try
		{
			// A plain Writer is used instead of PrintWriter so that write
			// failures surface as IOException instead of being swallowed.
			Writer writer = new OutputStreamWriter(connection.getOutputStream());
			writer.write(buildRequest());
			writer.flush();
			// Half-close: tells the server no further request data will follow.
			connection.shutdownOutput();

			BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
			StringBuilder content = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null)
			{
				content.append(line).append(lineTerminator);
			}
			pageContent = content.toString();
			return pageContent;
		}
		finally
		{
			closeQuietly(connection);
		}
	}

	/**
	 * Assembles the request text: the request line and every header from the
	 * configuration, each terminated by CRLF, followed by the empty line that
	 * ends the header section.
	 *
	 * @return the complete request text
	 */
	private String buildRequest()
	{
		StringBuilder request = new StringBuilder();
		appendRequestLine(request, https.getHttpGET());
		appendRequestLine(request, https.getHttpAccept());
		appendRequestLine(request, https.getHttpAcceptLanguage());
		appendRequestLine(request, https.getHttpUaCpu());
		appendRequestLine(request, https.getHttpAcceptEncoding());
		appendRequestLine(request, https.getHttpUserAgent());
		appendRequestLine(request, https.getHttpHost());
		appendRequestLine(request, https.getHttpConnection());
		request.append(CRLF);
		return request.toString();
	}

	/**
	 * Appends one request line followed by CRLF. Trailing CR/LF characters
	 * already present in the value are dropped first, so a caller-supplied
	 * terminator does not produce a premature blank line; null and empty
	 * values are skipped for the same reason.
	 *
	 * @param request buffer receiving the line
	 * @param value   request line or header; typed as Object because the getter
	 *                return types are declared by HttpHeaderPo, outside this file
	 */
	private static void appendRequestLine(StringBuilder request, Object value)
	{
		if (value == null)
		{
			return;
		}
		String text = value.toString();
		int end = text.length();
		while (end > 0 && (text.charAt(end - 1) == '\r' || text.charAt(end - 1) == '\n'))
		{
			end--;
		}
		if (end > 0)
		{
			request.append(text, 0, end).append(CRLF);
		}
	}

	/**
	 * Closes a connection, ignoring any failure.
	 *
	 * @param connection the socket to close
	 */
	private static void closeQuietly(Socket connection)
	{
		try
		{
			connection.close();
		}
		catch (IOException ignored)
		{
			// Nothing useful can be done here: the response has either been
			// read completely or a more relevant exception is already propagating.
		}
	}

	/**
	 * Returns the part of the downloaded page that lies between the last
	 * occurrence of the configured start marker and the last occurrence of
	 * the configured end marker (both markers excluded).
	 *
	 * @return the text between the two markers
	 * @throws IOException if the download fails, if either marker is missing,
	 *                     or if the end marker lies before the end of the start marker
	 */
	private String getAreaContent() throws IOException
	{
		String page = getContent();
		String startTag = https.getStartTag();
		String endTag = https.getEndTag();

		int startTagAt = page.lastIndexOf(startTag);
		int end = page.lastIndexOf(endTag);
		if (startTagAt < 0 || end < 0)
		{
			throw new IOException("Area marker not found in page: "
					+ (startTagAt < 0 ? "start marker '" + startTag + "'" : "end marker '" + endTag + "'"));
		}

		int start = startTagAt + startTag.length();
		if (start > end)
		{
			throw new IOException("End marker '" + endTag + "' precedes the end of start marker '" + startTag + "'");
		}
		return page.substring(start, end);
	}

	/**
	 * Collects the link text of every anchor element inside the configured
	 * page area. Only anchors whose content contains no nested tag are found.
	 *
	 * @return the link texts in document order; empty if there are none
	 * @throws IOException if the page cannot be downloaded or the area markers are missing
	 */
	public ArrayList<String> getHrefTitle() throws IOException
	{
		String area = getAreaContent();
		ArrayList<String> titles = new ArrayList<String>();
		Matcher anchors = ANCHOR_ELEMENT.matcher(area);
		while (anchors.find())
		{
			// Strip the opening and closing tag, leaving only the link text.
			titles.add(ANCHOR_TAG.matcher(anchors.group()).replaceAll(""));
		}
		return titles;
	}

	/**
	 * Collects the numeric values found between a {@code '>'} and a
	 * {@code '<'} inside the configured page area. Each returned string is the
	 * matched text without the surrounding brackets, so it includes the one
	 * extra character the pattern requires after the number (for example
	 * {@code "1.23%"} for {@code >1.23%<}).
	 *
	 * @return the matched values in document order; empty if there are none
	 * @throws IOException if the page cannot be downloaded or the area markers are missing
	 */
	public ArrayList<String> getFundValue() throws IOException
	{
		String area = getAreaContent();
		ArrayList<String> values = new ArrayList<String>();
		Matcher numbers = FUND_VALUE.matcher(area);
		while (numbers.find())
		{
			// Drop the leading '>' and the trailing '<' of the match.
			values.add(area.substring(numbers.start() + 1, numbers.end() - 1));
		}
		return values;
	}

	/**
	 * Demo entry point: fetches a page from a local server and prints how many
	 * link titles and how many numeric values were found in the marked area.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		HttpHeaderPo hh = new HttpHeaderPo();
		hh.setServerIp("127.0.0.1");
		hh.setServerPort(8090);
		hh.setHttpGET("GET /WebSpider/index.htm HTTP/1.1");
		hh.setHttpAccept("Accept: */*");
		// NOTE(review): the response is read as plain text and never
		// decompressed - confirm the server does not actually answer with gzip.
		hh.setHttpAcceptEncoding("Accept-Encoding: gzip, deflate");
		hh.setHttpUaCpu("UA-CPU: x86");
		hh.setHttpAcceptLanguage("Accept-Language: zh-cn");
		hh.setHttpUserAgent("User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
		hh.setHttpHost("Host: localhost:8090");
		// NOTE(review): the response is read until the server closes the
		// connection; with Keep-Alive that presumably relies on the server
		// reacting to the half-close - confirm against the target server.
		hh.setHttpConnection("Connection: Keep-Alive\r\n");
		hh.setStartTag("align=right>日涨跌");
		hh.setEndTag("height=-126>&nbsp;</TD></TR></TBODY></TABLE>");
		
		try {
			WebSpider ws = new WebSpider(hh);
			System.out.println(ws.getHrefTitle().size());
			System.out.println(ws.getFundValue().size());
		} catch (UnknownHostException e) {
			// Server address could not be resolved.
			e.printStackTrace();
		} catch (IOException e) {
			// Connection, transfer or marker lookup failed.
			e.printStackTrace();
		}
	}

	/**
	 * Replaces the request configuration and discards the cached page, so the
	 * next extraction downloads again using the new configuration. A
	 * connection that was opened by the constructor and not yet used is kept
	 * and used for that next download.
	 *
	 * @param https the new request configuration
	 */
	public void setHttps(HttpHeaderPo https) {
		this.https = https;
		this.pageContent = null;
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -