📄 webspider.java
字号:
package com.phpcoo.utils;
import java.net.*;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
import com.phpcoo.po.HttpHeaderPo;
/**
 * Minimal socket-based page scraper.
 *
 * <p>Sends the request lines held by an {@link HttpHeaderPo} over a plain socket, reads the
 * whole response as text, cuts out the region between the configured start and end markers,
 * and extracts anchor titles or numeric values from that region.
 *
 * <p>The response is fetched once and cached, so {@link #getHrefTitle()} and
 * {@link #getFundValue()} can both be called on the same instance. The text returned by the
 * server is kept as received (status line and response headers included).
 *
 * <p>Instances are not designed for concurrent use.
 */
public class WebSpider implements Closeable
{
    /** Platform line separator used to re-join the response lines. */
    private static final String LINE_TERMINATOR = System.getProperty("line.separator");

    /** Line terminator required by HTTP for the request line and header lines. */
    private static final String CRLF = "\r\n";

    /** Matches a whole anchor element whose body contains no nested tag (case-insensitive). */
    private static final Pattern ANCHOR_ELEMENT =
            Pattern.compile("<[a]\\s+?[^>]*?>[^<]+?</[a]\\s*?>", Pattern.CASE_INSENSITIVE);

    /** Matches the opening or closing tag of an anchor (case-insensitive). */
    private static final Pattern ANCHOR_TAG =
            Pattern.compile("</?\\s*?[a][\\s\\S]*?>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches {@code >number?<} where {@code ?} is exactly one arbitrary character.
     * NOTE(review): the single '.' before '<' looks like it is meant to swallow a unit such as
     * '%'; as a consequence a lone one-digit value like {@code >5<} does not match. Kept
     * unchanged to preserve existing results -- confirm the intent before relaxing it.
     */
    private static final Pattern NUMBER_BETWEEN_TAGS =
            Pattern.compile(">((-?\\d+)(\\.\\d+)?).<");

    /** Request lines, connection target and start/end markers. */
    private HttpHeaderPo https;

    /** Connection to the server; opened by the constructor or lazily on first fetch. */
    private Socket socket;

    /** Raw response text, cached after the first successful fetch; {@code null} until then. */
    private String cachedContent;

    /**
     * Creates an unconfigured spider. Call {@link #setHttps(HttpHeaderPo)} before fetching;
     * the connection is then opened on first use.
     */
    public WebSpider()
    {
    }

    /**
     * Creates a spider and immediately connects to the server named in {@code http}.
     *
     * @param http request configuration (server address, port, header lines, markers)
     * @throws UnknownHostException if the server address cannot be resolved
     * @throws IOException if the connection cannot be established
     */
    public WebSpider(HttpHeaderPo http) throws UnknownHostException, IOException
    {
        https = http;
        // new Socket(...) either returns a connected socket or throws, so no null check is needed.
        socket = new Socket(https.getServerIp(), https.getServerPort());
        System.out.println("连接成功");
    }

    /**
     * Assembles the request text: every non-null configured line followed by CRLF, terminated
     * by exactly one blank line (callers that already appended CRLF to the last header do not
     * get a second blank line).
     */
    private String buildRequest()
    {
        String[] lines = {
            https.getHttpGET(),
            https.getHttpAccept(),
            https.getHttpAcceptLanguage(),
            https.getHttpUaCpu(),
            https.getHttpAcceptEncoding(),
            https.getHttpUserAgent(),
            https.getHttpHost(),
            https.getHttpConnection()
        };
        StringBuilder request = new StringBuilder();
        for (int i = 0; i < lines.length; i++)
        {
            if (lines[i] != null)
            {
                request.append(lines[i]).append(CRLF);
            }
        }
        if (!request.toString().endsWith(CRLF + CRLF))
        {
            request.append(CRLF);
        }
        return request.toString();
    }

    /**
     * Fetches the response text for the configured request, or returns the cached copy.
     *
     * <p>The socket is half-closed after the request is written, the response is read until
     * end of stream, and the socket is then closed. Text is decoded with the platform default
     * charset, as before.
     *
     * @return the response text with lines joined by the platform line separator
     * @throws IOException if connecting, writing or reading fails
     * @throws IllegalStateException if no configuration has been set
     */
    private String getContent() throws IOException
    {
        if (cachedContent != null)
        {
            return cachedContent;
        }
        if (https == null)
        {
            throw new IllegalStateException("No HttpHeaderPo configured; call setHttps(...) first");
        }
        if (socket == null || socket.isClosed())
        {
            socket = new Socket(https.getServerIp(), https.getServerPort());
        }
        try
        {
            Writer writer = new OutputStreamWriter(socket.getOutputStream());
            writer.write(buildRequest());
            writer.flush();
            // Signal end of request so the server can finish the response and close its side.
            socket.shutdownOutput();

            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(socket.getInputStream()));
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                page.append(line).append(LINE_TERMINATOR);
            }
            cachedContent = page.toString();
            return cachedContent;
        }
        finally
        {
            // The connection is single-use (output already shut down); always release it.
            socket.close();
        }
    }

    /**
     * Returns the part of the response between the last occurrence of the start marker and
     * the last occurrence of the end marker.
     *
     * @return the text between the markers, or an empty string when a marker is unset, not
     *         found, or the end marker lies before the end of the start marker
     * @throws IOException if the page cannot be fetched
     */
    private String getAreaContent() throws IOException
    {
        String page = getContent();
        String startTag = https.getStartTag();
        String endTag = https.getEndTag();
        if (startTag == null || endTag == null)
        {
            return "";
        }
        int startIndex = page.lastIndexOf(startTag);
        if (startIndex < 0)
        {
            return "";
        }
        int from = startIndex + startTag.length();
        int to = page.lastIndexOf(endTag);
        if (to < from)
        {
            return "";
        }
        return page.substring(from, to);
    }

    /**
     * Collects the text of every anchor element found inside the marked region.
     *
     * @return anchor texts in document order with the surrounding anchor tags removed;
     *         empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public ArrayList<String> getHrefTitle() throws IOException
    {
        String area = getAreaContent();
        ArrayList<String> titles = new ArrayList<String>();
        Matcher anchors = ANCHOR_ELEMENT.matcher(area);
        while (anchors.find())
        {
            titles.add(ANCHOR_TAG.matcher(anchors.group()).replaceAll(""));
        }
        return titles;
    }

    /**
     * Collects the numeric values that appear between {@code >} and {@code <} inside the
     * marked region.
     *
     * <p>Each entry is the matched text without the enclosing {@code >} and {@code <}; see
     * the note on the pattern constant about the one extra character it consumes.
     *
     * @return matched values in document order; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public ArrayList<String> getFundValue() throws IOException
    {
        String area = getAreaContent();
        ArrayList<String> values = new ArrayList<String>();
        Matcher numbers = NUMBER_BETWEEN_TAGS.matcher(area);
        while (numbers.find())
        {
            values.add(area.substring(numbers.start() + 1, numbers.end() - 1));
        }
        return values;
    }

    /**
     * Closes the underlying connection if one is open. Safe to call more than once.
     *
     * @throws IOException if closing the socket fails
     */
    public void close() throws IOException
    {
        if (socket != null && !socket.isClosed())
        {
            socket.close();
        }
    }

    /**
     * Demo entry point: fetches a page from a local server and prints how many anchor titles
     * and numeric values were found in the marked region.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpHeaderPo hh = new HttpHeaderPo();
        hh.setServerIp("127.0.0.1");
        hh.setServerPort(8090);
        hh.setHttpGET("GET /WebSpider/index.htm HTTP/1.1");
        hh.setHttpAccept("Accept: */*");
        hh.setHttpAcceptEncoding("Accept-Encoding: gzip, deflate");
        hh.setHttpUaCpu("UA-CPU: x86");
        hh.setHttpAcceptLanguage("Accept-Language: zh-cn");
        hh.setHttpUserAgent("User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
        hh.setHttpHost("Host: localhost:8090");
        hh.setHttpConnection("Connection: Keep-Alive\r\n");
        hh.setStartTag("align=right>日涨跌");
        hh.setEndTag("height=-126> </TD></TR></TBODY></TABLE>");
        WebSpider ws = null;
        try {
            ws = new WebSpider(hh);
            System.out.println(ws.getHrefTitle().size());
            System.out.println(ws.getFundValue().size());
        } catch (UnknownHostException e) {
            // Server address could not be resolved.
            e.printStackTrace();
        } catch (IOException e) {
            // Connecting, sending or reading failed.
            e.printStackTrace();
        } finally {
            if (ws != null) {
                try {
                    ws.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Replaces the request configuration and discards any cached page so the next query
     * fetches again with the new settings.
     *
     * @param https the new request configuration
     */
    public void setHttps(HttpHeaderPo https) {
        this.https = https;
        this.cachedContent = null;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -