⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 test.java

📁 用JAVA写的简单爬虫
💻 JAVA
字号:
package test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.ConnectException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Scratch class with three independent experiments:
 * <ul>
 *   <li>{@link #getEncoding(String)} - find the charset declared in the
 *       {@code <meta>} tags of an HTML document;</li>
 *   <li>{@link #showUrl(String)} - download a page over HTTP and print it decoded
 *       with several charsets;</li>
 *   <li>{@link #go()} - a two-thread demo contrasting a method that is
 *       {@code synchronized} on the instance with one that is not.</li>
 * </ul>
 */
public class Test {
    /** Regular expression whose group 1 is the charset name following {@code charset=}. */
    static String regex = "charset=\\s*?\"?\'?\\s*?([a-z][_\\-0-9a-z]*)\\s*?\"?\'?";

    /** Compiled, case-insensitive form of {@link #regex}. */
    static Pattern patten = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);

    /**
     * Returns the charset declared by the first {@code <meta>} tag of {@code content}
     * that carries a {@code charset=...} declaration.
     *
     * <p>The match is case-insensitive, so {@code CHARSET=GB2312} is recognised as well.
     * Meta tags without a usable declaration are skipped instead of ending the search.
     *
     * @param content HTML text to inspect; may be {@code null} or empty
     * @return the declared charset name, or an empty string when none is found or the
     *         document cannot be parsed (the failure is printed to standard error)
     */
    public static String getEncoding(String content) {
        if (content == null || content.length() == 0) {
            return "";
        }
        try {
            Parser parser = Parser.createParser(content, "ISO-8859-1");
            NodeList metaTags = parser.parse(new NodeClassFilter(MetaTag.class));
            Node[] nodes = metaTags.toNodeArray();

            for (int i = 0; i < nodes.length; i++) {
                if (!(nodes[i] instanceof MetaTag)) {
                    continue;
                }
                String declared = lastCharsetIn(((MetaTag) nodes[i]).getText());
                if (declared.length() > 0) {
                    return declared;
                }
            }
        } catch (Exception e) {
            // Deliberately broad: this method never throws and reports "" on any failure.
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Returns group 1 of the last {@link #patten} match in {@code text}.
     *
     * @param text tag text to scan; may be {@code null}
     * @return the captured charset name, or an empty string when nothing matches
     */
    private static String lastCharsetIn(String text) {
        String found = "";
        if (text != null) {
            Matcher matcher = patten.matcher(text);
            while (matcher.find()) {
                found = matcher.group(1);
            }
        }
        return found;
    }

    /**
     * Downloads {@code s} with an HTTP GET and prints, in order: the body decoded as
     * ISO-8859-1, the body decoded as GBK, the final URL of the connection, the body
     * decoded as UTF-8, the {@code Content-Type} header, and the charset taken from
     * that header.
     *
     * <p>The body is read from the network exactly once. ISO-8859-1 maps every byte to
     * one character, so the other decodings are obtained by converting that text back
     * to bytes. Line terminators are dropped while reading, so the returned text is a
     * concatenation of the lines.
     *
     * <p>Failures are reported as a short message on standard output, never thrown.
     *
     * @param s absolute URL to fetch; must use an HTTP(S) protocol
     * @return the body decoded as UTF-8, or an empty string when the URL is invalid,
     *         the response code is not 200, or an I/O error occurs
     */
    public static String showUrl(String s) {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(s);
            java.net.URLConnection opened = url.openConnection();
            if (!(opened instanceof HttpURLConnection)) {
                System.out.println("协议名错误");
                return "";
            }
            connection = (HttpURLConnection) opened;
            connection.setConnectTimeout(30000);
            connection.setReadTimeout(30000);
            connection.setAllowUserInteraction(false);
            // Per-connection setting; the static setter only changes the default used
            // by connections created afterwards.
            connection.setInstanceFollowRedirects(true);
            // Send an explicit User-Agent instead of the JDK default.
            connection.setRequestProperty("User-Agent", "mozlla/5.0");
            connection.setRequestProperty("Accept-Encoding", "gzip, deflate");

            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                System.out.println("URL连接出错");
                return "";
            }

            String latin1Body = readBody(connection);
            System.out.println(latin1Body);
            System.out.println(new String(latin1Body.getBytes("iso-8859-1"), "GBK"));

            System.out.println(connection.getURL());
            String utf8Body = new String(latin1Body.getBytes("iso-8859-1"), "utf-8");
            System.out.println(utf8Body);

            String contentType = connection.getContentType();
            System.out.println("contentType:" + contentType);
            System.out.println("encoding:" + charsetOf(contentType));
            return utf8Body;
        } catch (MalformedURLException e) {
            System.out.println("缺少协议名");
        } catch (ConnectException e) {
            System.out.println("连接错误");
        } catch (UnknownHostException e) {
            System.out.println("URL错误");
        } catch (SocketTimeoutException e) {
            System.out.println("连接超时");
        } catch (IOException e) {
            System.out.println("读取出错");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return "";
    }

    /**
     * Reads the whole response body of {@code connection}, undoing a {@code gzip} or
     * {@code deflate} content encoding, and returns it decoded as ISO-8859-1 with the
     * line terminators removed.
     *
     * @param connection connection whose response code has already been read
     * @return the body as ISO-8859-1 text, lines concatenated
     * @throws IOException if the stream cannot be opened, decompressed or read
     */
    private static String readBody(HttpURLConnection connection) throws IOException {
        InputStream in = connection.getInputStream();
        try {
            String contentEncoding = connection.getContentEncoding();
            if (contentEncoding != null) {
                if (contentEncoding.indexOf("gzip") != -1) {
                    in = new GZIPInputStream(in);
                } else if (contentEncoding.indexOf("deflate") != -1) {
                    in = new InflaterInputStream(in);
                }
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "iso-8859-1"));
            StringBuilder body = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                body.append(line);
            }
            return body.toString();
        } finally {
            // Closing the outermost stream also closes the connection's own stream.
            in.close();
        }
    }

    /**
     * Extracts the value of the {@code charset} parameter from a {@code Content-Type}
     * header value, e.g. {@code "text/html; charset=utf-8; x=y"} yields {@code "utf-8"}.
     * Surrounding whitespace and a pair of double quotes are removed.
     *
     * @param contentType header value; may be {@code null}
     * @return the charset value, or an empty string when the header is absent or has
     *         no {@code charset=} parameter
     */
    private static String charsetOf(String contentType) {
        if (contentType == null) {
            return "";
        }
        int start = contentType.toLowerCase().indexOf("charset");
        if (start == -1) {
            return "";
        }
        String parameter = contentType.substring(start);
        int end = parameter.indexOf(';');
        if (end == -1) {
            end = parameter.length();
        }
        int equals = parameter.indexOf('=');
        if (equals == -1 || equals > end) {
            return "";
        }
        String value = parameter.substring(equals + 1, end).trim();
        int last = value.length() - 1;
        if (last >= 1 && value.charAt(0) == '"' && value.charAt(last) == '"') {
            value = value.substring(1, last);
        }
        return value;
    }

    /**
     * List shared by the worker threads started from {@link #go()}. The methods of
     * this class only touch it while holding {@link #listLock}.
     */
    public List<String> list = new ArrayList<String>();

    /** Guards every access this class makes to {@link #list}. */
    private final Object listLock = new Object();

    /**
     * Prints {@code "come1:" + name} three times, half a second apart, then appends
     * {@code "a"} to {@link #list}.
     *
     * <p>The method is {@code synchronized} on this instance, so concurrent calls on
     * the same instance run one after another.
     *
     * @param name label to print and return
     * @param nn   unused
     * @return {@code name}
     */
    public synchronized String getT(String name, int nn) {
        printThreeTimes("come1:" + name);
        synchronized (listLock) {
            list.add("a");
        }
        return name;
    }

    /**
     * Prints {@code "come2:" + name} three times, half a second apart, then removes one
     * {@code "a"} from {@link #list} if present.
     *
     * <p>Unlike {@link #getT(String, int)} this method does not lock the instance, so
     * it can run while {@code getT} is executing; only the list update is guarded.
     *
     * @param name label to print and return
     * @param nn   unused
     * @return {@code name}
     */
    public String getT2(String name, int nn) {
        printThreeTimes("come2:" + name);
        synchronized (listLock) {
            list.remove("a");
        }
        return name;
    }

    /** Prints {@code message} three times, sleeping 500 ms before each print. */
    private static void printThreeTimes(String message) {
        for (int i = 0; i < 3; i++) {
            pause(500);
            System.out.println(message);
        }
    }

    /**
     * Sleeps for {@code millis} milliseconds. If the thread is interrupted, the sleep
     * ends early and the interrupt flag is restored for the caller.
     */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Worker that calls {@link Test#getT(String, int)} three times and prints each result. */
    class TestT implements Runnable {
        String name;
        int nn;

        public TestT(String s, int n) {
            name = s;
            nn = n;
        }

        public void run() {
            for (int i = 0; i < 3; i++) {
                System.out.println(getT(name, nn));
            }
        }
    }

    /** Worker that calls {@link Test#getT2(String, int)} three times and prints each result. */
    class TestT2 implements Runnable {
        String name;
        int nn;

        public TestT2(String s, int n) {
            name = s;
            nn = n;
        }

        public void run() {
            for (int i = 0; i < 3; i++) {
                System.out.println(getT2(name, nn));
            }
        }
    }

    /**
     * Runs the threading demo: starts a {@link TestT} worker, starts a {@link TestT2}
     * worker one second later, waits for both to finish, then prints what is left in
     * {@link #list}.
     */
    public void go() {
        Thread adder = new Thread(new TestT("AA", 1000));
        Thread remover = new Thread(new TestT2("BB", 100));

        adder.start();
        pause(1000);
        remover.start();

        // join() waits exactly as long as needed and makes the workers' writes visible.
        try {
            adder.join();
            remover.join();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }

        synchronized (listLock) {
            for (int i = 0; i < list.size(); i++) {
                System.out.println(list.get(i));
            }
        }
    }

    /**
     * Entry point: fetches one sample page with {@link #showUrl(String)}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        showUrl("http://bbs.5iout.com");
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -