📄 crawlerweb.java

📁 一个用JAVA编写的小小爬虫,在做实验的时候觉得挺好的,拿来大家分享下,看看没什么损失的~
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
1 2 3 4 下一页
//package Crawlerweb;import java.io.*;import java.net.*;import java.util.*;import java.lang.*;import Crawlerweb.TQueue;import Crawlerweb.ThreeStrings;import Crawlerweb.TwoStrings;import javax.swing.*;import java.awt.*;import java.awt.event.*;import java.awt.event.ActionListener;import java.awt.event.ActionEvent;/*A filter class for FileChooser that shows only directories in file chooser. */class DirectoryFilter extends javax.swing.filechooser.FileFilter {	public boolean accept(File f) {		return f.isDirectory();	}	public String getDescription() {		return "Directory";	}}/*Main class. */public class Crawlerweb extends JDialog {    /*******************************************************************************************     CLASS FIELDS WHICH ARE (MOSTLY) SET AT THE BEGINNING AND ARE USEFUL DURING WHOLE     PROCESS OF RETRIEVING A WEB SITE.     *******************************************************************************************/    private  String DOMAIN = "";	/* Always contains current domain (if an URL was http://www.yahoo.com/mail, this field will be http://www.yahoo.com */    private  String STARTING_URL = "";	/* An URL user entered at the beginning */    private  int DEPTH = 0;	/* How much links to put on queue */	private  boolean INFINITE = false;	/* If set to true, program will put every link on queue (but not if on foreign domain) */    private  boolean FOREIGN_DOMAIN_ALLOWED = false;	/* If set to true, program will put on queue links that point to domains other than STARTING_URL */    private  String ROOT_DIR_PATH = "";	/* Absolute path to root directory, where we store web site */	private  String USER_DIR = "";	/* Absolute path to directory user chose from GUI. */	private  File ROOT_DIR = null;	/* Instance of File class, where program records data. 
Instantiated with ROOT_DIR = new File(ROOT_DIR_PATH) */	private  boolean IN_SCRIPT = false;	/* Set to true if program parses a HTML code between <SCRIPT> and </SCRIPT> tags.*/	/* Search for links is much heavier when IN_SCRIPT == true. */	private  HashSet VALID_CHARS = null;	/* Hash set of Character objects containing characters that can be used to build a valid URL address. */	private  HashSet TRIED_FILES = null;	/* Hash set containing absolute URL addresses of files that are downloaded or will be downloaded. */	private  boolean VERBOSE = true;	/* If set to true, prints some debugging information to output.log. */	private  boolean SILENT = true;	/* If set to false, program will ask user what to do with every link (Download, Abort, Skip). */	private  boolean LOGGING = false;	/* If set to true, program will log output to output.log that is shown on screen. */    private final static String APP_NAME = "Crawlerweb";    private final static String APP_VERSION = "v0.2";	/***********************************************************************************************************/	/* Fields used in GUI. 
*/    private JSpinner depthSpinner = null;    private JCheckBox foreignCheckBox = null;    private JCheckBox infCheckBox = null;	  private JCheckBox silentCheckBox = null;    private JLabel numLabel = null;    private JLabel dirLabel = null;	  private JTextArea output = null;	  private JLabel outputLabel = null;	  private JCheckBox logCheckBox = null;    private JTextField dirText = null;    private JLabel urlLabel = null;    private JTextField urlText = null;	  private JButton directory_button = null;    private JButton main_button = null;	/***********************************************************************************************************/	private FileWriter err_output;	private BufferedWriter err_out;	/* err_out will be instantiated to write to output.log */    private boolean EOF = false;	/* EOF will be set to true if method getTag reached */    private TQueue queue = new TQueue();	/* A main queue on which we put URLs and some other useful information about URLs.*/    /*******************************************************************************************/    /*    Reads input file and makes a first string made of all chars until "open" char,	and a second string made of all char between "open" and "close" char.         */    private TwoStrings getTag(char open, char close, InputStreamReader in) throws IOException {		    String temp_str = "";        String pre_tag = "";        int temp_char = 0;        TwoStrings ts = null;        boolean newline = false;        boolean open_tag = false, close_tag = false;		        do {			temp_char = in.read();			if ( ((char) temp_char == '\r') | ((char) temp_char == '\n') ) {				newline = true; 				/*				we stop making string when we find a newline character, 				because there are web pages with a lot of plain text and no tags,				and processing pages like that causes java buffers to overflow.				 
*/				pre_tag = pre_tag + (char) temp_char;			}			else if (temp_char == -1)				EOF = true;			else if ( (char) temp_char == open )				open_tag = true;			else				pre_tag = pre_tag + (char) temp_char;        } while (!EOF & !newline & !open_tag);        if (!EOF & !newline & open_tag) {                        temp_str = temp_str + (char) temp_char; // open char goes to temp_str           			do {				temp_char = in.read();				if (temp_char == -1)					EOF = true;				else if ((char) temp_char == close) {					close_tag = true;					temp_str = temp_str + (char) temp_char;				}				else {					temp_str = temp_str + (char) temp_char;				}               			}  while (!EOF & !close_tag);        }        ts = new TwoStrings(pre_tag, temp_str);		return ts;    }	/*	Returns true if string s contains string p, false otherwise.	 */	private boolean StringContains(String s, String p) throws IOException {		int i;		i = QuickSearch(s, p);				if (i != -1)			return true;		else			return false;	}	/*	Returns a string found between quotes (double or single) starting from string offset.	 */    private String whatsInsideQuotes(String s, int offset) throws IOException {        String temp_str = "";        int i = offset;        char temp_char = 'z';		boolean single_quotes, ok = false;		// There can be few characters between "href" and quotes, for example: href = "wapgoin" we have " = " between href and quotes...		while ((i <= s.length() - 1) & ((s.charAt(i) == ' ') | (s.charAt(i) == '=') | (s.charAt(i) == '\n') | (s.charAt(i) == '\r')))			i++;		if (i <= s.length()-1) {			temp_char = s.charAt(i);			if ((temp_char != '\"') & (temp_char != '\''))				// if first character found (other than those upthere) is not a quote, return empty string.				return "";			if (i <= s.length() - 1) {				if (temp_char == '\'')					single_quotes = true;				else					single_quotes = false;				i++; // We are not interested in first quote anymore, so move on to next char...				
temp_char = s.charAt(i);				if (single_quotes) {					// make a new string made of all characters found until other single quote					while ((i <= s.length() - 1) & (temp_char != '\'') ) {												temp_char = s.charAt(i);						if ( (temp_char != '\r') & (temp_char != '\n') ) {							temp_str = temp_str + temp_char;						}            						i++;					}				}				else {					// make a new string made of all characters found until other double quote					while ((i <= s.length() - 1) & (temp_char != '\"') ) {												temp_char = s.charAt(i);						if ( (temp_char != '\r') & (temp_char != '\n') ) {							temp_str = temp_str + temp_char;						}						i++;					}				}				if (temp_str.length() > 0)					temp_str = temp_str.substring(0, temp_str.length()-1);			}		}        return temp_str;    }    /*    Returns position of first instance of string p in string s. Returns -1 if p not found.     */    private int QuickSearch(String s, String p) throws IOException {        int[] shift = new int[65536];        char ch;        int i, offset, test, position, end;		// We want this method to search ignoring case so...		
s = s.toLowerCase();		p = p.toLowerCase();        if (s.length() < p.length()) {            return -1;        }        else if (p.length() == 1) {            position = 0;            ch = p.charAt(0);            while ( (position <= s.length()-1) && (s.charAt(position) != ch) ) {                position++;            }            if (position > s.length()-1)                return -1;            else                return position;        }        else if (s.length() == p.length()) {            if (s.equalsIgnoreCase(p))                return 0;            else                return -1;        }        else {            offset = p.length();            for (int j=0; j<= 65535; j++) {                shift[j] = offset;            }            for (int k = 0; k <= p.length()-1; k++) {                shift[(int) p.charAt(k)] = offset - k;            }                        position = 0;            test = 0;            end = s.length() - p.length();            do {                if (p.charAt(test) == s.charAt(position + test)) {                    test++;                }                else {					if (position+offset == s.length()) // temporary fix for bug "if p.length() == s.length()/2 then crash"                        return -1;                    position = position + shift[(int) s.charAt(position + offset)];                    test = 0;                }            } while ((test <= p.length()-1) & (position <= end));            if (test > p.length()-1) {                return position;            }            else {                return -1;            }        }    }    /*    Returns domain URL in complete form (http://www.yahoo.com instead of www.yahoo.com)     */    private String getDomain(String s) throws IOException {		int i = QuickSearch(s, "://");        if (i != -1) { // URL begins with ftp:// or http:// or file:// or similar			i += 3;			do {				i++;			} while ((i <= s.length() - 1) && (s.charAt(i) != '/'));			if (i <= s.length() - 1)				s = s.substring(0, 
i);        }        else {			i = 0;			do {				i++;			} while ((i <= s.length() - 1) && (s.charAt(i) != '/'));			if (i <= s.length() - 1)				s = s.substring(0, i);            s = "http://" + s;        }		return s;    }	/* 	Counts slashes in string :)	 */	private int SlashCount(String s) throws IOException {		int i = 0, count = 0;		while (i <= s.length() - 1) {			if (s.charAt(i) == '/')				count++;			i++;		}		return count;	}	/*	Tries to determine whether given URL points to directory or not.	TAKES FULL URL AS AN ARGUMENT!!! URL'S WITHOUT HTTP:// OR SIMILAR WILL NOT WORK!!!	 */	private boolean IsDirectory(String s) throws IOException {		int i;		int dots = 0;		i = SlashCount(s);		if (i < 3)			return true;		else {			if (s.charAt(s.length() - 1) == '/')				return true;			else {				i = s.length() - 1;				while (s.charAt(i) != '/') {					if (s.charAt(i) == '.')						dots++;					i--;				}				if (dots >= 2)					return true;				else if (dots == 0)					return true;				else					return false;			}		}	}	/*	Makes root directory name from absolute URL.	Relative URL's will NOT work!	 */    private String mkRootDirName(String s) throws IOException  {		int i = QuickSearch(s, "://");        if (i != -1) {            s = s.substring(i + 3);			s = s.replace('/', File.separatorChar);            return s;        }        else            return "";    }	/*	Searches for regular anchor within HTML tag...	 
*/    private String getAnchor(String tag) throws IOException {		int i = 1;		boolean ok = true;		String temp = "";        tag = tag.substring(1, tag.length()-1);        if ((i = QuickSearch(tag, "href")) != -1)            temp = whatsInsideQuotes(tag, i+4); // i+4 because "href".length() == 4        else if ((i = QuickSearch(tag, "src")) != -1)            temp = whatsInsideQuotes(tag, i+3);		else if ((i = QuickSearch(tag, "background")) != -1)            temp = whatsInsideQuotes(tag, i+10);		return temp;    }	/*	Determines type of URL (absolute, relative, mailto)	 */    private String typeOfURL(String URL) throws IOException {		if (URL.startsWith(".") | URL.startsWith("..") | URL.startsWith("/"))			return "relative";		else if (URL.startsWith("http") | URL.startsWith("ftp"))			return "absolute";		else if (URL.startsWith("mailto:"))			return "mail";		else			return "relative";    }	/*	Returns file name from given URL, empty string if URL contains no file name	 */    private String getFileName(String URLAddress) throws IOException {        String temp = "";        int i = URLAddress.length();        if (i != 0) {            i--;            while ( (i > 0) && (URLAddress.charAt(i) != '/') ) {                i--;            }            if (i != 0) {                i++;                while (i <= URLAddress.length() - 1) {                    temp = temp + URLAddress.charAt(i);                    i++;                }            }        }		return temp;    }	/*	Removes file name from URL (useful when creating directories)	 */    private String remFileName(String URLAddress) throws IOException {        String temp = "";        int i = URLAddress.length() - 1;        int j = 0;		boolean ok = true;		if (VERBOSE) {			err_out.write("remFileName: URLAddress: " + URLAddress);			err_out.newLine();			err_out.flush();		}        if (i > 0) {            while ( (i >= 0) & ok ) {				if (URLAddress.charAt(i) == '/')					ok = false;                i--;			}
1 2 3 4 下一页
💿 文件大小 13 K
👤 上传用户 my
📂 所属分类 Java编程
🏷️ 相关标签

#JAVA #编写 #实验 #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -