📄 crawlerweb.java
字号:
//package Crawlerweb;import java.io.*;import java.net.*;import java.util.*;import java.lang.*;import Crawlerweb.TQueue;import Crawlerweb.ThreeStrings;import Crawlerweb.TwoStrings;import javax.swing.*;import java.awt.*;import java.awt.event.*;import java.awt.event.ActionListener;import java.awt.event.ActionEvent;/*A filter class for FileChooser that shows only directories in file chooser. */class DirectoryFilter extends javax.swing.filechooser.FileFilter { public boolean accept(File f) { return f.isDirectory(); } public String getDescription() { return "Directory"; }}/*Main class. */public class Crawlerweb extends JDialog { /******************************************************************************************* CLASS FIELDS WHICH ARE (MOSTLY) SET AT THE BEGINNING AND ARE USEFUL DURING WHOLE PROCESS OF RETRIEVING A WEB SITE. *******************************************************************************************/ private String DOMAIN = ""; /* Always contains current domain (if an URL was http://www.yahoo.com/mail, this field will be http://www.yahoo.com */ private String STARTING_URL = ""; /* An URL user entered at the beginning */ private int DEPTH = 0; /* How much links to put on queue */ private boolean INFINITE = false; /* If set to true, program will put every link on queue (but not if on foreign domain) */ private boolean FOREIGN_DOMAIN_ALLOWED = false; /* If set to true, program will put on queue links that point to domains other than STARTING_URL */ private String ROOT_DIR_PATH = ""; /* Absolute path to root directory, where we store web site */ private String USER_DIR = ""; /* Absolute path to directory user chose from GUI. */ private File ROOT_DIR = null; /* Instance of File class, where program records data. Instantiated with ROOT_DIR = new File(ROOT_DIR_PATH) */ private boolean IN_SCRIPT = false; /* Set to true if program parses a HTML code between <SCRIPT> and </SCRIPT> tags.*/ /* Search for links is much heavier when IN_SCRIPT == true. */ private HashSet VALID_CHARS = null; /* Hash set of Character objects containing characters that can be used to build a valid URL address. */ private HashSet TRIED_FILES = null; /* Hash set containing absolute URL addresses of files that are downloaded or will be downloaded. */ private boolean VERBOSE = true; /* If set to true, prints some debugging information to output.log. */ private boolean SILENT = true; /* If set to false, program will ask user what to do with every link (Download, Abort, Skip). */ private boolean LOGGING = false; /* If set to true, program will log output to output.log that is shown on screen. */ private final static String APP_NAME = "Crawlerweb"; private final static String APP_VERSION = "v0.2"; /***********************************************************************************************************/ /* Fields used in GUI. */ private JSpinner depthSpinner = null; private JCheckBox foreignCheckBox = null; private JCheckBox infCheckBox = null; private JCheckBox silentCheckBox = null; private JLabel numLabel = null; private JLabel dirLabel = null; private JTextArea output = null; private JLabel outputLabel = null; private JCheckBox logCheckBox = null; private JTextField dirText = null; private JLabel urlLabel = null; private JTextField urlText = null; private JButton directory_button = null; private JButton main_button = null; /***********************************************************************************************************/ private FileWriter err_output; private BufferedWriter err_out; /* err_out will be instantiated to write to output.log */ private boolean EOF = false; /* EOF will be set to true if method getTag reached */ private TQueue queue = new TQueue(); /* A main queue on which we put URLs and some other useful information about URLs.*/ /*******************************************************************************************/ /* Reads input file and makes a first string made of all chars until "open" char, and a second string made of all char between "open" and "close" char. */ private TwoStrings getTag(char open, char close, InputStreamReader in) throws IOException { String temp_str = ""; String pre_tag = ""; int temp_char = 0; TwoStrings ts = null; boolean newline = false; boolean open_tag = false, close_tag = false; do { temp_char = in.read(); if ( ((char) temp_char == '\r') | ((char) temp_char == '\n') ) { newline = true; /* we stop making string when we find a newline character, because there are web pages with a lot of plain text and no tags, and processing pages like that causes java buffers to overflow. */ pre_tag = pre_tag + (char) temp_char; } else if (temp_char == -1) EOF = true; else if ( (char) temp_char == open ) open_tag = true; else pre_tag = pre_tag + (char) temp_char; } while (!EOF & !newline & !open_tag); if (!EOF & !newline & open_tag) { temp_str = temp_str + (char) temp_char; // open char goes to temp_str do { temp_char = in.read(); if (temp_char == -1) EOF = true; else if ((char) temp_char == close) { close_tag = true; temp_str = temp_str + (char) temp_char; } else { temp_str = temp_str + (char) temp_char; } } while (!EOF & !close_tag); } ts = new TwoStrings(pre_tag, temp_str); return ts; } /* Returns true if string s contains string p, false otherwise. */ private boolean StringContains(String s, String p) throws IOException { int i; i = QuickSearch(s, p); if (i != -1) return true; else return false; } /* Returns a string found between quotes (double or single) starting from string offset. */ private String whatsInsideQuotes(String s, int offset) throws IOException { String temp_str = ""; int i = offset; char temp_char = 'z'; boolean single_quotes, ok = false; // There can be few characters between "href" and quotes, for example: href = "wapgoin" we have " = " between href and quotes... while ((i <= s.length() - 1) & ((s.charAt(i) == ' ') | (s.charAt(i) == '=') | (s.charAt(i) == '\n') | (s.charAt(i) == '\r'))) i++; if (i <= s.length()-1) { temp_char = s.charAt(i); if ((temp_char != '\"') & (temp_char != '\'')) // if first character found (other than those upthere) is not a quote, return empty string. return ""; if (i <= s.length() - 1) { if (temp_char == '\'') single_quotes = true; else single_quotes = false; i++; // We are not interested in first quote anymore, so move on to next char... temp_char = s.charAt(i); if (single_quotes) { // make a new string made of all characters found until other single quote while ((i <= s.length() - 1) & (temp_char != '\'') ) { temp_char = s.charAt(i); if ( (temp_char != '\r') & (temp_char != '\n') ) { temp_str = temp_str + temp_char; } i++; } } else { // make a new string made of all characters found until other double quote while ((i <= s.length() - 1) & (temp_char != '\"') ) { temp_char = s.charAt(i); if ( (temp_char != '\r') & (temp_char != '\n') ) { temp_str = temp_str + temp_char; } i++; } } if (temp_str.length() > 0) temp_str = temp_str.substring(0, temp_str.length()-1); } } return temp_str; } /* Returns position of first instance of string p in string s. Returns -1 if p not found. */ private int QuickSearch(String s, String p) throws IOException { int[] shift = new int[65536]; char ch; int i, offset, test, position, end; // We want this method to search ignoring case so... s = s.toLowerCase(); p = p.toLowerCase(); if (s.length() < p.length()) { return -1; } else if (p.length() == 1) { position = 0; ch = p.charAt(0); while ( (position <= s.length()-1) && (s.charAt(position) != ch) ) { position++; } if (position > s.length()-1) return -1; else return position; } else if (s.length() == p.length()) { if (s.equalsIgnoreCase(p)) return 0; else return -1; } else { offset = p.length(); for (int j=0; j<= 65535; j++) { shift[j] = offset; } for (int k = 0; k <= p.length()-1; k++) { shift[(int) p.charAt(k)] = offset - k; } position = 0; test = 0; end = s.length() - p.length(); do { if (p.charAt(test) == s.charAt(position + test)) { test++; } else { if (position+offset == s.length()) // temporary fix for bug "if p.length() == s.length()/2 then crash" return -1; position = position + shift[(int) s.charAt(position + offset)]; test = 0; } } while ((test <= p.length()-1) & (position <= end)); if (test > p.length()-1) { return position; } else { return -1; } } } /* Returns domain URL in complete form (http://www.yahoo.com instead of www.yahoo.com) */ private String getDomain(String s) throws IOException { int i = QuickSearch(s, "://"); if (i != -1) { // URL begins with ftp:// or http:// or file:// or similar i += 3; do { i++; } while ((i <= s.length() - 1) && (s.charAt(i) != '/')); if (i <= s.length() - 1) s = s.substring(0, i); } else { i = 0; do { i++; } while ((i <= s.length() - 1) && (s.charAt(i) != '/')); if (i <= s.length() - 1) s = s.substring(0, i); s = "http://" + s; } return s; } /* Counts slashes in string :) */ private int SlashCount(String s) throws IOException { int i = 0, count = 0; while (i <= s.length() - 1) { if (s.charAt(i) == '/') count++; i++; } return count; } /* Tries to determine whether given URL points to directory or not. TAKES FULL URL AS AN ARGUMENT!!! URL'S WITHOUT HTTP:// OR SIMILAR WILL NOT WORK!!! */ private boolean IsDirectory(String s) throws IOException { int i; int dots = 0; i = SlashCount(s); if (i < 3) return true; else { if (s.charAt(s.length() - 1) == '/') return true; else { i = s.length() - 1; while (s.charAt(i) != '/') { if (s.charAt(i) == '.') dots++; i--; } if (dots >= 2) return true; else if (dots == 0) return true; else return false; } } } /* Makes root directory name from absolute URL. Relative URL's will NOT work! */ private String mkRootDirName(String s) throws IOException { int i = QuickSearch(s, "://"); if (i != -1) { s = s.substring(i + 3); s = s.replace('/', File.separatorChar); return s; } else return ""; } /* Searches for regular anchor within HTML tag... */ private String getAnchor(String tag) throws IOException { int i = 1; boolean ok = true; String temp = ""; tag = tag.substring(1, tag.length()-1); if ((i = QuickSearch(tag, "href")) != -1) temp = whatsInsideQuotes(tag, i+4); // i+4 because "href".length() == 4 else if ((i = QuickSearch(tag, "src")) != -1) temp = whatsInsideQuotes(tag, i+3); else if ((i = QuickSearch(tag, "background")) != -1) temp = whatsInsideQuotes(tag, i+10); return temp; } /* Determines type of URL (absolute, relative, mailto) */ private String typeOfURL(String URL) throws IOException { if (URL.startsWith(".") | URL.startsWith("..") | URL.startsWith("/")) return "relative"; else if (URL.startsWith("http") | URL.startsWith("ftp")) return "absolute"; else if (URL.startsWith("mailto:")) return "mail"; else return "relative"; } /* Returns file name from given URL, empty string if URL contains no file name */ private String getFileName(String URLAddress) throws IOException { String temp = ""; int i = URLAddress.length(); if (i != 0) { i--; while ( (i > 0) && (URLAddress.charAt(i) != '/') ) { i--; } if (i != 0) { i++; while (i <= URLAddress.length() - 1) { temp = temp + URLAddress.charAt(i); i++; } } } return temp; } /* Removes file name from URL (useful when creating directories) */ private String remFileName(String URLAddress) throws IOException { String temp = ""; int i = URLAddress.length() - 1; int j = 0; boolean ok = true; if (VERBOSE) { err_out.write("remFileName: URLAddress: " + URLAddress); err_out.newLine(); err_out.flush(); } if (i > 0) { while ( (i >= 0) & ok ) { if (URLAddress.charAt(i) == '/') ok = false; i--; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -