📄 crawlerweb.java
字号:
}// while (!end_of_string) return strings; } /* This method determines whether string s is a filename or not. String s is considered a filename if: a) Contains only valid characters. b) Has at least one dot. If valid_url_check is set to true, this method only checks if s is a valid url. */ private boolean IsFile(String s, boolean valid_url_check) throws IOException { boolean ok = true; int i = 0; char temp_char; Character temp_Character = null; if ((s.length() == 0) | (s.length() == 1)) return false; while (ok & (i <= s.length() - 1)) { temp_char = s.charAt(i); temp_Character = new Character(temp_char); ok = VALID_CHARS.contains(temp_Character); i++; } if (!valid_url_check & ok) { i = s.length() - 1; while ((s.charAt(i) != '.') & (i > 0) & ok) { i--; if (i < s.length() - 10) { ok = false; } } if (s.charAt(i) == '.') return true; else return false; } else if (valid_url_check & ok) { return true; } else return false; } /* Takes an vector of strings found in quotes, and current directory as an argument. Iterates through vector, for every string taken out from vector checks if it's a file name, if it is a file name, makes an absolute address of that file and all other information needed, and pushes it to main queue. */ private void FilesInScripts(Vector strings, String curr_dir) throws IOException { int i = 0; String temp_str = "", type = "", tempwola = ""; Object temp = null; boolean ok = true, added; ThreeStrings threes = null; ok = !strings.isEmpty(); while (ok) { if (!strings.isEmpty()) { temp = strings.remove(0); if (temp != null) { temp_str = (String) temp; /* This one ensures that if we have more quotes encapsulated in this string, we go through all of them and search for filenames. */ if (HasMoreQuotes(temp_str)) FilesInScripts(EverythingInQuotes(temp_str), curr_dir); else { if (IsFile(temp_str, false)) { if (INFINITE) DEPTH = 1; tempwola = temp_str; if (ContainsLocalAnchor(tempwola)) tempwola = RemLocalAnchor(tempwola); type = typeOfURL(temp_str); if (type.equals("absolute")) { if (!TRIED_FILES.contains(tempwola) & (DEPTH > 0)) { added = TRIED_FILES.add(tempwola); if (sameDomain(temp_str)) { DEPTH--; threes = new ThreeStrings(tempwola , makeCurrDir(tempwola, curr_dir), DOMAIN); queue.Push(threes); } else { if (FOREIGN_DOMAIN_ALLOWED) { DEPTH--; threes = new ThreeStrings(tempwola , makeCurrDir(tempwola, "#"), getDomain(temp_str)); queue.Push(threes); } } } } else if (type.equals("relative")) { if (!TRIED_FILES.contains(makeAbsAddr(tempwola, curr_dir)) & (DEPTH > 0)) { DEPTH--; added = TRIED_FILES.add(makeAbsAddr(tempwola, curr_dir)); threes = new ThreeStrings(makeAbsAddr(tempwola, curr_dir), makeCurrDir(tempwola, curr_dir), DOMAIN); queue.Push(threes); } } } } } else ok = false; } else ok = false; } } private void outputPrintLn(String s) { output.setText(output.getText() + "\r\n" + s); } private void outputClearScreen() { output.setText(""); } /* This class will read files from given URL and store them to given location. If file is a HTML file, this class will parse tags when found, change href="" if needed, and then save HTML file. */ public void RWFactory(Crawlerweb app) throws IOException, MalformedURLException { ThreeStrings ts = null, temp_ts = null; TwoStrings get_tag_output = null; URL URLconn = null; String URLAddress = ""; String curr_dir = ""; String file_name = ""; String rel_addr = ""; String temp_tag = "", type = "", temp = "", tempwola = ""; // tempwola = tempWithOutLocalAnchor String first_string = ""; String file_path = ""; String temp_abs_addr = ""; String beginning_of_tag = ""; String ContentType = "", ResponseMessage = ""; String charset = ""; String s = null; int ContentLength = -1, ResponseCode = -1; BufferedInputStream ins = null; BufferedReader in = null; InputStreamReader insr = null; OutputStreamWriter outw = null; FileOutputStream filew = null; File file = null; FileOutputStream fileos = null; BufferedOutputStream outos = null; HttpURLConnection HTTPconn = null; ImageIcon icon = new ImageIcon("images/question.png"); int r, k; boolean urlOK, fileOK, binary = false, added, ok = false; while ((ts = (ThreeStrings) queue.Pop()) != null) { URLAddress = ts.getFirstString(); curr_dir = ts.getSecString(); DOMAIN = ts.getThirdString(); outputClearScreen(); outputPrintLn("----------------------------------------------------------------------"); outputPrintLn("Currently downloading: " + URLAddress); if (LOGGING) { err_out.write("----------------------------------------------------------------------"); err_out.newLine(); err_out.write("Currently downloading: " + URLAddress); err_out.newLine(); err_out.flush(); } urlOK = true; fileOK = true; try { URLconn = new URL(URLAddress); } catch (MalformedURLException e) { urlOK = false; } if (urlOK) { try { HTTPconn = (HttpURLConnection) URLconn.openConnection(); } catch (MalformedURLException e) { outputPrintLn(e.toString()); if (LOGGING) { err_out.write(e.toString()); err_out.newLine(); } continue; } catch (IOException e) { outputPrintLn(e.toString()); if (LOGGING) { err_out.write(e.toString()); err_out.newLine(); } continue; } try { ResponseCode = HTTPconn.getResponseCode(); } catch(UnknownHostException g) { outputPrintLn(g.toString()); if (LOGGING) { err_out.write(g.toString()); err_out.newLine(); } continue; } catch (ConnectException f) { outputPrintLn(f.toString()); if (LOGGING) { err_out.write(f.toString()); err_out.newLine(); } continue; } catch (MalformedURLException m) { outputPrintLn(m.toString()); if (LOGGING) { err_out.write(m.toString()); err_out.newLine(); } continue; } if (ResponseCode/100 == 4) fileOK = false; else { ResponseMessage = HTTPconn.getResponseMessage(); ContentLength = HTTPconn.getContentLength(); ContentType = HTTPconn.getContentType(); } } if (urlOK & fileOK) { outputPrintLn("Content type: " + ContentType + "."); if (LOGGING) { err_out.write("Content type: " + ContentType + "."); err_out.newLine(); } if (ContentLength != -1) { outputPrintLn("Content length: " + ContentLength + " bytes."); if (LOGGING) { err_out.write("Content length: " + ContentLength + " bytes."); err_out.newLine(); } } else { outputPrintLn("Content length: unknown."); if (LOGGING) { err_out.write("Content length: unknown."); err_out.newLine(); } } err_out.flush(); if (StringContains(ContentType, "text")) binary = false; else binary = true; if (!binary) { k = QuickSearch(ContentType, "charset="); if (k != -1) charset = ContentType.substring(k+8); else charset = ""; try { if (charset != "") insr = new InputStreamReader(HTTPconn.getInputStream(), charset); else insr = new InputStreamReader(HTTPconn.getInputStream()); in = new BufferedReader(insr); } catch (FileNotFoundException e) { outputPrintLn(e.toString()); continue; } if (ContainsVariables(URLAddress)) URLAddress = RemVariables(URLAddress); if (ContainsLocalAnchor(URLAddress)) URLAddress = RemLocalAnchor(URLAddress); file_name = getFileName(URLAddress); if (IsDirectory(URLAddress)) { if (URLAddress.endsWith("/")) file_name = "index.html"; else file_name = "/index.html"; } if (NoFileExtension(URLAddress) & StringContains(ContentType, "html")) { if (URLAddress.endsWith("/")) file_name = "index.html"; else file_name = "/index.html"; } else URLAddress = remFileName(URLAddress); if (!makeFilePath(URLAddress, curr_dir).endsWith("/")) file_path = makeFilePath(URLAddress, curr_dir) + File.separatorChar + file_name; else file_path = makeFilePath(URLAddress, curr_dir) + file_name; file = new File(file_path); outputPrintLn("Saving to: " + file.getAbsolutePath()); if (LOGGING) { err_out.write("Saving to: " + file.getAbsolutePath()); err_out.newLine(); err_out.flush(); } if (!SILENT) { do { Object[] possibilities = {"Download", "Skip", "Abort"}; s = (String) JOptionPane.showInputDialog( app, "What do you want\n" + "to do with this file?", "Customized Dialog", JOptionPane.PLAIN_MESSAGE, icon, possibilities, "Download"); if (s != null) { s = s.trim(); if (s.equals("Skip")) { insr.close(); continue; } else if (s.equals("Abort")) { insr.close(); return; } } } while (s == null); } try { ok = file.createNewFile(); } catch (IOException e) { outputPrintLn(e.toString()); continue; } if (ok & file.canWrite()) { filew = new FileOutputStream(file); if (charset != "") outw = new OutputStreamWriter(filew, charset); else outw = new OutputStreamWriter(filew); while (!EOF) { get_tag_output = getTag('<', '>', insr); //EOF is set/unset in getTag method first_string = get_tag_output.getFirstString(); if ( (first_string != "") ) { if (IN_SCRIPT) FilesInScripts(EverythingInQuotes(first_string), curr_dir); outw.write(first_string); // we will never change non-tag strings, so we can write 'em down } temp_tag = get_tag_output.getSecString(); if (temp_tag != "") { // if there is a tag... if (temp_tag.length() >= 7) { beginning_of_tag = temp_tag.substring(0, 7); beginning_of_tag = beginning_of_tag.toLowerCase(); if (temp_tag.startsWith(beginning_of_tag)) IN_SCRIPT = true; else if (temp_tag.equalsIgnoreCase("</script>")) IN_SCRIPT = false; } FilesInScripts(EverythingInQuotes(temp_tag), curr_dir); //I've enabled this because there can be an inline script or some unknown tag fields (other than href, src) temp = getAnchor(temp_tag); // see if there's some "href" or "src" in tag if ((temp != "") && IsFile(temp, true)) { // if there is... type = typeOfURL(temp); tempwola = temp; if (ContainsLocalAnchor(temp)) tempwola = RemLocalAnchor(temp); if (type.equals("absolute")) { if (sameDomain(temp)) { if (INFINITE) DEPTH = 1; if (!TRIED_FILES.contains(tempwola)) { if (DEPTH > 0) { DEPTH--; added = TRIED_FILES.add(tempwola); ts = new ThreeStrings(tempwola, makeCurrDir(tempwola, curr_dir), DOMAIN); queue.Push(ts); rel_addr = AbsToRel(temp, curr_dir, DOMAIN); temp_tag = Replace(temp_tag, temp, rel_addr); } } } else { if (INFINITE) DEPTH = 1; if (!TRIED_FILES.contains(tempwola)) { if (FOREIGN_DOMAIN_ALLOWED & (DEPTH > 0)) { DEPTH--; added = TRIED_FILES.add(tempwola); ts = new ThreeStrings(tempwola , makeCurrDir(tempwola, "#"), getDomain(temp)); queue.Push(ts); rel_addr = AbsToRel(temp, curr_dir, getDomain(temp)); temp_tag = Replace(temp_tag, temp, rel_addr); } } } } else if (type.equals("relative")) { if (INFINITE) DEPTH = 1; temp_abs_addr = makeAbsAddr(tempwola, curr_dir); if (!TRIED_FILES.contains(temp_abs_addr)) { if (DEPTH > 0) { DEPTH--; added = TRIED_FILES.add(temp_abs_addr); ts = new ThreeStrings(temp_abs_addr, makeCurrDir(tempwola, curr_dir), DOMAIN); queue.Push(ts); } } if (temp.startsWith("/")) temp_tag = Replace(temp_tag, temp, "." + temp); } outw.write(temp_tag); } else { outw.write(temp_tag); } } } // while (!EOF) EOF = false; outw.flush(); outw.close(); insr.close(); in.close(); } else { // if (file.createNewFile() & file.canWrite())
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -