⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawlerweb.java

📁 一个用JAVA编写的小小爬虫,在做实验的时候觉得挺好的,拿来和大家分享下,看看没什么损失的~
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
		}// while (!end_of_string)		return strings;	}	/*	This method determines whether string s is a filename or not.	String s is considered a filename if:	    a) Contains only valid characters.		b) Has at least one dot.	If valid_url_check is set to true, this method only checks if s is a valid url.	 */	private boolean IsFile(String s, boolean valid_url_check) throws IOException {		boolean ok = true;		int i = 0;		char temp_char;		Character temp_Character = null;		if ((s.length() == 0) | (s.length() == 1))			return false;		while (ok & (i <= s.length() - 1)) {			temp_char = s.charAt(i);			temp_Character = new Character(temp_char);			ok = VALID_CHARS.contains(temp_Character);						i++;		}		if (!valid_url_check & ok) {			i = s.length() - 1;			while ((s.charAt(i) != '.') & (i > 0) & ok) {				i--;				if (i < s.length() - 10) {					ok = false;				}			}			if (s.charAt(i) == '.')				return true;			else				return false;		}		else if (valid_url_check & ok) {			return true;		}		else			return false;	}	/*	Takes an vector of strings found in quotes, and current directory as an argument.	Iterates through vector, for every string taken out from vector checks if it's a file name,	if it is a file name, makes an absolute address of that file and all other information	needed, and pushes it to main queue.	 */	private void FilesInScripts(Vector strings, String curr_dir) throws IOException {		int i = 0;		String temp_str = "", type = "", tempwola = "";		Object temp = null;		boolean ok = true, added;		ThreeStrings threes = null;		ok = !strings.isEmpty();		while (ok) {			if (!strings.isEmpty()) {				temp = strings.remove(0);							if (temp != null) {					temp_str = (String) temp;					/*					This one ensures that if we have more quotes encapsulated in this string, we go through all					of them and search for filenames.					 
*/					if (HasMoreQuotes(temp_str))						FilesInScripts(EverythingInQuotes(temp_str), curr_dir);					else {						if (IsFile(temp_str, false)) {							if (INFINITE)								DEPTH = 1;							tempwola = temp_str;							if (ContainsLocalAnchor(tempwola))								tempwola = RemLocalAnchor(tempwola);							type = typeOfURL(temp_str);							if (type.equals("absolute")) {								if (!TRIED_FILES.contains(tempwola) & (DEPTH > 0)) {									added = TRIED_FILES.add(tempwola);																		if (sameDomain(temp_str)) {										DEPTH--;										threes = new ThreeStrings(tempwola , makeCurrDir(tempwola, curr_dir), DOMAIN);										queue.Push(threes);									}									else {										if (FOREIGN_DOMAIN_ALLOWED) {											DEPTH--;											threes = new ThreeStrings(tempwola , makeCurrDir(tempwola, "#"), getDomain(temp_str));											queue.Push(threes);										}									}																	}							}							else if (type.equals("relative")) {								if (!TRIED_FILES.contains(makeAbsAddr(tempwola, curr_dir)) & (DEPTH > 0)) {									DEPTH--;									added = TRIED_FILES.add(makeAbsAddr(tempwola, curr_dir));									threes = new ThreeStrings(makeAbsAddr(tempwola, curr_dir), makeCurrDir(tempwola, curr_dir), DOMAIN);									queue.Push(threes);								}							}													}					}				}				else					ok = false;			}			else				ok = false;		}	}	private void outputPrintLn(String s) {		output.setText(output.getText() + "\r\n" + s);	}	private void outputClearScreen() {		output.setText("");	}    /*    This class will read files from given URL and store them to given location.    If file is a HTML file, this class will parse tags when found, change href=""    if needed, and then save HTML file.     
*/    public void RWFactory(Crawlerweb app) throws IOException, MalformedURLException {        ThreeStrings ts = null, temp_ts = null;		TwoStrings get_tag_output = null;        URL URLconn = null;        String URLAddress = "";        String curr_dir = "";        String file_name = "";		String rel_addr = "";        String temp_tag = "", type = "", temp = "", tempwola = ""; // tempwola = tempWithOutLocalAnchor		String first_string = "";		String file_path = "";		String temp_abs_addr = "";		String beginning_of_tag = "";		String ContentType = "", ResponseMessage = "";		String charset = "";		String s = null;		int ContentLength = -1, ResponseCode = -1;		BufferedInputStream ins = null;		BufferedReader in = null;		InputStreamReader insr = null;		OutputStreamWriter outw = null;		FileOutputStream filew = null;		File file = null;		FileOutputStream fileos = null;		BufferedOutputStream outos = null;		HttpURLConnection HTTPconn = null;		ImageIcon icon = new ImageIcon("images/question.png");		int r, k;		boolean urlOK, fileOK, binary = false, added, ok = false;        while ((ts = (ThreeStrings) queue.Pop()) != null) {			URLAddress = ts.getFirstString();			curr_dir = ts.getSecString();			DOMAIN = ts.getThirdString();			outputClearScreen();			outputPrintLn("----------------------------------------------------------------------");			outputPrintLn("Currently downloading: " + URLAddress);			if (LOGGING) {				err_out.write("----------------------------------------------------------------------");				err_out.newLine();				err_out.write("Currently downloading: " + URLAddress);				err_out.newLine();				err_out.flush();			}			urlOK = true;			fileOK = true;			try {				URLconn = new URL(URLAddress);			}			catch (MalformedURLException e) {				urlOK = false;			}			if (urlOK) {				try {					HTTPconn = (HttpURLConnection) URLconn.openConnection();									}				catch (MalformedURLException e) {					outputPrintLn(e.toString());					if (LOGGING) {						err_out.write(e.toString());						
err_out.newLine();					}					continue;				}				catch (IOException e) {					outputPrintLn(e.toString());					if (LOGGING) {						err_out.write(e.toString());						err_out.newLine();					}					continue;				}							    try {					ResponseCode = HTTPconn.getResponseCode();				}				catch(UnknownHostException g) {					outputPrintLn(g.toString());					if (LOGGING) {						err_out.write(g.toString());						err_out.newLine();					}					continue;				}				catch (ConnectException f) {					outputPrintLn(f.toString());					if (LOGGING) {						err_out.write(f.toString());						err_out.newLine();					}					continue;				}				catch (MalformedURLException m) {					outputPrintLn(m.toString());					if (LOGGING) {						err_out.write(m.toString());						err_out.newLine();					}					continue;				}				if (ResponseCode/100 == 4)					fileOK = false;				else {					ResponseMessage = HTTPconn.getResponseMessage();									ContentLength = HTTPconn.getContentLength();					ContentType = HTTPconn.getContentType();				}							}			if (urlOK & fileOK) {								outputPrintLn("Content type: " + ContentType + ".");				if (LOGGING) {					err_out.write("Content type: " + ContentType + ".");					err_out.newLine();				}				if (ContentLength != -1) {					outputPrintLn("Content length: " + ContentLength + " bytes.");					if (LOGGING) {						err_out.write("Content length: " + ContentLength + " bytes.");						err_out.newLine();					}				}				else {					outputPrintLn("Content length: unknown.");					if (LOGGING) {						err_out.write("Content length: unknown.");						err_out.newLine();					}				}				err_out.flush();				if (StringContains(ContentType, "text"))					binary = false;				else					binary = true;				if (!binary) {					k = QuickSearch(ContentType, "charset=");					if (k != -1)						charset = ContentType.substring(k+8);					else						charset = "";					try {						if (charset != "")							insr = new InputStreamReader(HTTPconn.getInputStream(), charset);						else							insr = new 
InputStreamReader(HTTPconn.getInputStream());						in = new BufferedReader(insr);					}					catch (FileNotFoundException e) {						outputPrintLn(e.toString());						continue;					}					if (ContainsVariables(URLAddress))						URLAddress = RemVariables(URLAddress);					if (ContainsLocalAnchor(URLAddress))						URLAddress = RemLocalAnchor(URLAddress);					file_name = getFileName(URLAddress);									if (IsDirectory(URLAddress)) {						if (URLAddress.endsWith("/"))							file_name = "index.html";						else							file_name = "/index.html";					}					if (NoFileExtension(URLAddress) & StringContains(ContentType, "html")) {						if (URLAddress.endsWith("/"))							file_name = "index.html";						else							file_name = "/index.html";					}					else						URLAddress = remFileName(URLAddress);								if (!makeFilePath(URLAddress, curr_dir).endsWith("/"))						file_path = makeFilePath(URLAddress, curr_dir) + File.separatorChar + file_name;					else						file_path = makeFilePath(URLAddress, curr_dir) + file_name;										file = new File(file_path);										outputPrintLn("Saving to: " + file.getAbsolutePath());					if (LOGGING) {						err_out.write("Saving to: " + file.getAbsolutePath());						err_out.newLine();						err_out.flush();					}					if (!SILENT) {						do {							Object[] possibilities = {"Download", "Skip", "Abort"};							s = (String) JOptionPane.showInputDialog(								                                    app,								                                    "What do you want\n"								                                    + "to do with this file?",								                                    "Customized Dialog",								                                    JOptionPane.PLAIN_MESSAGE,								                                    icon,								                                    possibilities,								                                    "Download");							if (s != null) {								s = s.trim();								if (s.equals("Skip")) {									insr.close();									continue;								}								else 
if (s.equals("Abort")) {									insr.close();									return;								}							}						} while (s == null);					}					try {						ok = file.createNewFile();					}					catch (IOException e) {						outputPrintLn(e.toString());						continue;					}					if (ok & file.canWrite()) {						filew = new FileOutputStream(file);													if (charset != "")							outw = new OutputStreamWriter(filew, charset);						else							outw = new OutputStreamWriter(filew);						while (!EOF) {							get_tag_output = getTag('<', '>', insr); //EOF is set/unset in getTag method							first_string = get_tag_output.getFirstString();							if ( (first_string != "") ) {								if (IN_SCRIPT)									FilesInScripts(EverythingInQuotes(first_string), curr_dir);								outw.write(first_string); // we will never change non-tag strings, so we can write 'em down							}							temp_tag = get_tag_output.getSecString();							if (temp_tag != "") { // if there is a tag...																if (temp_tag.length() >= 7) {									beginning_of_tag = temp_tag.substring(0, 7);									beginning_of_tag = beginning_of_tag.toLowerCase();																	if (temp_tag.startsWith(beginning_of_tag))										IN_SCRIPT = true;									else if (temp_tag.equalsIgnoreCase("</script>"))										IN_SCRIPT = false;								}																FilesInScripts(EverythingInQuotes(temp_tag), curr_dir);								//I've enabled this because there can be an inline script or some unknown tag fields (other than href, src)								temp = getAnchor(temp_tag); // see if there's some "href" or "src" in tag								if ((temp != "") && IsFile(temp, true)) { // if there is...									
type = typeOfURL(temp);									tempwola = temp;									if (ContainsLocalAnchor(temp))										tempwola = RemLocalAnchor(temp);																		if (type.equals("absolute")) {										if (sameDomain(temp)) {											if (INFINITE)												DEPTH = 1;											if (!TRIED_FILES.contains(tempwola)) {												if (DEPTH > 0) {													DEPTH--;																									added = TRIED_FILES.add(tempwola);													ts = new ThreeStrings(tempwola, makeCurrDir(tempwola, curr_dir), DOMAIN);													queue.Push(ts);													rel_addr = AbsToRel(temp, curr_dir, DOMAIN);													temp_tag = Replace(temp_tag, temp, rel_addr);												}											}										}										else {											if (INFINITE)												DEPTH = 1;											if (!TRIED_FILES.contains(tempwola)) {												if (FOREIGN_DOMAIN_ALLOWED & (DEPTH > 0)) {													DEPTH--;													added = TRIED_FILES.add(tempwola);													ts = new ThreeStrings(tempwola , makeCurrDir(tempwola, "#"), getDomain(temp));													queue.Push(ts);													rel_addr = AbsToRel(temp, curr_dir, getDomain(temp));													temp_tag = Replace(temp_tag, temp, rel_addr);												}											}										}									}									else if (type.equals("relative")) {										if (INFINITE)											DEPTH = 1;										temp_abs_addr = makeAbsAddr(tempwola, curr_dir);										if (!TRIED_FILES.contains(temp_abs_addr)) {											if (DEPTH > 0) {												DEPTH--;												added = TRIED_FILES.add(temp_abs_addr);												ts = new ThreeStrings(temp_abs_addr, makeCurrDir(tempwola, curr_dir), DOMAIN);												queue.Push(ts);																							}										}										if (temp.startsWith("/"))											temp_tag = Replace(temp_tag, temp, "." 
+ temp);									}									outw.write(temp_tag);								}								else {									outw.write(temp_tag);								}							}						} // while (!EOF)						EOF = false;						outw.flush();						outw.close();						insr.close();						in.close();					}					else { // if (file.createNewFile() & file.canWrite())

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -