首页 › 资源下载 › Java编程 › Java Crawler with do › 源码查看

saveurl1.java

来自「Java Crawler with domain knowledge path」· Java 代码 · 共 199 行

JAVA

199 行

package ie.vnit.webcrawler;

import java.net.*;
import java.io.*;
import java.util.*;

/**
 * Class with static methods that can save URLs and extract links
 * Works also as a standalone program, usage is: java SaveURL <url> [<file>]
 * If file is not specified, all links on the specified url are printed to the
 * standard console.
 *
 
 */

public class SaveURL1
{

	/**
	 * Opens a buffered stream on the url and copies the contents to writer
	 */
	public static void saveURL(URL url, Writer writer)
		throws IOException {
		BufferedInputStream in = new BufferedInputStream(url.openStream());
		for (int c = in.read(); c != -1; c = in.read()) {
			writer.write(c);
		}
	}

	/**
	 * Opens a buffered stream on the url and copies the contents to OutputStream
	 */
	public static void saveURL(URL url, OutputStream os)
		throws IOException {
		InputStream is = url.openStream();
		byte[] buf = new byte[1048576];
		int n = is.read(buf);
		while (n != -1) {
			os.write(buf, 0, n);
			n = is.read(buf);
		}
	}

	/**
	 * Writes the contents of the url to a string by calling saveURL with a
	 * string writer as argument
	 */
	public static String getURL(URL url)
		throws IOException {
		StringWriter sw = new StringWriter();
		saveURL(url, sw);
		return sw.toString();
	}

	/**
	 * Writes the contents of the url to a new file by calling saveURL with
	 * a file writer as argument
	 */
	public static void writeURLtoFile(URL url, String filename)
		throws IOException {
		// FileWriter writer = new FileWriter(filename);
		// saveURL(url, writer);
		// writer.close();
		FileOutputStream os = new FileOutputStream(filename);
		saveURL(url, os);
		os.close();
	}

	/**
	 * Extract links directly from a URL by calling extractLinks(getURL())
	 */
	public static Vector extractLinks(URL url)
		throws IOException {
		return extractLinks(getURL(url));
	}

	public static Map extractLinksWithText(URL url)
		throws IOException {
		return extractLinksWithText(getURL(url));
	}

	/**
     * Extract links from a html page given as a raw and a lower case string
     * In order to avoid the possible double conversion from mixed to lower case
     * a second method is provided, where the conversion is done externally.
     */
    public static Vector extractLinks(String rawPage, String page) {
		int index = 0;
		Vector links = new Vector();
		while ((index = page.indexOf("<a ", index)) != -1)
		{
		    if ((index = page.indexOf("href", index)) == -1) break;
		    if ((index = page.indexOf("=", index)) == -1) break;
		    String remaining = rawPage.substring(++index);
		    StringTokenizer st 
				= new StringTokenizer(remaining, "\t\n\r\"'>#");
		    String strLink = st.nextToken();
			if (! links.contains(strLink)) links.add(strLink);
		}
		return links;
    }


	/**
	 * Extract links (key) with link text (value)
	 * Note that due to the nature of a Map only one link text is returned per
	 * URL, even if a link occurs multiple times with different texts.
	 */
	public static Map extractLinksWithText(String rawPage, String page) {
		int index = 0;
		Map links = new HashMap();
		while ((index = page.indexOf("<a ", index)) != -1)
		{
			int tagEnd = page.indexOf(">", index);
		    if ((index = page.indexOf("href", index)) == -1) break;
		    if ((index = page.indexOf("=", index)) == -1) break;
			int endTag = page.indexOf("</a", index);
		    String remaining = rawPage.substring(++index);
		    StringTokenizer st 
				= new StringTokenizer(remaining, "\t\n\r\"'>#");
		    String strLink = st.nextToken();
			String strText = "";
			if (tagEnd != -1 && tagEnd + 1 <= endTag) {
				strText = rawPage.substring(tagEnd + 1, endTag);
			}
			strText = strText.replaceAll("\\s+", " ");
			links.put(strLink, strText);
		}
		return links;
		
	}
    
    /**
	 * Extract links from a html page given as a String
	 * The return value is a vector of strings. This method does neither check
	 * the validity of its results nor does it care about html comments, so
	 * links that are commented out are also retrieved.
	 */
	public static Vector extractLinks(String rawPage) {
        return extractLinks(rawPage, rawPage.toLowerCase().replaceAll("\\s", " "));
	}

	public static Map extractLinksWithText(String rawPage) {
        return extractLinksWithText(rawPage, rawPage.toLowerCase().replaceAll("\\s", " "));
	}

	/**
	 * As a standalone program this class is capable of copying a url to a file
	 */
	public static void main(String[] args) {
		
		/*		Behind a firewall set your proxy and port here!
*/
                Properties props= new Properties(System.getProperties());
                props.put("http.proxySet", "true");
        	    props.put("http.proxyHost", "172.31.16.10");
        	    props.put("http.proxyPort", "8080");

                Properties newprops = new Properties(props);
                System.setProperties(newprops);
		
		try {
			//PrintWriter out=new PrintWriter(new FileWriter("1.txt"));
			if (args.length == 1) {
                URL url = new URL(args[0]);
                System.out.println("Content-Type: " +
                    url.openConnection().getContentType());
// 				Vector links = extractLinks(url);
// 				for (int n = 0; n < links.size(); n++) {
// 					System.out.println((String) links.elementAt(n));
// 				}
				Set links = extractLinksWithText(url).entrySet();
				Iterator it = links.iterator();
				while (it.hasNext()) {
					Map.Entry en = (Map.Entry) it.next();
					String strLink = (String) en.getKey();
					String strText = (String) en.getValue();
					System.out.println(strLink + " \"" + strText + "\" ");
					//out.println(strText);
				}
				return;
			} else if (args.length == 2) {
				writeURLtoFile(new URL(args[0]), args[1]);
				return;
			}
			//out.close();
		} catch (Exception e) {
			System.err.println("An error occured: ");
			e.printStackTrace();
// 			System.err.println(e.toString());
		}
		// Display usage information
		// (If the program had done anything sensible, we wouldn't be here.)
		System.err.println("Usage: java SaveURL <url> [<file>]");
		System.err.println("Saves a URL to a file.");
		System.err.println("If no file is given, extracts hyperlinks on url to console.");
	}
}

saveurl1.java - 源码说明

本页面展示了「Java Crawler with domain knowledge path」中的 saveurl1.java 源码文件，采用 Java 编程语言编写，共 199 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与knowledge相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?