📄

📁 一个简单的在互联网上抓包的程序
💻
字号:
import java.util.*;
import java.net.URL;
import java.io.RandomAccessFile;

/**
 * URL helpers for the crawler: maps a URL to a local storage path, resolves
 * relative links against a base URL, and extracts anchors from HTML text.
 *
 * <p>All live members are static. The class still extends a raw
 * {@link TreeMap} because the map-backed URL index (currently disabled, see
 * the commented-out section at the end of the class) used {@code put}/{@code get}.
 */
public class UrlMap extends TreeMap {

  /**
   * Builds a relative local path of the form
   * {@code <site>/<hex hash>_<file name tail>} for the given URL.
   *
   * <ul>
   *   <li>{@code site} is the host with {@code "www."} and the literal
   *       suffixes {@code .com .org .edu .idv .tw .cn} removed.</li>
   *   <li>The hash is the hexadecimal form of the URL string's hash code.</li>
   *   <li>The file name defaults to {@code index.htm}, has a trailing
   *       {@code .html} shortened to {@code .htm}, and is cut to its last
   *       12 characters.</li>
   * </ul>
   *
   * @param pUrl absolute URL text
   * @return the relative path, or {@code null} if the URL cannot be parsed or
   *         any other exception occurs while building the path
   */
  public static String url2path(String pUrl) {
    try {
      URL url = new URL(pUrl);
      // NOTE(review): STR.last presumably returns the text after the last "/"
      // of the URL's file part; STR is defined elsewhere -- confirm.
      String file = STR.last(url.getFile(), "/");
      if (file.length() == 0) {
        file = "index.htm";
      }
      if (file.endsWith("/")) {
        file = file + "index.htm";
      }
      if (file.endsWith(".html")) {
        // Drop the final 'l' so ".html" and ".htm" pages share one extension.
        file = file.substring(0, file.length() - 1);
      }
      // The dot before each suffix is escaped: an unescaped '.' matches any
      // character and would also eat parts of the host name itself
      // (e.g. "welcome.com" lost "lcom" in addition to ".com").
      String site = url.getHost()
          .replaceAll("www\\.", "")
          .replaceAll("\\.(com|org|edu|idv|tw|cn)", "");
      // Hash the URL text rather than the URL object: URL.hashCode() resolves
      // the host name, which blocks on DNS and can yield different values for
      // the same URL depending on the network. Integer.toHexString treats its
      // argument as unsigned, so Math.abs(Integer.MIN_VALUE) is harmless here.
      String hash = Integer.toHexString(Math.abs(pUrl.hashCode()));
      String fileTail = file.substring(Math.max(file.length() - 12, 0));
      return site + "/" + hash + "_" + fileTail;
    } catch (Exception e) {
      // Best effort: callers receive null for anything that cannot be mapped.
      return null;
    }
  }

  /**
   * Resolves a (possibly relative) link against the URL of the page that
   * contains it.
   *
   * @param pFileURL absolute URL of the containing page
   * @param pPath    link text to resolve, relative or absolute
   * @return the resolved URL as a string, or {@code null} if either argument
   *         cannot be parsed as a URL
   */
  public static String fullUrl(String pFileURL, String pPath) {
    try {
      URL fileURL = new URL(pFileURL);
      URL fullURL = new URL(fileURL, pPath);
      return fullURL.toString();
    } catch (Exception e) {
      return null;
    }
  }

  /**
   * Extracts the anchors ({@code <a ... href=...>text</a>}) found in an HTML
   * string.
   *
   * <p>The HTML is first rewritten so every matching anchor becomes
   * {@code <a>href,text</a>}; each such entry is then split at a comma into
   * URL and title, and the title is passed through
   * {@code REGEX.xml2plaintext}.
   *
   * <p>NOTE(review): REGEX and STR are project helpers defined elsewhere;
   * the exact matching and splitting rules (e.g. whether the split is at the
   * first comma) should be confirmed against them.
   *
   * @param html HTML text to scan
   * @return one {@link UrlData} per extracted entry, in the order returned by
   *         {@code REGEX.matches}
   */
  public static UrlData[] html2urls(String html) {
    html = REGEX.transform(html, "<a\\s[^>]*?href=\"?([^\"> ]{1,256}).*?>(.*?)</a>", "<a>(?1),(?2)</a>");
    String[] lines = REGEX.matches(html, 1, "<a>(.+?)</a>");
    UrlData[] urls = new UrlData[lines.length];
    for (int i = 0; i < urls.length; i++) {
      urls[i] = new UrlData();
      urls[i].url = STR.head(lines[i], ",");
      urls[i].title = REGEX.xml2plaintext(STR.tail(lines[i], ","));
    }
    return urls;
  }

/*
  Disabled legacy code: file-backed URL index (url.txt) keyed by UrlIndex.
  Kept for reference only.

  long urlOffset = 0;

  public static RandomAccessFile urlFile=null;

  public static void main(String[] args) throws Exception {
    String path = url2path("http://www.kmit.edu.tw/ccc/Carwler.htm");
    System.out.println(path);
    System.out.println(fullUrl("http://www.kmit.edu.tw/abc/def/ghi.htm", "123/456.htm"));
    UrlData[] urls = html2urls(STR.file2text("test.htm"));
    System.out.println(Arrays.asList(urls));
    UrlMap urlMap = new UrlMap();
    urlMap.add("http://www.kmit.edu.tw/ccc/Carwler.htm", "");
    urlMap.add("http://www.kmit.edu.tw/ccc/Carwler.htm", "");
    System.out.println(urlMap.getIndex("http://www.kmit.edu.tw/ccc/Carwler.htm"));
    System.out.println(urlMap);
  }

  UrlMap() throws Exception {
    if (urlFile != null) throw new Error("Error : UrlMap cannot be declared twice !");
    urlFile = new RandomAccessFile("url.txt", "rw");
  }

  UrlIndex getIndex(String pUrl) throws Exception {
    UrlIndex targetUrl = new UrlIndex();
    targetUrl.hashCode = pUrl.hashCode();
    return (UrlIndex) get(targetUrl);
  }

  UrlIndex add(String url, String title) throws Exception {
    UrlIndex urlIdx = getIndex(url);
    if (urlIdx == null) {
      urlIdx = new UrlIndex();
      urlIdx.hashCode = url.hashCode();
      urlIdx.offset = (int) append(url, title);
    }
    urlIdx.refCount++;
    put(urlIdx, urlIdx);
    return urlIdx;
  }

  int fileSize() throws Exception {
    return (int) urlFile.length();
  }

  public long append(String url, String title) throws Exception {
    urlFile.seek(fileSize());
    long offset = fileSize();
    String line = url+","+title+"\r\n";
    byte[] bytes = line.getBytes();
    urlFile.write(bytes);
    return offset;
  }

  public static UrlData read(long pOffset) throws Exception {
    UrlData urldata = new UrlData();
    urlFile.seek(pOffset);
    String line = urlFile.readLine();
    urldata.url = STR.head(line, ",");
    urldata.title = STR.tail(line, ",");
    return urldata;
  }

  public String toString(UrlIndex urlIndex) {
    try {
      urlFile.seek(urlIndex.offset);
      return urlFile.readLine();
    } catch (Exception e) {
      return null;
    }
  }  */
}

/**
 * Plain holder for one extracted link: its URL and the associated title text.
 */
class UrlData {
  /** Link target. */
  String url;

  /** Text associated with the link. */
  String title;

  /**
   * Renders this entry as {@code url,title}; a {@code null} field is printed
   * as the text {@code null}.
   */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append(url);
    text.append(",");
    text.append(title);
    return text.toString();
  }
}
/*
class UrlIndex implements Comparable {
  int hashCode=0, offset=0; short refCount=0;
  
  public int hashCode() { return hashCode; }
  
  public int compareTo(Object o) {
  	UrlIndex idx2 = (UrlIndex) o;
  	if (hashCode > idx2.hashCode) return 1;
  	if (hashCode < idx2.hashCode) return -1;
  	return 0;
  }
  
  public String toString() { return "(hash="+hashCode+" offset="+offset+" ref="+refCount+")"; }
}*/
💿 文件大小 2147 K
👤 上传用户 jxsflq
📂 所属分类 Jsp/Servlet
🏷️ 相关标签

#互联网 #程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -