📄
字号:
import java.util.*;
import java.net.URL;
import java.io.RandomAccessFile;
/**
 * Static helpers for turning URLs into local path names, resolving relative
 * links against a base URL, and pulling anchor links out of HTML text.
 *
 * <p>The class extends the raw {@link TreeMap}; the map-backed index methods
 * (constructor, {@code add}, {@code getIndex}, file persistence) are currently
 * disabled and kept below inside a block comment.
 */
public class UrlMap extends TreeMap {

    /**
     * Builds a relative path of the form {@code site/hexhash_file} for a URL.
     *
     * <ul>
     *   <li>{@code file} starts from {@code STR.last(url.getFile(), "/")}; an
     *       empty result becomes {@code index.htm}, a trailing {@code /} gets
     *       {@code index.htm} appended, and a trailing {@code .html} is
     *       shortened to {@code .htm}. Only the last 12 characters are used.</li>
     *   <li>{@code site} is the host with {@code www.} and the literal suffixes
     *       {@code .com .org .edu .idv .tw .cn} removed.</li>
     *   <li>{@code hexhash} is the hex form of {@code Math.abs(url.hashCode())}.</li>
     * </ul>
     *
     * <p>NOTE(review): {@link URL#hashCode()} may perform host name resolution,
     * so the hash component can depend on DNS state. It is left unchanged here
     * so that previously generated paths stay stable; confirm whether a purely
     * string-based hash would be acceptable.
     *
     * @param pUrl absolute URL text
     * @return the relative path, or {@code null} if the URL cannot be parsed or
     *         any other exception occurs while building the path
     */
    public static String url2path(String pUrl) {
        try {
            URL url = new URL(pUrl);
            String file = STR.last(url.getFile(), "/");
            if (file.length() == 0) {
                file = "index.htm";
            }
            if (file.endsWith("/")) {
                file = file + "index.htm";
            }
            if (file.endsWith(".html")) {
                // Drop the trailing 'l' so ".html" is stored as ".htm".
                file = file.substring(0, file.length() - 1);
            }
            // The dots are escaped so that only literal suffixes such as ".com"
            // are removed; an unescaped '.' matches any character and would also
            // strip e.g. "ecom" out of "telecom".
            String site = url.getHost()
                    .replaceAll("www\\.", "")
                    .replaceAll("((\\.com)|(\\.org)|(\\.edu)|(\\.idv)|(\\.tw)|(\\.cn))", "");
            String hash = Integer.toHexString(Math.abs(url.hashCode()));
            String shortFile = file.substring(Math.max(file.length() - 12, 0));
            return site + "/" + hash + "_" + shortFile;
        } catch (Exception e) {
            // Best-effort contract: callers receive null for anything that fails.
            return null;
        }
    }

    /**
     * Resolves {@code pPath} in the context of {@code pFileURL}.
     *
     * @param pFileURL absolute URL used as the resolution context
     * @param pPath    URL spec, possibly relative, to resolve against it
     * @return the resolved URL as a string, or {@code null} if either URL
     *         cannot be constructed
     */
    public static String fullUrl(String pFileURL, String pPath) {
        try {
            URL fileURL = new URL(pFileURL);
            URL fullURL = new URL(fileURL, pPath);
            return fullURL.toString();
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Extracts the anchors of an HTML text as {@link UrlData} entries.
     *
     * <p>Each anchor matched by the first pattern is rewritten by
     * {@code REGEX.transform} using the template {@code <a>(?1),(?2)</a>}
     * (presumably "href,inner text" -- depends on the helper's group syntax),
     * then every {@code <a>...</a>} body is split at "," with
     * {@code STR.head} / {@code STR.tail} into url and title. The title is
     * additionally passed through {@code REGEX.xml2plaintext}.
     *
     * @param html HTML text to scan
     * @return one entry per matched anchor, in the order returned by
     *         {@code REGEX.matches}
     */
    public static UrlData[] html2urls(String html) {
        html = REGEX.transform(html, "<a\\s[^>]*?href=\"?([^\"> ]{1,256}).*?>(.*?)</a>", "<a>(?1),(?2)</a>");
        String[] lines = REGEX.matches(html, 1, "<a>(.+?)</a>");
        UrlData urls[] = new UrlData[lines.length];
        for (int i = 0; i < urls.length; i++) {
            urls[i] = new UrlData();
            urls[i].url = STR.head(lines[i], ",");
            urls[i].title = REGEX.xml2plaintext(STR.tail(lines[i], ","));
        }
        return urls;
    }

    // Disabled file-backed URL index; kept unchanged for reference.
    /*
    long urlOffset = 0;
    public static RandomAccessFile urlFile=null;
    public static void main(String[] args) throws Exception {
    String path = url2path("http://www.kmit.edu.tw/ccc/Carwler.htm");
    System.out.println(path);
    System.out.println(fullUrl("http://www.kmit.edu.tw/abc/def/ghi.htm", "123/456.htm"));
    UrlData[] urls = html2urls(STR.file2text("test.htm"));
    System.out.println(Arrays.asList(urls));
    UrlMap urlMap = new UrlMap();
    urlMap.add("http://www.kmit.edu.tw/ccc/Carwler.htm", "");
    urlMap.add("http://www.kmit.edu.tw/ccc/Carwler.htm", "");
    System.out.println(urlMap.getIndex("http://www.kmit.edu.tw/ccc/Carwler.htm"));
    System.out.println(urlMap);
    }
    UrlMap() throws Exception {
    if (urlFile != null) throw new Error("Error : UrlMap cannot be declared twice !");
    urlFile = new RandomAccessFile("url.txt", "rw");
    }
    UrlIndex getIndex(String pUrl) throws Exception {
    UrlIndex targetUrl = new UrlIndex();
    targetUrl.hashCode = pUrl.hashCode();
    return (UrlIndex) get(targetUrl);
    }
    UrlIndex add(String url, String title) throws Exception {
    UrlIndex urlIdx = getIndex(url);
    if (urlIdx == null) {
    urlIdx = new UrlIndex();
    urlIdx.hashCode = url.hashCode();
    urlIdx.offset = (int) append(url, title);
    }
    urlIdx.refCount++;
    put(urlIdx, urlIdx);
    return urlIdx;
    }
    int fileSize() throws Exception {
    return (int) urlFile.length();
    }
    public long append(String url, String title) throws Exception {
    urlFile.seek(fileSize());
    long offset = fileSize();
    String line = url+","+title+"\r\n";
    byte[] bytes = line.getBytes();
    urlFile.write(bytes);
    return offset;
    }
    public static UrlData read(long pOffset) throws Exception {
    UrlData urldata = new UrlData();
    urlFile.seek(pOffset);
    String line = urlFile.readLine();
    urldata.url = STR.head(line, ",");
    urldata.title = STR.tail(line, ",");
    return urldata;
    }
    public String toString(UrlIndex urlIndex) {
    try {
    urlFile.seek(urlIndex.offset);
    return urlFile.readLine();
    } catch (Exception e) {
    return null;
    }
    } */
}
/**
 * Plain holder pairing a link's URL with the title text attached to it.
 */
class UrlData {
    /** Link target as text. */
    String url;

    /** Title text associated with the link. */
    String title;

    /**
     * Renders the pair as {@code url,title}; an unset field appears as the
     * text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(url).append(",").append(title);
        return text.toString();
    }
}
/*
class UrlIndex implements Comparable {
int hashCode=0, offset=0; short refCount=0;
public int hashCode() { return hashCode; }
public int compareTo(Object o) {
UrlIndex idx2 = (UrlIndex) o;
if (hashCode > idx2.hashCode) return 1;
if (hashCode < idx2.hashCode) return -1;
return 0;
}
public String toString() { return "(hash="+hashCode+" offset="+offset+" ref="+refCount+")"; }
}*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -