📄 spider.java
字号:
package com.bitmechanic.spindle;

import cvu.html.HTMLTokenizer;
import cvu.html.TagToken;
import cvu.html.TextToken;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Enumeration;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.HttpURLConnection;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.StringReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.security.Security;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;

import com.microvois.luence.inputproxy;

/**
 * Multi-threaded, single-site web spider.
 *
 * <p>Starting from the entry URLs given with <code>-u</code>, worker threads
 * download each page, extract a title and a text description, hand them to
 * <code>inputproxy.inputData</code>, and queue every same-host link that
 * passes the include/exclude filters.
 *
 * <p>All shared mutable state (<code>urls</code>, <code>indexedURLs</code>,
 * <code>threads</code>, <code>bytes</code>, <code>nInputcount</code>) is
 * guarded by the monitor of this instance.
 */
public class Spider implements Runnable {

    private static String lineSep = System.getProperty("line.separator");

    /** URL used only to test whether an https protocol handler is installed. */
    private static final String HTTPS_PROBE_URL = "https://www.bitmechanic.com/";

    private String indexDir;
    /** Queue of URLs waiting to be fetched. */
    private ArrayList urls;
    /** Substrings that a discovered URL must contain (all of them). */
    private ArrayList include;
    /** Substrings that a discovered URL must not contain (any of them). */
    private ArrayList exclude;
    private ArrayList threadList;
    private boolean verbose;
    private boolean incremental;
    private boolean groksHTTPS;
    /** Every URL that has ever been queued; used for de-duplication. */
    private HashMap indexedURLs;
    // NOTE(review): filled by -m / defaults but not consulted anywhere in this file.
    private HashMap mimeTypes;
    /**
     * Configured worker count; once workers run it is the number of workers
     * that are not idle (see {@link #dequeueURL()}).
     */
    private int threads;
    /** Maximum length of the generated description. */
    private int descSize;
    /** Number of pages accepted by <code>inputproxy.inputData</code>. */
    private int nInputcount = 0;
    /** Total number of body characters processed. */
    private int bytes;
    private HttpClient httpclient;

    /**
     * Command-line entry point. Any failure is reported on stderr, and
     * <code>inputproxy.close()</code> is always called before returning.
     */
    public static void main(String argv[]) {
        try {
            Spider s = new Spider(argv);
            s.go();
        } catch (Exception se) {
            // Previously swallowed silently, which hid even "missing argument" errors.
            System.err.println("Spider failed: " + se);
            se.printStackTrace();
        } finally {
            inputproxy.close();
        }
    }

    /**
     * Creates a spider configured from command-line arguments.
     *
     * @param argv command-line options, see {@link #parseArgs(String[])}
     * @throws IllegalArgumentException if a required option is missing or invalid
     */
    public Spider(String argv[]) {
        groksHTTPS = true;
        verbose = false;
        incremental = false;
        threads = 2;
        descSize = 1024;
        bytes = 0;
        include = new ArrayList();
        exclude = new ArrayList();
        urls = new ArrayList();
        threadList = new ArrayList();
        indexedURLs = new HashMap();
        mimeTypes = new HashMap();
        parseArgs(argv);
        httpclient = new HttpClient(new MultiThreadedHttpConnectionManager());
        httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000);
    }

    /**
     * Runs the crawl: starts the worker threads, waits for all of them to
     * finish, and prints statistics when verbose.
     *
     * @throws Exception if the calling thread is interrupted while joining
     */
    public void go() throws Exception {
        if (verbose) {
            print("Creating index in: " + indexDir);
            if (incremental)
                print(" - using incremental mode");
        }

        // check if we can do https URLs
        if (!detectHttpsSupport()) {
            groksHTTPS = false;
            if (verbose)
                print("Disabling support for https URLs");
        }

        long start = System.currentTimeMillis();

        // Snapshot the count: workers modify 'threads' as soon as they start,
        // so using the field as the loop bound could start too few workers.
        int workerCount;
        synchronized (this) {
            workerCount = threads;
        }
        for (int i = 0; i < workerCount; i++) {
            Thread t = new Thread(this, "Spindle Spider Thread #" + (i + 1));
            t.start();
            threadList.add(t);
        }
        while (threadList.size() > 0) {
            Thread child = (Thread) threadList.remove(0);
            child.join();
        }
        long elapsed = System.currentTimeMillis() - start;

        if (verbose) {
            print("Indexed " + indexedURLs.size() + " URLs (" + (bytes / 1024)
                    + " KB) in " + (elapsed / 1000) + " seconds");
            print("Optimizing index");
        }
    }

    /**
     * Reports whether <code>https</code> URLs can be constructed.
     *
     * <p>The built-in handler is tried first. Only when that fails is the
     * legacy JSSE provider registered; it is loaded reflectively so the class
     * has no compile-time dependency on an internal JDK class.
     */
    private boolean detectHttpsSupport() {
        try {
            new URL(HTTPS_PROBE_URL);
            return true;
        } catch (MalformedURLException noBuiltInHandler) {
            // fall through to the legacy provider
        }
        try {
            System.setProperty("java.protocol.handler.pkgs",
                    "com.sun.net.ssl.internal.www.protocol");
            Object provider =
                    Class.forName("com.sun.net.ssl.internal.ssl.Provider").newInstance();
            Security.addProvider((java.security.Provider) provider);
            new URL(HTTPS_PROBE_URL);
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /**
     * Worker loop: takes URLs from the queue until {@link #dequeueURL()}
     * signals that the crawl is finished.
     */
    public void run() {
        String url;
        try {
            while ((url = dequeueURL()) != null) {
                try {
                    indexURL(url);
                } catch (Exception e) {
                    // A single bad page must not kill the worker: a worker that
                    // dies while still counted as active would leave the other
                    // workers waiting forever in dequeueURL().
                    System.err.println("Failed to index " + url + ": " + e);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            // dequeueURL() failed (e.g. interrupted while waiting). It has
            // already removed this worker from the active count, so no further
            // bookkeeping is needed here.
            e.printStackTrace();
        }
    }

    /**
     * Returns the next URL to fetch, blocking while the queue is empty but
     * other workers are still active and may add more.
     *
     * @return the next URL, or <code>null</code> once the queue is empty and
     *         no worker is active any more
     * @throws Exception if the thread is interrupted while waiting
     */
    public synchronized String dequeueURL() throws Exception {
        while (true) {
            if (urls.size() > 0) {
                return (String) urls.remove(0);
            } else {
                threads--; // this worker is now idle
                if (threads > 0) {
                    wait();
                    threads++; // woken up: active again
                } else {
                    // last active worker found nothing: release everyone
                    notifyAll();
                    return null;
                }
            }
        }
    }

    /**
     * Queues a URL unless it has been queued before, and wakes idle workers.
     *
     * @param url absolute URL to fetch
     */
    public synchronized void enqueueURL(String url) {
        if (indexedURLs.get(url) == null) {
            urls.add(url);
            indexedURLs.put(url, Boolean.TRUE);
            notifyAll();
        }
    }

    /** Thread-safe check whether a URL has already been queued. */
    private synchronized boolean isKnownURL(String url) {
        return indexedURLs.get(url) != null;
    }

    /**
     * Downloads one page, submits it to <code>inputproxy</code> and queues the
     * links found on it.
     *
     * @param url absolute URL of the page
     * @throws Exception if loading or parsing the page fails
     */
    private void indexURL(String url) throws Exception {
        if (verbose)
            print(" " + Thread.currentThread().getName() + ": Adding URL: " + url);

        URLSummary summary = loadURL(url);
        if (summary == null || summary.body == null) {
            return;
        }

        String links[] = parseURLs(summary);
        System.out.println(summary.toString());

        synchronized (this) {
            bytes += summary.body.length();
            if (inputproxy.inputData(summary.title, summary.desc, url)) {
                System.out.println(url + " input search engine sucess ...");
                // The counter was never advanced before, so the periodic
                // flush below could never run.
                nInputcount++;
                if (nInputcount % 100 == 0)
                    inputproxy.flush();
            } else {
                System.out.println(url + " input search engine failed ...");
            }
        }

        for (int i = 0; i < links.length; i++) {
            if (passesFilters(links[i])) {
                enqueueURL(links[i]);
            }
        }
    }

    /**
     * Checks a URL against the include/exclude lists: it must contain every
     * include substring and none of the exclude substrings.
     */
    private boolean passesFilters(String url) {
        for (int x = 0; x < include.size(); x++) {
            if (url.indexOf((String) include.get(x)) == -1)
                return false;
        }
        for (int x = 0; x < exclude.size(); x++) {
            if (url.indexOf((String) exclude.get(x)) != -1)
                return false;
        }
        return true;
    }

    /**
     * Parses the page content: fills <code>summary.title</code> and
     * <code>summary.desc</code> and collects the links worth following.
     *
     * <p>The description is built from the meta keywords/description and the
     * visible text (text inside <code>script</code> elements is skipped),
     * truncated to <code>descSize</code> characters.
     *
     * @param summary page to parse; <code>url</code> and <code>body</code> must be set
     * @return absolute URLs found in <code>a href</code> and <code>frame src</code>
     *         that are on the same host/port and not yet queued
     * @throws Exception if tokenizing the body fails
     */
    private String[] parseURLs(URLSummary summary) throws Exception {
        StringBuffer desc = new StringBuffer();
        ArrayList found = new ArrayList();
        boolean isIgnoreText = false;

        HTMLTokenizer ht = new HTMLTokenizer(new StringReader(summary.body));
        for (Enumeration e = ht.getTokens(); e.hasMoreElements();) {
            Object obj = e.nextElement();
            if (obj instanceof TagToken) {
                TagToken tag = (TagToken) obj;
                String tagName = tag.getName().toLowerCase();
                String url = null;

                if (tagName.equals("meta")) {
                    // add the keywords and description meta content to the indexed text
                    String metaName = tag.getAttribute("name");
                    String content = tag.getAttribute("content");
                    if (content != null
                            && ("keywords".equalsIgnoreCase(metaName)
                                    || "description".equalsIgnoreCase(metaName))) {
                        // trailing space keeps the content from fusing with the next text
                        desc.append(content).append(" ");
                    }
                }

                // filter out the content of script elements
                if (tag.isEndTag() == false && tagName.equals("script")) {
                    isIgnoreText = true;
                } else if (tag.isEndTag() == true && tagName.equals("script")) {
                    isIgnoreText = false;
                } else if (tagName.equals("a")) {
                    url = tag.getAttributes().get("href");
                } else if (tagName.equals("frame")) {
                    url = tag.getAttributes().get("src");
                } else if (tagName.equals("title") && e.hasMoreElements()
                        && !tag.isEndTag()) {
                    // the token right after <title> is expected to be its text
                    obj = e.nextElement();
                    if (obj instanceof TextToken) {
                        TextToken title = (TextToken) obj;
                        summary.title = title.getText();
                    }
                }

                if (url != null) {
                    String link = resolveLink(summary.url, url);
                    if (link != null && !isKnownURL(link))
                        found.add(link);
                }
            } else if (obj instanceof TextToken) {
                if (isIgnoreText)
                    continue;
                TextToken t = (TextToken) obj;
                String text = t.getText();
                if (text != null && text.trim().length() > 0)
                    desc.append(text.trim()).append(" ");
            }
        }

        if (desc.length() > descSize)
            desc.setLength(descSize);
        summary.desc = desc.toString();

        String list[] = new String[found.size()];
        found.toArray(list);
        return list;
    }

    /**
     * Turns a raw link attribute into an absolute, anchor-free URL to follow.
     *
     * @param pageURL URL of the page the link was found on
     * @param link    raw attribute value
     * @return the absolute URL, or <code>null</code> if the link is off-site,
     *         malformed, or uses a scheme that is not followed
     */
    private String resolveLink(URL pageURL, String link) {
        if (link.startsWith("http://")
                || (link.startsWith("https://") && groksHTTPS)) {
            // verify we're on the same host and port
            URL u;
            try {
                u = new URL(link);
            } catch (MalformedURLException e) {
                // one broken link must not abort parsing of the whole page
                return null;
            }
            if (u.getHost().equals(pageURL.getHost())
                    && u.getPort() == pageURL.getPort()) {
                return chopOffNamedAnchor(link);
            }
            return null;
        }
        if (link.indexOf("://") == -1 && !link.startsWith("mailto:")
                && !link.startsWith("#") && !link.startsWith("javascript:")) {
            // relative url
            return chopOffNamedAnchor(formURL(pageURL, link));
        }
        return null;
    }

    /** Returns the URL without its <code>#fragment</code> part, if any. */
    private String chopOffNamedAnchor(String url) {
        int pos = url.indexOf("#");
        if (pos == -1)
            return url;
        else
            return url.substring(0, pos);
    }

    /**
     * Converts a relative URL to an absolute URL.
     *
     * <p>Leading <code>../</code> segments each remove one directory from the
     * original path; going above the root stays at the root.
     *
     * @param origURL URL of the page containing the link
     * @param newURL  relative link (server-relative if it starts with "/")
     * @return the absolute URL as a string
     */
    private String formURL(URL origURL, String newURL) {
        StringBuffer base = new StringBuffer(origURL.getProtocol());
        base.append("://").append(origURL.getHost());
        if (origURL.getPort() != -1) {
            base.append(":").append(origURL.getPort());
        }

        if (newURL.startsWith("/")) {
            base.append(newURL);
        } else {
            // getPath() rather than getFile(): a '/' inside the query string
            // must not be mistaken for a directory separator.
            String dir = origURL.getPath();
            int pos = dir.lastIndexOf("/");
            if (pos != -1)
                dir = dir.substring(0, pos);

            // "../" links used to fall into a separate branch that appended
            // nothing, so they all resolved to the bare host.
            while (newURL.startsWith("../")) {
                pos = dir.lastIndexOf("/");
                if (pos != -1)
                    dir = dir.substring(0, pos);
                newURL = newURL.substring(3);
            }
            base.append(dir).append("/").append(newURL);
        }
        return base.toString();
    }

    /**
     * Parses a content type such as <code>text/html;charset=utf-8</code>.
     *
     * @param strcontenttype header value, may be <code>null</code>
     * @return ret[0] = MIME type (default "text/html"),
     *         ret[1] = charset (default "gb2312")
     */
    private String[] parseContentType(String strcontenttype) {
        String straret[] = new String[2];
        // default values
        straret[0] = "text/html";
        straret[1] = "gb2312";

        if (strcontenttype != null) {
            int npos = strcontenttype.indexOf(";");
            if (npos == -1) {
                straret[0] = strcontenttype;
            } else {
                straret[0] = strcontenttype.substring(0, npos);
                npos = strcontenttype.indexOf("=");
                if (npos != -1) {
                    straret[1] = strcontenttype.substring(npos + 1);
                }
            }
        }
        return straret;
    }

    /**
     * Downloads a page with the shared HTTP client, following redirects.
     *
     * @param url absolute URL to fetch
     * @return a summary with <code>url</code> and <code>body</code> set, or
     *         <code>null</code> if the response status is not 200
     * @throws Exception if the request fails or the URL is invalid
     */
    private URLSummary loadURL(String url) throws Exception {
        URLSummary summary = null;
        GetMethod get = null;
        try {
            get = new GetMethod(url);
            get.setFollowRedirects(true);
            int iGetResultCode = httpclient.executeMethod(get);
            if (iGetResultCode == 200) {
                String strGetResponseBody = get.getResponseBodyAsString();
                summary = new URLSummary();
                summary.url = new URL(url);
                summary.body = strGetResponseBody;
            }
        } catch (FileNotFoundException e) {
            // 404
            summary = null;
        } finally {
            // always hand the connection back to the pool
            if (get != null)
                get.releaseConnection();
        }
        return summary;
    }

    /**
     * Parses the command line.
     *
     * <p>Options: <code>-u url</code> (repeatable, required),
     * <code>-d dir</code> (required), <code>-i substring</code>,
     * <code>-e substring</code>, <code>-m mimetype</code>,
     * <code>-t threads</code>, <code>-s descSize</code>, <code>-v</code>,
     * <code>-a</code>. Unknown arguments are ignored.
     *
     * @throws IllegalArgumentException if a required option or an option
     *         value is missing, or a numeric value is invalid
     */
    private void parseArgs(String argv[]) {
        for (int i = 0; i < argv.length; i++) {
            if (argv[i].equals("-u"))
                urls.add(optionValue(argv, i++));
            else if (argv[i].equals("-d"))
                indexDir = optionValue(argv, i++);
            else if (argv[i].equals("-i"))
                include.add(optionValue(argv, i++));
            else if (argv[i].equals("-e"))
                exclude.add(optionValue(argv, i++));
            else if (argv[i].equals("-v"))
                verbose = true;
            else if (argv[i].equals("-a"))
                incremental = true;
            else if (argv[i].equals("-m"))
                mimeTypes.put(optionValue(argv, i++), Boolean.TRUE);
            else if (argv[i].equals("-t"))
                threads = Integer.parseInt(optionValue(argv, i++));
            else if (argv[i].equals("-s"))
                descSize = Integer.parseInt(optionValue(argv, i++));
        }

        if (urls.size() == 0)
            throw new IllegalArgumentException(
                    "Missing required argument: -u [start url]");
        if (indexDir == null)
            throw new IllegalArgumentException(
                    "Missing required argument: -d [index dir]");
        if (threads < 1)
            throw new IllegalArgumentException("Invalid number of threads: "
                    + threads);

        if (mimeTypes.size() == 0) {
            // add default MIME types
            mimeTypes.put("text/html", Boolean.TRUE);
            mimeTypes.put("text/plain", Boolean.TRUE);
        }
    }

    /**
     * Returns the value following the option at <code>index</code>.
     *
     * @throws IllegalArgumentException if the option is the last argument
     */
    private static String optionValue(String argv[], int index) {
        if (index + 1 >= argv.length)
            throw new IllegalArgumentException("Missing value for option: "
                    + argv[index]);
        return argv[index + 1];
    }

    private void print(String str) {
        System.out.println(str);
    }
}

/** Mutable holder for what the spider extracts from one downloaded page. */
class URLSummary {
    URL url;
    String body;
    String desc = "";
    String title = "Untitled";

    public String toString() {
        // plain concatenation so a not-yet-set url prints as "null" instead of throwing
        return "URL=" + url + "\r\ndesc=" + desc + "\r\ntitle=" + title + "\r\n";
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -