📄 spider.java
/*
 * Spider.java
 *
 * Created on September 20, 2004, 9:20 AM
 */
import java.util.*;
import java.io.*;
import java.net.*;
import javax.swing.*;
import javax.swing.tree.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.*;
import javax.swing.text.html.*;
import javax.swing.text.*;

/**
 * Object used to search the web (or a subset of given domains) for a list of keywords
 * @author Mark Pendergast
 */
public class Spider extends Thread {

    /** site visit limit (stops search at some point) */
    private int siteLimit = 100;
    /** search depth limit */
    private int depthLimit = 100;
    /** keyword list for search */
    private String keywordList[];
    /** ip type list */
    private String ipDomainList[];
    /** visited tree */
    private JTree searchTree = null;
    /** message JTextArea, place to post errors */
    private JTextArea messageArea;
    /** place to put search statistics */
    private JLabel statsLabel;
    /** keep track of web sites searched */
    private int sitesSearched = 0;
    /** keep track of web sites found with matching criteria */
    private int sitesFound = 0;
    /** starting site for the search */
    private String startSite;
    /** flag used to stop search */
    private boolean stopSearch = false;

    /**
     * Creates a new instance of Spider
     * @param atree JTree used to display the search space
     * @param amessagearea JTextArea used to display error/warning messages
     * @param astatlabel JLabel to display number of searched sites and hits
     * @param astartsite web site to use to start the search
     * @param akeywordlist list of keywords to search for
     * @param aipdomainlist list of top level domains
     * @param asitelimit maximum number of web pages to look at
     * @param adepthlimit maximum number of levels down to search (controls recursion)
     */
    public Spider(JTree atree, JTextArea amessagearea, JLabel astatlabel, String astartsite,
                  String[] akeywordlist, String[] aipdomainlist, int asitelimit, int adepthlimit) {
        searchTree = atree;            // place to display search tree
        messageArea = amessagearea;    // place to display error messages
        statsLabel = astatlabel;       // place to put run statistics
        startSite = fixHref(astartsite);
        keywordList = new String[akeywordlist.length];
        for (int i = 0; i < akeywordlist.length; i++)
            keywordList[i] = akeywordlist[i].toUpperCase();   // use all upper case for matching
        ipDomainList = new String[aipdomainlist.length];
        for (int i = 0; i < aipdomainlist.length; i++)
            ipDomainList[i] = aipdomainlist[i].toUpperCase(); // use all upper case for matching
        siteLimit = asitelimit;    // max number of sites to look at
        depthLimit = adepthlimit;  // max depth of recursion to use

        DefaultMutableTreeNode root = new DefaultMutableTreeNode(new UrlTreeNode("Root"));
        DefaultTreeModel treeModel = new DefaultTreeModel(root); // create a tree model with a root
        searchTree.setModel(treeModel);
        searchTree.setCellRenderer(new UrlNodeRenderer());       // use a custom cell renderer
    }

    /**
     * start running the search in a new thread
     */
    public void run() {
        DefaultTreeModel treeModel = (DefaultTreeModel) searchTree.getModel(); // get our model
        DefaultMutableTreeNode root = (DefaultMutableTreeNode) treeModel.getRoot();

        String urllc = startSite.toLowerCase();
        if (!urllc.startsWith("http://") && !urllc.startsWith("ftp://") && !urllc.startsWith("www.")) {
            startSite = "file:///" + startSite;  // note you must have 3 slashes !
        }
        else if (urllc.startsWith("www.")) {     // http missing ?
            startSite = "http://" + startSite;   // tack on http://
        }
        startSite = startSite.replace('\\', '/'); // fix bad slashes

        sitesFound = 0;
        sitesSearched = 0;
        updateStats();
        searchWeb(root, startSite); // search the web
        messageArea.append("Done!\n\n");
    }

    /**
     * search the url search tree to see if we've already visited the specified url
     * @param urlstring url to search for
     * @return true if the url is already in the tree
     */
    public boolean urlHasBeenVisited(String urlstring) {
        String teststring = fixHref(urlstring);
        DefaultTreeModel treeModel = (DefaultTreeModel) searchTree.getModel(); // get our model
        DefaultMutableTreeNode root = (DefaultMutableTreeNode) treeModel.getRoot();
        Enumeration etree = root.breadthFirstEnumeration();
        while (etree.hasMoreElements()) {
            UrlTreeNode node = (UrlTreeNode) (((DefaultMutableTreeNode) etree.nextElement()).getUserObject());
            if (node instanceof UrlTreeNode && node.equals(teststring))
                return true;
        }
        return false;
    }

    /**
     * Check depth of search
     * @param node search tree node to test the depth limit of
     * @return true if depth limit exceeded
     */
    public boolean depthLimitExceeded(DefaultMutableTreeNode node) {
        if (node.getLevel() >= depthLimit)
            return true;
        else
            return false;
    }

    /**
     * add a node to the search tree
     * @param parentnode parent to add the new node under
     * @param newnode node to be added to the tree
     */
    private DefaultMutableTreeNode addNode(DefaultMutableTreeNode parentnode, UrlTreeNode newnode) {
        DefaultMutableTreeNode node = new DefaultMutableTreeNode(newnode);
        DefaultTreeModel treeModel = (DefaultTreeModel) searchTree.getModel(); // get our model
        int index = treeModel.getChildCount(parentnode);   // how many children are there already?
        treeModel.insertNodeInto(node, parentnode, index); // add as last child
        TreePath tp = new TreePath(parentnode.getPath());
        searchTree.expandPath(tp); // make sure the user can see the node just added
        return node;
    }

    /**
     * determines if the given url is in one of the top level domains in the domain search list
     * @param url url to be checked
     * @return true if its ok, else false if url should be skipped
     */
    private boolean isDomainOk(URL url) {
        if (url.getProtocol().equals("file"))
            return true; // file protocol always ok
        String host = url.getHost();
        int lastdot = host.lastIndexOf(".");
        if (lastdot <= 0)
            return true;
        String domain = host.substring(lastdot); // just the .com or .edu part
        if (ipDomainList.length == 0)
            return true;
        for (int i = 0; i < ipDomainList.length; i++) {
            if (ipDomainList[i].equalsIgnoreCase("<any>"))
                return true;
            if (ipDomainList[i].equalsIgnoreCase(domain))
                return true;
        }
        return false;
    }

    /**
     * update statistics label
     */
    private void updateStats() {
        statsLabel.setText("Sites searched : " + sitesSearched + " Sites found : " + sitesFound);
    }

    /**
     * repairs a sloppy href, flips backwards /, adds missing /
     * @param href web site reference
     * @return repaired web page reference
     */
    public static String fixHref(String href) {
        String newhref = href.replace('\\', '/'); // fix sloppy web references
        int lastdot = newhref.lastIndexOf('.');
        int lastslash = newhref.lastIndexOf('/');
        if (lastslash > lastdot) {
            if (newhref.charAt(newhref.length() - 1) != '/')
                newhref = newhref + "/"; // add on missing /
        }
        return newhref;
    }

    /**
     * recursive routine to search the web
     * @param parentnode parent node in the search tree
     * @param urlstr web page address to search
     */
    public void searchWeb(DefaultMutableTreeNode parentnode, String urlstr) {
        if (urlHasBeenVisited(urlstr)) // have we been here?
            return;                    // yes, just return
        if (depthLimitExceeded(parentnode))
            return;
        if (sitesSearched > siteLimit)
            return;
        yield();                       // allow the main program to run
        if (stopSearch)
            return;

        messageArea.append("Searching :" + urlstr + " \n");
        sitesSearched++;
        updateStats();
        //
        // now look in the file
        //
        try {
            URL url = new URL(urlstr);           // create the url object from a string.
            String protocol = url.getProtocol(); // ask the url for its protocol
            if (!protocol.equalsIgnoreCase("http") && !protocol.equalsIgnoreCase("file")) {
                messageArea.append(" Skipping : " + urlstr + " not a http site\n\n");
                return;
            }
            String path = url.getPath();         // ask the url for its path
            int lastdot = path.lastIndexOf("."); // check for file extension
            if (lastdot > 0) {
                String extension = path.substring(lastdot); // just the file extension
                if (!extension.equalsIgnoreCase(".html") && !extension.equalsIgnoreCase(".htm"))
                    return; // skip everything but html files
            }
            if (!isDomainOk(url)) {
                messageArea.append(" Skipping : " + urlstr + " not in domain list\n\n");
                return;
            }
            UrlTreeNode newnode = new UrlTreeNode(url);        // create the node
            InputStream in = url.openStream();                 // ask the url object to create an input stream
            InputStreamReader isr = new InputStreamReader(in); // convert the stream to a reader
            DefaultMutableTreeNode treenode = addNode(parentnode, newnode);
            SpiderParserCallback cb = new SpiderParserCallback(treenode); // create a callback object
            ParserDelegator pd = new ParserDelegator();                   // create the delegator
            pd.parse(isr, cb, true);                                      // parse the stream
            isr.close();                                                  // close the stream
        } // end try
        catch (MalformedURLException ex) {
            messageArea.append(" Bad URL encountered : " + urlstr + "\n\n");
        }
        catch (IOException e) {
            messageArea.append(" IOException, could not access site : " + e.getMessage() + "\n\n");
        }
        yield();
        return;
    }

    /**
     * Stops the search.
     */
    public void stopSearch() {
        stopSearch = true;
    }

    /**
     * Inner class used to handle html parser callbacks
     */
    public class SpiderParserCallback extends HTMLEditorKit.ParserCallback {

        /** url node being parsed */
        private UrlTreeNode node;
        /** tree node */
        private DefaultMutableTreeNode treenode;
        /** contents of last text element */
        private String lastText = "";

        /**
         * Creates a new instance of SpiderParserCallback
         * @param atreenode search tree node that is being parsed
         */
        public SpiderParserCallback(DefaultMutableTreeNode atreenode) {
            treenode = atreenode;
            node = (UrlTreeNode) treenode.getUserObject();
        }

        /**
         * handle HTML tags that don't have a start and end tag
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.IMG)) {
                node.addImages(1);
                return;
            }
            if (t.equals(HTML.Tag.BASE)) {
                Object value = a.getAttribute(HTML.Attribute.HREF);
                if (value != null)
                    node.setBase(fixHref(value.toString()));
            }
        }

        /**
         * take care of start tags
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.TITLE)) {
                lastText = "";
                return;
            }
            if (t.equals(HTML.Tag.A)) {
                Object value = a.getAttribute(HTML.Attribute.HREF);
                if (value != null) {
                    node.addLinks(1);
                    String href = value.toString();
                    href = fixHref(href);
                    try {
                        URL referencedURL = new URL(node.getBase(), href);
                        searchWeb(treenode,
                                  referencedURL.getProtocol() + "://" + referencedURL.getHost() + referencedURL.getPath());
                    }
                    catch (MalformedURLException e) {
                        messageArea.append(" Bad URL encountered : " + href + "\n\n");
                        return;
                    }
                }
            }
        }

        /**
         * take care of end tags
         * @param t HTML tag
         * @param pos Position within file
         */
        public void handleEndTag(HTML.Tag t, int pos) {
            if (t.equals(HTML.Tag.TITLE) && lastText != null) {
                node.setTitle(lastText.trim());
                DefaultTreeModel tm = (DefaultTreeModel) searchTree.getModel();
                tm.nodeChanged(treenode);
            }
        }

        /**
         * take care of text between tags, check against keyword list for matches;
         * if a match is found, set the node match status to true
         * @param data Text between tags
         * @param pos position of text within web page
         */
        public void handleText(char[] data, int pos) {
            lastText = new String(data);
            node.addChars(lastText.length());
            String text = lastText.toUpperCase();
            for (int i = 0; i < keywordList.length; i++) {
                if (text.indexOf(keywordList[i]) >= 0) {
                    if (!node.isMatch()) {
                        sitesFound++;
                        updateStats();
                    }
                    node.setMatch(keywordList[i]);
                    return;
                }
            }
        }
    }
}
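Below is a minimal usage sketch, not part of the original listing, showing how the class might be driven from a Swing front end. The frame layout, start URL, keyword list, domain list, and limits are illustrative placeholders, and it assumes the companion UrlTreeNode and UrlNodeRenderer classes from the same example are on the classpath (the Spider constructor uses both).

import java.awt.BorderLayout;
import javax.swing.*;

public class SpiderDemo {
    public static void main(String[] args) {
        // Swing components the spider reports into
        JTree tree = new JTree();
        JTextArea messages = new JTextArea(10, 60);
        JLabel stats = new JLabel(" ");

        JFrame frame = new JFrame("Spider demo"); // illustrative layout only
        frame.getContentPane().add(stats, BorderLayout.NORTH);
        frame.getContentPane().add(new JScrollPane(tree), BorderLayout.CENTER);
        frame.getContentPane().add(new JScrollPane(messages), BorderLayout.SOUTH);
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        frame.pack();
        frame.setVisible(true);

        // Constructor arguments follow the signature in the listing above;
        // the values here are placeholders.
        Spider spider = new Spider(tree, messages, stats,
                "http://www.example.com/index.html",  // start site
                new String[] {"java", "swing"},       // keywords to look for
                new String[] {".com", ".edu"},        // allowed top-level domains
                100,                                  // site limit
                5);                                   // depth limit

        spider.start();          // Spider extends Thread, so this invokes run() on a new thread
        // spider.stopSearch();  // call later to end the crawl early
    }
}

Because Spider extends Thread, start() runs the crawl on a background thread while the Swing interface stays responsive; stopSearch() sets a flag that the crawl checks between pages.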