📄 spider.java
/*
 * Spider.java
 *
 * Created on September 20, 2004, 9:20 AM
 */
import java.util.*;
import java.io.*;
import java.net.*;
import javax.swing.*;
import javax.swing.tree.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.*;
import javax.swing.text.html.*;
import javax.swing.text.*;

/**
 * Object used to search the web (or a subset of given domains) for a list of keywords
 * @author Mark Pendergast
 */
public class Spider extends Thread {

    /** site visit limit (stops search at some point) */
    private int siteLimit = 100;
    /** search depth limit */
    private int depthLimit = 100;
    /** keyword list for search */
    private String keywordList[];
    /** ip type list */
    private String ipDomainList[];
    /** visited tree */
    private JTree searchTree = null;
    /** message JTextArea, place to post errors */
    private JTextArea messageArea;
    /** place to put search statistics */
    private JLabel statsLabel;
    /** keep track of web sites searched */
    private int sitesSearched = 0;
    /** keep track of web sites found with matching criteria */
    private int sitesFound = 0;
    /** starting site for the search */
    private String startSite;
    /** flag used to stop search */
    private boolean stopSearch = false;

    /**
     * Creates a new instance of Spider
     * @param atree JTree used to display the search space
     * @param amessagearea JTextArea used to display error/warning messages
     * @param astatlabel JLabel to display number of searched sites and hits
     * @param astartsite web site to use to start the search
     * @param akeywordlist list of keywords to search for
     * @param aipdomainlist list of top level domains
     * @param asitelimit maximum number of web pages to look at
     * @param adepthlimit maximum number of levels down to search (controls recursion)
     */
    public Spider(JTree atree, JTextArea amessagearea, JLabel astatlabel, String astartsite,
                  String[] akeywordlist, String[] aipdomainlist, int asitelimit, int adepthlimit) {
        searchTree = atree;            // place to display search tree
        messageArea = amessagearea;    // place to display error messages
        statsLabel = astatlabel;       // place to put run statistics
        startSite = fixHref(astartsite);
        keywordList = new String[akeywordlist.length];
        for (int i = 0; i < akeywordlist.length; i++)
            keywordList[i] = akeywordlist[i].toUpperCase();   // use all upper case for matching
        ipDomainList = new String[aipdomainlist.length];
        for (int i = 0; i < aipdomainlist.length; i++)
            ipDomainList[i] = aipdomainlist[i].toUpperCase(); // use all upper case for matching
        siteLimit = asitelimit;    // max number of sites to look at
        depthLimit = adepthlimit;  // max depth of recursion to use

        DefaultMutableTreeNode root = new DefaultMutableTreeNode(new UrlTreeNode("Root"));
        DefaultTreeModel treeModel = new DefaultTreeModel(root); // create a tree model with a root
        searchTree.setModel(treeModel);
        searchTree.setCellRenderer(new UrlNodeRenderer());       // use a custom cell renderer
    }

    /**
     * start running the search in a new thread
     */
    public void run() {
        DefaultTreeModel treeModel = (DefaultTreeModel) searchTree.getModel(); // get our model
        DefaultMutableTreeNode root = (DefaultMutableTreeNode) treeModel.getRoot();

        String urllc = startSite.toLowerCase();
        if (!urllc.startsWith("http://") && !urllc.startsWith("ftp://") && !urllc.startsWith("www.")) {
            startSite = "file:///" + startSite;  // note you must have 3 slashes !
        }
        else if (urllc.startsWith("www.")) {     // http missing ?
            startSite = "http://" + startSite;   // tack on http://
        }
        startSite = startSite.replace('\\', '/'); // fix bad slashes

        sitesFound = 0;
        sitesSearched = 0;
        updateStats();
        searchWeb(root, startSite); // search the web
        messageArea.append("Done!\n\n");
    }

    /**
     * search the url search tree to see if we've already visited the specified url
     * @param urlstring url to search for
     * @return true if the url is already in the tree
     */
    public boolean urlHasBeenVisited(String urlstring) {
        String teststring = fixHref(urlstring);
        DefaultTreeModel treeModel = (DefaultTreeModel) searchTree.getModel(); // get our model
        DefaultMutableTreeNode root = (DefaultMutableTreeNode) treeModel.getRoot();
        Enumeration etree = root.breadthFirstEnumeration();
        while (etree.hasMoreElements()) {
            UrlTreeNode node = (UrlTreeNode) (((DefaultMutableTreeNode) etree.nextElement()).getUserObject());
            if (node instanceof UrlTreeNode && node.equals(teststring))
                return true;
        }
        return false;
    }

    /**
     * Check depth of search
     * @param node search tree node to test the depth limit of
     * @return true if depth limit exceeded
     */
    public boolean depthLimitExceeded(DefaultMutableTreeNode node) {
        if (node.getLevel() >= depthLimit)
            return true;
        else
            return false;
    }

    /**
     * add a node to the search tree
     * @param parentnode parent to add the new node under
     * @param newnode node to be added to the tree
     */
    private DefaultMutableTreeNode addNode(DefaultMutableTreeNode parentnode, UrlTreeNode newnode) {
        DefaultMutableTreeNode node = new DefaultMutableTreeNode(newnode);
        DefaultTreeModel treeModel = (DefaultTreeModel) searchTree.getModel(); // get our model
        int index = treeModel.getChildCount(parentnode);   // how many children are there already?
        treeModel.insertNodeInto(node, parentnode, index); // add as last child
        TreePath tp = new TreePath(parentnode.getPath());
        searchTree.expandPath(tp); // make sure the user can see the node just added
        return node;
    }

    /**
     * determines if the given url is in one of the top level domains in the domain search list
     * @param url url to be checked
     * @return true if its ok, else false if url should be skipped
     */
    private boolean isDomainOk(URL url) {
        if (url.getProtocol().equals("file"))
            return true; // file protocol always ok
        String host = url.getHost();
        int lastdot = host.lastIndexOf(".");
        if (lastdot <= 0)
            return true;
        String domain = host.substring(lastdot); // just the .com or .edu part
        if (ipDomainList.length == 0)
            return true;
        for (int i = 0; i < ipDomainList.length; i++) {
            if (ipDomainList[i].equalsIgnoreCase("<any>"))
                return true;
            if (ipDomainList[i].equalsIgnoreCase(domain))
                return true;
        }
        return false;
    }

    /**
     * update statistics label
     */
    private void updateStats() {
        statsLabel.setText("Sites searched : " + sitesSearched + " Sites found : " + sitesFound);
    }

    /**
     * repairs a sloppy href, flips backwards /, adds missing /
     * @param href web site reference
     * @return repaired web page reference
     */
    public static String fixHref(String href) {
        String newhref = href.replace('\\', '/'); // fix sloppy web references
        int lastdot = newhref.lastIndexOf('.');
        int lastslash = newhref.lastIndexOf('/');
        if (lastslash > lastdot) {
            if (newhref.charAt(newhref.length() - 1) != '/')
                newhref = newhref + "/"; // add on missing /
        }
        return newhref;
    }

    /**
     * recursive routine to search the web
     * @param parentnode parent node in the search tree
     * @param urlstr web page address to search
     */
    public void searchWeb(DefaultMutableTreeNode parentnode, String urlstr) {
        if (urlHasBeenVisited(urlstr)) // have we been here?
            return;                    // yes, just return
        if (depthLimitExceeded(parentnode))
            return;
        if (sitesSearched > siteLimit)
            return;
        yield();                       // allow the main program to run
        if (stopSearch)
            return;

        messageArea.append("Searching :" + urlstr + " \n");
        sitesSearched++;
        updateStats();
        //
        // now look in the file
        //
        try {
            URL url = new URL(urlstr);           // create the url object from a string.
            String protocol = url.getProtocol(); // ask the url for its protocol
            if (!protocol.equalsIgnoreCase("http") && !protocol.equalsIgnoreCase("file")) {
                messageArea.append(" Skipping : " + urlstr + " not a http site\n\n");
                return;
            }
            String path = url.getPath();         // ask the url for its path
            int lastdot = path.lastIndexOf("."); // check for file extension
            if (lastdot > 0) {
                String extension = path.substring(lastdot); // just the file extension
                if (!extension.equalsIgnoreCase(".html") && !extension.equalsIgnoreCase(".htm"))
                    return; // skip everything but html files
            }
            if (!isDomainOk(url)) {
                messageArea.append(" Skipping : " + urlstr + " not in domain list\n\n");
                return;
            }
            UrlTreeNode newnode = new UrlTreeNode(url);        // create the node
            InputStream in = url.openStream();                 // ask the url object to create an input stream
            InputStreamReader isr = new InputStreamReader(in); // convert the stream to a reader
            DefaultMutableTreeNode treenode = addNode(parentnode, newnode);
            SpiderParserCallback cb = new SpiderParserCallback(treenode); // create a callback object
            ParserDelegator pd = new ParserDelegator();                   // create the delegator
            pd.parse(isr, cb, true);                                      // parse the stream
            isr.close();                                                  // close the stream
        } // end try
        catch (MalformedURLException ex) {
            messageArea.append(" Bad URL encountered : " + urlstr + "\n\n");
        }
        catch (IOException e) {
            messageArea.append(" IOException, could not access site : " + e.getMessage() + "\n\n");
        }
        yield();
        return;
    }

    /**
     * Stops the search.
     */
    public void stopSearch() {
        stopSearch = true;
    }

    /**
     * Inner class used to handle html parser callbacks
     */
    public class SpiderParserCallback extends HTMLEditorKit.ParserCallback {

        /** url node being parsed */
        private UrlTreeNode node;
        /** tree node */
        private DefaultMutableTreeNode treenode;
        /** contents of last text element */
        private String lastText = "";

        /**
         * Creates a new instance of SpiderParserCallback
         * @param atreenode search tree node that is being parsed
         */
        public SpiderParserCallback(DefaultMutableTreeNode atreenode) {
            treenode = atreenode;
            node = (UrlTreeNode) treenode.getUserObject();
        }

        /**
         * handle HTML tags that don't have a start and end tag
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.IMG)) {
                node.addImages(1);
                return;
            }
            if (t.equals(HTML.Tag.BASE)) {
                Object value = a.getAttribute(HTML.Attribute.HREF);
                if (value != null)
                    node.setBase(fixHref(value.toString()));
            }
        }

        /**
         * take care of start tags
         * @param t HTML tag
         * @param a HTML attributes
         * @param pos Position within file
         */
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t.equals(HTML.Tag.TITLE)) {
                lastText = "";
                return;
            }
            if (t.equals(HTML.Tag.A)) {
                Object value = a.getAttribute(HTML.Attribute.HREF);
                if (value != null) {
                    node.addLinks(1);
                    String href = value.toString();
                    href = fixHref(href);
                    try {
                        URL referencedURL = new URL(node.getBase(), href);
                        searchWeb(treenode,
                                  referencedURL.getProtocol() + "://" + referencedURL.getHost() + referencedURL.getPath());
                    }
                    catch (MalformedURLException e) {
                        messageArea.append(" Bad URL encountered : " + href + "\n\n");
                        return;
                    }
                }
            }
        }

        /**
         * take care of end tags
         * @param t HTML tag
         * @param pos Position within file
         */
        public void handleEndTag(HTML.Tag t, int pos) {
            if (t.equals(HTML.Tag.TITLE) && lastText != null) {
                node.setTitle(lastText.trim());
                DefaultTreeModel tm = (DefaultTreeModel) searchTree.getModel();
                tm.nodeChanged(treenode);
            }
        }

        /**
         * take care of text between tags, check against keyword list for matches;
         * if a match is found, set the node match status to true
         * @param data Text between tags
         * @param pos position of text within web page
         */
        public void handleText(char[] data, int pos) {
            lastText = new String(data);
            node.addChars(lastText.length());
            String text = lastText.toUpperCase();
            for (int i = 0; i < keywordList.length; i++) {
                if (text.indexOf(keywordList[i]) >= 0) {
                    if (!node.isMatch()) {
                        sitesFound++;
                        updateStats();
                    }
                    node.setMatch(keywordList[i]);
                    return;
                }
            }
        }
    }
}
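Below is a minimal usage sketch, not part of the original listing, showing how the class might be driven from a Swing front end. The frame layout, start URL, keyword list, domain list, and limits are illustrative placeholders, and it assumes the companion UrlTreeNode and UrlNodeRenderer classes from the same example are on the classpath (the Spider constructor uses both).

import java.awt.BorderLayout;
import javax.swing.*;

public class SpiderDemo {
    public static void main(String[] args) {
        // Swing components the spider reports into
        JTree tree = new JTree();
        JTextArea messages = new JTextArea(10, 60);
        JLabel stats = new JLabel(" ");

        JFrame frame = new JFrame("Spider demo"); // illustrative layout only
        frame.getContentPane().add(stats, BorderLayout.NORTH);
        frame.getContentPane().add(new JScrollPane(tree), BorderLayout.CENTER);
        frame.getContentPane().add(new JScrollPane(messages), BorderLayout.SOUTH);
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        frame.pack();
        frame.setVisible(true);

        // Constructor arguments follow the signature in the listing above;
        // the values here are placeholders.
        Spider spider = new Spider(tree, messages, stats,
                "http://www.example.com/index.html",  // start site
                new String[] {"java", "swing"},       // keywords to look for
                new String[] {".com", ".edu"},        // allowed top-level domains
                100,                                  // site limit
                5);                                   // depth limit

        spider.start();          // Spider extends Thread, so this invokes run() on a new thread
        // spider.stopSearch();  // call later to end the crawl early
    }
}

Because Spider extends Thread, start() runs the crawl on a background thread while the Swing interface stays responsive; stopSearch() sets a flag that the crawl checks between pages.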