📄 searchcrawler.java

📁 这是Java编程艺术一书附带的源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
        crawling = true;

        // Perform the actual crawling.
        crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
          searchString, caseCheckBox.isSelected());

        // Turn crawling flag off.
        crawling = false;

        // Close matches log file.
        try {
          logFileWriter.close();
        } catch (Exception e) {
          showError("Unable to close matches log file.");
        }

        // Mark search as done.
        crawlingLabel2.setText("Done");

        // Enable search controls.
        startTextField.setEnabled(true);
        maxComboBox.setEnabled(true);
        limitCheckBox.setEnabled(true);
        logTextField.setEnabled(true);
        searchTextField.setEnabled(true);
        caseCheckBox.setEnabled(true);
        
        // Switch search button back to "Search."
        searchButton.setText("Search");

        // Return to default cursor.
        setCursor(Cursor.getDefaultCursor());

        // Show message if search string not found.
        if (table.getRowCount() == 0) {
          JOptionPane.showMessageDialog(SearchCrawler.this,
            "Your Search String was not found. Please try another.",
            "Search String Not Found",
            JOptionPane.WARNING_MESSAGE);
        }
      }
    });
    thread.start();
  }

  /**
   * Pops up an error dialog titled "Error", using this component as the
   * dialog's parent.
   *
   * @param message text displayed in the body of the dialog
   */
  private void showError(String message) {
    final String title = "Error";
    final int messageType = JOptionPane.ERROR_MESSAGE;
    JOptionPane.showMessageDialog(this, message, title, messageType);
  }

  /**
   * Refreshes the statistics widgets with the latest crawl figures.
   *
   * @param crawling text placed in the "crawling" label
   * @param crawled  number of URLs crawled so far; also the progress value
   * @param toCrawl  number of URLs still waiting to be crawled
   * @param maxUrls  progress-bar maximum, or -1 to use crawled + toCrawl
   */
  private void updateStats(
    String crawling, int crawled, int toCrawl, int maxUrls)
  {
    // Text labels.
    crawlingLabel2.setText(crawling);
    crawledLabel2.setText(String.valueOf(crawled));
    toCrawlLabel2.setText(String.valueOf(toCrawl));

    // Progress bar: explicit limit when one was given, otherwise
    // everything known about so far.
    int upperBound = (maxUrls != -1) ? maxUrls : crawled + toCrawl;
    progressBar.setMaximum(upperBound);
    progressBar.setValue(crawled);

    // Match count mirrors the number of rows in the matches table.
    matchesLabel2.setText(String.valueOf(table.getRowCount()));
  }

  /**
   * Records a matching URL: appends it as a new single-cell row of the
   * matches table, then writes it as a line to the matches log. Any
   * exception raised while logging is reported through showError.
   *
   * @param url the URL of the page that matched
   */
  private void addMatch(String url) {
    // Table first ...
    Object[] row = new Object[] { url };
    ((DefaultTableModel) table.getModel()).addRow(row);

    // ... then the log file.
    try {
      logFileWriter.println(url);
    } catch (Exception logFailure) {
      showError("Unable to log match.");
    }
  }

  /**
   * Converts a string into a URL object, accepting only strings that
   * begin with "http://" (compared after lower-casing).
   *
   * @param url candidate URL text
   * @return the parsed URL, or null when the prefix is not "http://" or
   *         the URL constructor rejects the text
   */
  private URL verifyUrl(String url) {
    boolean isHttp = url.toLowerCase().startsWith("http://");
    if (isHttp) {
      try {
        return new URL(url);
      } catch (Exception malformed) {
        // Rejected by the URL constructor; fall through to null.
      }
    }

    return null;
  }

  /**
   * Checks the host's robots.txt to decide whether a URL may be crawled.
   *
   * The list of disallowed path prefixes is looked up in
   * disallowListCache by lower-cased host name. On a cache miss,
   * http://HOST/robots.txt is downloaded, every "Disallow:" line in it
   * is collected (regardless of which User-agent section it is in) and
   * the resulting list is cached. If the download or parsing fails the
   * URL is treated as allowed and nothing is cached.
   *
   * @param urlToCheck the URL about to be crawled
   * @return false when the URL's file part starts with one of the
   *         host's disallowed prefixes, true otherwise
   */
  private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();

    // Retrieve host's disallow list from cache.
    ArrayList disallowList =
      (ArrayList) disallowListCache.get(host);

    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
      disallowList = new ArrayList();
      final String directive = "Disallow:";

      BufferedReader reader = null;
      try {
        URL robotsFileUrl =
          new URL("http://" + host + "/robots.txt");

        // Open connection to robot file URL for reading.
        reader = new BufferedReader(new InputStreamReader(
          robotsFileUrl.openStream()));

        // Read robot file, creating list of disallowed paths.
        String line;
        while ((line = reader.readLine()) != null) {
          // Directive names are case-insensitive and may be indented.
          line = line.trim();
          if (!line.regionMatches(
                true, 0, directive, 0, directive.length()))
          {
            continue;
          }

          String disallowPath = line.substring(directive.length());

          // Check disallow path for comments and remove if present.
          int commentIndex = disallowPath.indexOf("#");
          if (commentIndex != -1) {
            disallowPath = disallowPath.substring(0, commentIndex);
          }

          // Remove leading or trailing spaces from disallow path.
          disallowPath = disallowPath.trim();

          /* An empty Disallow value means "nothing is disallowed".
             Storing "" would make every path match in the startsWith
             test below and wrongly block the whole host. */
          if (disallowPath.length() > 0) {
            disallowList.add(disallowPath);
          }
        }

        // Add new disallow list to cache.
        disallowListCache.put(host, disallowList);
      }
      catch (Exception e) {
        /* Assume robot is allowed since an exception
           is thrown if the robot file doesn't exist. */
        return true;
      }
      finally {
        // Always release the connection, on success and on failure.
        if (reader != null) {
          try {
            reader.close();
          } catch (Exception ignored) {
            // Nothing useful can be done if closing fails.
          }
        }
      }
    }

    /* Loop through disallow list to see if the
       crawling is allowed for the given URL. */
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
      String disallow = (String) disallowList.get(i);
      if (file.startsWith(disallow)) {
        return false;
      }
    }

    return true;
  }

  /**
   * Downloads the text found at a URL.
   *
   * The content is read line by line and each line is terminated with
   * '\n' in the result, so that tokens on adjacent lines (for example
   * "<a" followed by "href=" on the next line) are not fused together.
   * The stream is closed before returning, on success and on failure.
   *
   * @param pageUrl location of the page to read
   * @return the page text, or null if any exception occurs while
   *         opening or reading the stream
   */
  private String downloadPage(URL pageUrl) {
    BufferedReader reader = null;
    try {
      // Open connection to URL for reading.
      reader = new BufferedReader(new InputStreamReader(
        pageUrl.openStream()));

      // Read page into buffer, keeping the line boundaries.
      String line;
      StringBuffer pageBuffer = new StringBuffer();
      while ((line = reader.readLine()) != null) {
        pageBuffer.append(line).append('\n');
      }

      return pageBuffer.toString();
    } catch (Exception e) {
      // Best effort: an unreadable page is reported to the caller as null.
      return null;
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (Exception ignored) {
          // Nothing useful can be done if closing fails.
        }
      }
    }
  }

  /**
   * Strips the "www." that directly follows the first "://" occurrence
   * matching "://www." in the given text.
   *
   * @param url URL text to normalize
   * @return the text without that "www.", or the argument itself when
   *         "://www." does not occur in it
   */
  private String removeWwwFromUrl(String url) {
    final String marker = "://www.";
    int at = url.indexOf(marker);
    if (at < 0) {
      return url;
    }

    // Keep everything through "://", then resume after "www.".
    String head = url.substring(0, at + "://".length());
    String tail = url.substring(at + marker.length());
    return head + tail;
  }

  /**
   * Extracts the crawlable links from a page.
   *
   * Every href of an anchor tag whose first attribute is href is
   * considered. Empty links, pure "#anchor" links, and mailto: and
   * javascript: links are dropped. Links without "://" are resolved
   * against the page's host, port and directory. Fragments are cut off,
   * a leading "www." is removed, and the result must pass verifyUrl.
   *
   * @param pageUrl      URL the page was downloaded from
   * @param pageContents text of the page
   * @param crawledList  URL strings already crawled; links found in it
   *                     are not returned
   * @param limitHost    when true, only links whose host equals the
   *                     page's host (ignoring case) are returned
   * @return list of link strings, in the order found (may hold duplicates)
   */
  private ArrayList retrieveLinks(
    URL pageUrl, String pageContents, HashSet crawledList,
    boolean limitHost)
  {
    /* Compile link matching pattern. The href value ends at a closing
       quote or at '>'. (Inside a character class '|' is a literal, so
       it must not be listed there.) */
    Pattern p =
      Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
        Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(pageContents);

    // Scheme, host and (non-default) port used to complete partial links.
    String pageHost = pageUrl.getHost();
    String origin = "http://" + pageHost;
    int port = pageUrl.getPort();
    if (port != -1 && port != pageUrl.getDefaultPort()) {
      origin = origin + ":" + port;
    }

    /* Directory of the page, used for relative links. Taken from the
       path only, so that a '/' inside the query string cannot be
       mistaken for a directory separator. */
    String pagePath = pageUrl.getPath();
    String baseDir = "/";
    int lastSlash = pagePath.lastIndexOf('/');
    if (lastSlash != -1) {
      baseDir = pagePath.substring(0, lastSlash + 1);
    }

    // Create list of link matches.
    ArrayList linkList = new ArrayList();
    while (m.find()) {
      String link = m.group(1).trim();

      // Skip empty links.
      if (link.length() < 1) {
        continue;
      }

      // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
        continue;
      }

      /* Skip mailto and JavaScript links. Only the scheme prefix is
         tested, so ordinary pages that merely contain one of these
         words in their address are still crawled. */
      String lowerLink = link.toLowerCase();
      if (lowerLink.startsWith("mailto:") ||
          lowerLink.startsWith("javascript:"))
      {
        continue;
      }

      // Prefix absolute and relative URLs if necessary.
      if (link.indexOf("://") == -1) {
        if (link.charAt(0) == '/') {
          // Host-absolute path.
          link = origin + link;
        } else {
          // Path relative to the page's directory.
          link = origin + baseDir + link;
        }
      }

      // Remove anchors from link.
      int index = link.indexOf('#');
      if (index != -1) {
        link = link.substring(0, index);
      }

      // Remove leading "www" from URL's host if present.
      link = removeWwwFromUrl(link);

      // Verify link and skip if invalid.
      URL verifiedLink = verifyUrl(link);
      if (verifiedLink == null) {
        continue;
      }

      /* If specified, limit links to those
         having the same host as the start URL. */
      if (limitHost &&
          !pageHost.toLowerCase().equals(
            verifiedLink.getHost().toLowerCase()))
      {
        continue;
      }

      // Skip link if it has already been crawled.
      if (crawledList.contains(link)) {
        continue;
      }

      // Add link to list.
      linkList.add(link);
    }

    return linkList;
  }

  /**
   * Tells whether a page contains every term of the search string.
   *
   * The search string is split on runs of whitespace; each resulting
   * term must occur somewhere in the page as a substring. For a
   * case-insensitive search both the page and the terms are lower-cased
   * before comparing.
   *
   * @param pageContents  text of the page
   * @param searchString  whitespace-separated search terms
   * @param caseSensitive true to compare exactly, false to ignore case
   * @return true if all terms were found, false as soon as one is missing
   */
  private boolean searchStringMatches(
    String pageContents, String searchString,
    boolean caseSensitive)
  {
    // Fold the page to lower case once when case is to be ignored.
    String haystack =
      caseSensitive ? pageContents : pageContents.toLowerCase();

    // One entry per whitespace-delimited term.
    String[] words = Pattern.compile("[\\s]+").split(searchString);

    for (int k = 0; k < words.length; k++) {
      String needle = caseSensitive ? words[k] : words[k].toLowerCase();
      if (haystack.indexOf(needle) < 0) {
        return false;
      }
    }

    return true;
  }

  /**
   * Crawls outward from a start URL, recording every page that
   * contains the search string.
   *
   * URLs are taken from the front of an insertion-ordered queue. Each
   * one is checked against robots.txt, downloaded, scanned for new
   * links (which are queued) and tested against the search string;
   * matches are passed to addMatch. The loop ends when the queue is
   * empty, when the crawling flag is cleared, or when maxUrls pages
   * have been crawled.
   *
   * @param startUrl      URL text to start from
   * @param maxUrls       maximum number of pages to crawl, or -1 for
   *                      no limit
   * @param limitHost     passed to retrieveLinks to keep the crawl on
   *                      the page's own host
   * @param searchString  whitespace-separated terms to look for
   * @param caseSensitive whether the term comparison respects case
   */
  public void crawl(
    String startUrl, int maxUrls, boolean limitHost,
    String searchString, boolean caseSensitive)
  {
    // Setup crawl lists.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();

    // Add start URL to the to crawl list.
    toCrawlList.add(startUrl);

    /* Perform actual crawling by looping
       through the to crawl list. */
    while (crawling && toCrawlList.size() > 0)
    {
      /* Check to see if the max URL count has
         been reached, if it was specified. */
      if (maxUrls != -1) {
        if (crawledList.size() == maxUrls) {
          break;
        }
      }

      // Get the URL at the front of the list (oldest entry first).
      String url = (String) toCrawlList.iterator().next();

      // Remove URL from the to crawl list.
      toCrawlList.remove(url);

      // Convert string url to URL object.
      URL verifiedUrl = verifyUrl(url);

      /* verifyUrl returns null for anything it rejects (for example a
         malformed start URL). Skip such entries instead of passing
         null on, which would end the crawl with a NullPointerException. */
      if (verifiedUrl == null) {
        continue;
      }

      // Skip URL if robots are not allowed to access it.
      if (!isRobotAllowed(verifiedUrl)) {
        continue;
      }

      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(),
        maxUrls);

      // Add page to the crawled list.
      crawledList.add(url);

      // Download the page at the given url.
      String pageContents = downloadPage(verifiedUrl);

      /* If the page was downloaded successfully, retrieve all of its
         links and then see if it contains the search string. */
      if (pageContents != null && pageContents.length() > 0)
      {
        // Retrieve list of valid links from page.
        ArrayList links =
          retrieveLinks(verifiedUrl, pageContents, crawledList,
            limitHost);

        // Add links to the to crawl list.
        toCrawlList.addAll(links);

        /* Check if search string is present in
           page and if so record a match. */
        if (searchStringMatches(pageContents, searchString,
             caseSensitive))
        {
          addMatch(url);
        }
      }

      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(),
        maxUrls);
    }
  }

  /**
   * Program entry point: creates the Search Crawler and makes it visible.
   *
   * @param args command-line arguments (not used)
   */
  public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    // setVisible(true) is the replacement for the deprecated show().
    crawler.setVisible(true);
  }
}
上一页 12
💿 文件大小 66 K
👤 上传用户 wanghaihah
📂 所属分类 Java编程
🏷️ 相关标签

#Java #编程 #源代码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -