📄 searchcrawler.java
字号:
crawling = true;
// Perform the actual crawling.
crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
searchString, caseCheckBox.isSelected());
// Turn crawling flag off.
crawling = false;
// Close matches log file.
try {
logFileWriter.close();
} catch (Exception e) {
showError("Unable to close matches log file.");
}
// Mark search as done.
crawlingLabel2.setText("Done");
// Enable search controls.
startTextField.setEnabled(true);
maxComboBox.setEnabled(true);
limitCheckBox.setEnabled(true);
logTextField.setEnabled(true);
searchTextField.setEnabled(true);
caseCheckBox.setEnabled(true);
// Switch search button back to "Search."
searchButton.setText("Search");
// Return to default cursor.
setCursor(Cursor.getDefaultCursor());
// Show message if search string not found.
if (table.getRowCount() == 0) {
JOptionPane.showMessageDialog(SearchCrawler.this,
"Your Search String was not found. Please try another.",
"Search String Not Found",
JOptionPane.WARNING_MESSAGE);
}
}
});
thread.start();
}
// Show dialog box with error message.
// Displays the given text in an error-style message dialog titled "Error",
// passing this component as the dialog's parent.
//   message - text shown in the body of the dialog
private void showError(String message) {
    final String dialogTitle = "Error";
    JOptionPane.showMessageDialog(
        this, message, dialogTitle, JOptionPane.ERROR_MESSAGE);
}
// Update crawling stats.
// Refreshes the status labels and the progress bar.
//   crawling - text placed in the "crawling" label (callers in this file pass
//              the URL being processed); note it shadows the boolean field of
//              the same name inside this method
//   crawled  - number of pages crawled so far
//   toCrawl  - number of URLs still waiting to be crawled
//   maxUrls  - crawl limit, or -1 when no limit was given
private void updateStats(
    String crawling, int crawled, int toCrawl, int maxUrls)
{
    crawlingLabel2.setText(crawling);
    crawledLabel2.setText(String.valueOf(crawled));
    toCrawlLabel2.setText(String.valueOf(toCrawl));

    // Without a limit the bar spans every URL known so far (done + pending);
    // otherwise it spans the requested maximum.
    int progressMaximum = (maxUrls == -1) ? (crawled + toCrawl) : maxUrls;
    progressBar.setMaximum(progressMaximum);
    progressBar.setValue(crawled);

    // The match count is taken from the number of rows in the matches table.
    matchesLabel2.setText(String.valueOf(table.getRowCount()));
}
// Add match to matches table and log file.
// Records a matching URL in two places: as a new one-column row in the
// matches table, and as a line in the matches log file.
//   url - the URL of the page that matched
private void addMatch(String url) {
    // Append the URL to the table's model.
    Object[] matchRow = new Object[] { url };
    ((DefaultTableModel) table.getModel()).addRow(matchRow);

    // Write the URL to the log; a failure is reported in an error dialog
    // and does not undo the table row added above.
    try {
        logFileWriter.println(url);
    } catch (Exception logFailure) {
        showError("Unable to log match.");
    }
}
// Verify URL format.
// Validates a URL string and converts it to a URL object.
//   url - candidate URL text
// Returns the parsed URL, or null when the text does not start with
// "http://" (compared in lowercase) or cannot be parsed as a URL.
private URL verifyUrl(String url) {
    // Only plain HTTP URLs are accepted.
    String lowered = url.toLowerCase();
    boolean isHttp = lowered.startsWith("http://");
    if (!isHttp) {
        return null;
    }

    // Let the URL constructor judge the format; any failure means "invalid".
    try {
        return new URL(url);
    } catch (Exception parseFailure) {
        return null;
    }
}
// Check if robot is allowed to access the given URL.
// Decides whether the crawler may fetch the given URL according to the
// host's robots.txt file.
//   urlToCheck - URL about to be crawled
// Returns false when the URL's file part starts with one of the host's
// disallowed paths, true otherwise. Also returns true when robots.txt cannot
// be read at all (for example because it does not exist).
//
// The parsed disallow list is cached per lowercase host name in
// disallowListCache, so robots.txt is downloaded at most once per host when
// the download succeeds. A failed download is not cached.
//
// NOTE: every line beginning with "Disallow:" is honored, regardless of the
// User-agent section it appears under.
private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();

    // Retrieve host's disallow list from cache.
    ArrayList disallowList =
        (ArrayList) disallowListCache.get(host);

    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
        disallowList = new ArrayList();
        BufferedReader reader = null;
        try {
            URL robotsFileUrl =
                new URL("http://" + host + "/robots.txt");

            // Open connection to robot file URL for reading.
            reader = new BufferedReader(new InputStreamReader(
                robotsFileUrl.openStream()));

            // Read robot file, creating list of disallowed paths.
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf("Disallow:") == 0) {
                    String disallowPath =
                        line.substring("Disallow:".length());

                    // Strip a trailing comment, if present.
                    int commentIndex = disallowPath.indexOf("#");
                    if (commentIndex != -1) {
                        disallowPath =
                            disallowPath.substring(0, commentIndex);
                    }

                    // Remove leading or trailing spaces from disallow path.
                    disallowPath = disallowPath.trim();

                    // An empty "Disallow:" value means nothing is disallowed.
                    // Storing "" would make startsWith("") below reject every
                    // URL on the host, so empty values are skipped.
                    if (disallowPath.length() > 0) {
                        disallowList.add(disallowPath);
                    }
                }
            }

            // Add new disallow list to cache.
            disallowListCache.put(host, disallowList);
        } catch (Exception e) {
            /* Assume robot is allowed since an exception
               is thrown if the robot file doesn't exist. */
            return true;
        } finally {
            // Always release the connection, on success and on failure.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /* Loop through disallow list to see if the
       crawling is allowed for the given URL. */
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
        String disallow = (String) disallowList.get(i);
        if (file.startsWith(disallow)) {
            return false;
        }
    }
    return true;
}
// Download page at given URL.
// Downloads the content at the given URL as text.
//   pageUrl - location to read
// Returns the content with every line terminated by a single '\n', or null
// when the content could not be opened or read.
//
// Line breaks are kept (normalized to '\n') so that tokens split across
// lines, such as "<a" followed by "href=..." on the next line, stay
// separated by whitespace instead of being fused together.
//
// NOTE: bytes are decoded with the platform default charset because no
// charset is passed to InputStreamReader.
private String downloadPage(URL pageUrl) {
    BufferedReader reader = null;
    try {
        // Open connection to URL for reading.
        reader = new BufferedReader(new InputStreamReader(
            pageUrl.openStream()));

        // Read page into buffer, one line at a time.
        StringBuffer pageBuffer = new StringBuffer();
        String line;
        while ((line = reader.readLine()) != null) {
            pageBuffer.append(line).append('\n');
        }
        return pageBuffer.toString();
    } catch (Exception e) {
        // Best effort: callers treat null as "page unavailable".
        return null;
    } finally {
        // Always release the connection, on success and on failure.
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
}
// Remove leading "www" from a URL's host if present.
// Strips the "www." that immediately follows the first "://" in a URL
// string.
//   url - URL text to normalize
// Returns the URL without that "www." prefix, or the input unchanged when
// "://www." does not occur in it.
private String removeWwwFromUrl(String url) {
    final String marker = "://www.";
    int markerAt = url.indexOf(marker);
    if (markerAt == -1) {
        return url;
    }
    // Keep everything up to and including "://", then resume after "www.".
    String throughScheme = url.substring(0, markerAt + 3);
    String afterWww = url.substring(markerAt + marker.length());
    return throughScheme + afterWww;
}
// Parse through page contents and retrieve links.
// Extracts the crawlable links from a downloaded page.
//   pageUrl      - URL the page was downloaded from; relative links are
//                  resolved against it
//   pageContents - text of the page
//   crawledList  - URLs (as strings) already crawled; links found in it are
//                  left out of the result
//   limitHost    - when true, only links whose host equals pageUrl's host
//                  (ignoring case) are kept
// Returns a list of absolute URL strings. Each has had its anchor ("#...")
// and a leading "www." removed and has passed verifyUrl. The list may
// contain duplicates.
private ArrayList retrieveLinks(
    URL pageUrl, String pageContents, HashSet crawledList,
    boolean limitHost)
{
    // Compile link matching pattern. The href value ends at a closing quote
    // or at '>'; '|' is an ordinary character inside a URL.
    Pattern p =
        Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
            Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(pageContents);

    // Create list of link matches.
    ArrayList linkList = new ArrayList();
    while (m.find()) {
        String link = m.group(1).trim();

        // Skip empty links.
        if (link.length() < 1) {
            continue;
        }

        // Skip links that are just page anchors.
        if (link.charAt(0) == '#') {
            continue;
        }

        // Skip mailto and JavaScript links. Only the scheme prefix is
        // tested, so ordinary pages whose path merely contains the word
        // "javascript" are still followed.
        String lowerLink = link.toLowerCase();
        if (lowerLink.startsWith("mailto:")
            || lowerLink.startsWith("javascript:"))
        {
            continue;
        }

        // Resolve the link against the page URL. The URL class handles
        // host-relative ("/x"), document-relative ("x", "../x") and
        // protocol-relative ("//host/x") forms and keeps the page's port.
        try {
            link = new URL(pageUrl, link).toString();
        } catch (Exception e) {
            // Unknown scheme or unparsable link: nothing to crawl.
            continue;
        }

        // Remove anchors from link.
        int index = link.indexOf('#');
        if (index != -1) {
            link = link.substring(0, index);
        }

        // Remove leading "www" from URL's host if present.
        link = removeWwwFromUrl(link);

        // Verify link and skip if invalid.
        URL verifiedLink = verifyUrl(link);
        if (verifiedLink == null) {
            continue;
        }

        /* If specified, limit links to those having
           the same host as the page they were found on. */
        if (limitHost &&
            !pageUrl.getHost().toLowerCase().equals(
                verifiedLink.getHost().toLowerCase()))
        {
            continue;
        }

        // Skip link if it has already been crawled.
        if (crawledList.contains(link)) {
            continue;
        }

        // Add link to list.
        linkList.add(link);
    }
    return linkList;
}
/* Determine whether or not search string is
matched in the given page contents. */
// Tests whether every whitespace-separated term of the search string occurs
// somewhere in the page contents.
//   pageContents  - text to search in
//   searchString  - one or more terms separated by whitespace
//   caseSensitive - when false, page and terms are both lowercased before
//                   comparison
// Returns true only if all terms are found (terms may appear in any order).
private boolean searchStringMatches(
    String pageContents, String searchString,
    boolean caseSensitive)
{
    // For a case-insensitive search the page is compared in lowercase.
    String haystack = pageContents;
    if (!caseSensitive) {
        haystack = pageContents.toLowerCase();
    }

    // Split search string into individual terms.
    String[] terms = Pattern.compile("[\\s]+").split(searchString);

    // A single missing term rules the page out.
    for (int termIndex = 0; termIndex < terms.length; termIndex++) {
        String needle = caseSensitive
            ? terms[termIndex]
            : terms[termIndex].toLowerCase();
        if (haystack.indexOf(needle) == -1) {
            return false;
        }
    }
    return true;
}
// Perform the actual crawling, searching for the search string.
// Crawls pages starting from startUrl, following the links found on each
// page and recording every page that contains the search string.
//   startUrl      - first URL to visit
//   maxUrls       - stop once this many pages have been crawled; -1 means
//                   no limit
//   limitHost     - passed to retrieveLinks to restrict followed links to
//                   the host of the page they were found on
//   searchString  - whitespace-separated terms to look for in each page
//   caseSensitive - whether term matching is case sensitive
// The loop ends when the queue is empty, the limit is reached, or the
// "crawling" field is set to false.
public void crawl(
    String startUrl, int maxUrls, boolean limitHost,
    String searchString, boolean caseSensitive)
{
    // Setup crawl lists. The queue is insertion-ordered and duplicate-free.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();

    // Add start URL to the to crawl list.
    toCrawlList.add(startUrl);

    /* Perform actual crawling by looping
       through the to crawl list. */
    while (crawling && toCrawlList.size() > 0) {
        /* Check to see if the max URL count has
           been reached, if it was specified. */
        if (maxUrls != -1 && crawledList.size() == maxUrls) {
            break;
        }

        // Take the URL at the front of the queue (the oldest entry).
        String url = (String) toCrawlList.iterator().next();
        toCrawlList.remove(url);

        // Convert string url to URL object. verifyUrl returns null for
        // anything that is not a well-formed http:// URL; such entries are
        // dropped rather than passed on, where they would cause a
        // NullPointerException.
        URL verifiedUrl = verifyUrl(url);
        if (verifiedUrl == null) {
            continue;
        }

        // Skip URL if robots are not allowed to access it.
        if (!isRobotAllowed(verifiedUrl)) {
            continue;
        }

        // Update crawling stats.
        updateStats(url, crawledList.size(), toCrawlList.size(),
            maxUrls);

        // Add page to the crawled list.
        crawledList.add(url);

        // Download the page at the given url.
        String pageContents = downloadPage(verifiedUrl);

        /* If the page was downloaded successfully, retrieve all of its
           links and then see if it contains the search string. */
        if (pageContents != null && pageContents.length() > 0) {
            // Retrieve list of valid links from page.
            ArrayList links =
                retrieveLinks(verifiedUrl, pageContents, crawledList,
                    limitHost);

            // Add links to the to crawl list.
            toCrawlList.addAll(links);

            /* Check if search string is present in
               page and if so record a match. */
            if (searchStringMatches(pageContents, searchString,
                caseSensitive))
            {
                addMatch(url);
            }
        }

        // Update crawling stats.
        updateStats(url, crawledList.size(), toCrawlList.size(),
            maxUrls);
    }
}
// Run the Search Crawler.
// Program entry point: creates the crawler window and makes it visible.
//   args - command-line arguments (not used)
public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    // setVisible(true) is the supported replacement for the deprecated show().
    crawler.setVisible(true);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -