📄 webspider.java~
字号:
import javax.swing.*;
import java.net.*;
import java.io.*;
import javax.swing.text.html.*;
import java.util.*;

/*
 * WebSpider.java
 *
 * Created on March 31, 2004, 9:01 AM
 */

/**
 * Small Swing demo of a recursive web spider.
 *
 * <p>The user enters a start URL (or picks a local file) and a keyword. The
 * spider fetches the page, splits it into tag / text tokens, records the page
 * when any token contains the keyword, and follows every {@code href} it
 * finds, up to {@link #MAX_SITES} pages and {@link #MAX_DEPTH} levels.
 *
 * <p>NOTE(review): the crawl runs synchronously inside the button's action
 * listener, so the window does not repaint until the crawl has finished.
 *
 * @author mpenderg
 */
public class WebSpider extends javax.swing.JFrame {

    /** Maximum number of pages fetched by one search. */
    private static final int MAX_SITES = 30;

    /** Maximum recursion depth; one {@code ~} of indent is added per level. */
    private static final int MAX_DEPTH = 10;

    /** Normalised URLs that have already been handed to the spider. */
    Vector visited = new Vector(300, 10);

    /** URLs of the pages in which the keyword was found. */
    Vector matchList = new Vector(300, 10);

    /** Creates new form WebSpider */
    public WebSpider() {
        initComponents();
        setTitle("Demo Spider");
        setSize(800, 600);
    }

    /** This method is called from within the constructor to
     * initialize the form.
     * WARNING: Do NOT modify this code. The content of this method is
     * always regenerated by the Form Editor.
     */
    private void initComponents() {//GEN-BEGIN:initComponents
        centerPanel = new javax.swing.JScrollPane();
        resultArea = new javax.swing.JTextArea();
        northPanel = new javax.swing.JPanel();
        urlLabel = new javax.swing.JLabel();
        urlField = new javax.swing.JTextField();
        browseButton = new javax.swing.JButton();
        keywordLabel = new javax.swing.JLabel();
        keywordField = new javax.swing.JTextField();
        searchButton = new javax.swing.JButton();
        errorLabel = new javax.swing.JLabel();

        addWindowListener(new java.awt.event.WindowAdapter() {
            public void windowClosing(java.awt.event.WindowEvent evt) {
                exitForm(evt);
            }
        });

        centerPanel.setViewportView(resultArea);

        getContentPane().add(centerPanel, java.awt.BorderLayout.CENTER);

        northPanel.setLayout(new java.awt.FlowLayout(java.awt.FlowLayout.LEFT));

        urlLabel.setFont(new java.awt.Font("Dialog", 1, 10));
        urlLabel.setText("URL: ");
        northPanel.add(urlLabel);

        urlField.setColumns(20);
        urlField.setFont(new java.awt.Font("Dialog", 1, 10));
        northPanel.add(urlField);

        browseButton.setFont(new java.awt.Font("Dialog", 1, 10));
        browseButton.setText("Browse");
        browseButton.setMargin(new java.awt.Insets(2, 2, 2, 2));
        browseButton.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                browseButtonActionPerformed(evt);
            }
        });

        northPanel.add(browseButton);

        keywordLabel.setFont(new java.awt.Font("Dialog", 1, 10));
        keywordLabel.setText("Keyword: ");
        northPanel.add(keywordLabel);

        keywordField.setColumns(15);
        keywordField.setFont(new java.awt.Font("Dialog", 1, 10));
        northPanel.add(keywordField);

        searchButton.setFont(new java.awt.Font("Dialog", 1, 10));
        searchButton.setText("Search");
        searchButton.setMargin(new java.awt.Insets(2, 2, 2, 2));
        searchButton.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                searchButtonActionPerformed(evt);
            }
        });

        northPanel.add(searchButton);

        getContentPane().add(northPanel, java.awt.BorderLayout.NORTH);

        errorLabel.setHorizontalAlignment(javax.swing.SwingConstants.LEFT);
        errorLabel.setText(" ");
        getContentPane().add(errorLabel, java.awt.BorderLayout.SOUTH);

        pack();
    }//GEN-END:initComponents

    /**
     * Tells whether a tag token is an anchor ({@code a} or {@code area})
     * carrying an {@code href=} attribute.
     *
     * @param str tag text without the angle brackets, starting at the tag name
     * @return {@code true} when the tag name is {@code a} or {@code area} and
     *         an equal sign follows the word {@code href}
     */
    public boolean isHref(String str) {
        return LinkParser.isAnchorHref(str);
    }

    /**
     * Extracts the value of the {@code href} attribute from a tag token.
     *
     * @param str tag text without the angle brackets
     * @return the attribute value with surrounding quotes removed, or
     *         {@code null} when there is no equal sign or nothing follows it
     */
    public String getHrefLocation(String str) {
        return LinkParser.hrefLocation(str);
    }

    /**
     * Turns a link found on a page into an absolute URL string.
     *
     * @param url      URL of the page the link was found on
     * @param location link text as written in the page
     * @return the absolute form of {@code location} with any {@code #anchor}
     *         removed
     */
    public String resolveHTTPReference(URL url, String location) {
        return LinkParser.resolve(url, location);
    }

    /**
     * Records a URL as visited and reports whether it had been seen before.
     * URLs are compared trimmed, lower-cased, with backslashes turned into
     * slashes and with a trailing slash appended.
     *
     * @param urlstring URL to look up and remember
     * @return {@code true} when the URL was already recorded, {@code false}
     *         when this call recorded it
     */
    public boolean urlHasBeenVisited(String urlstring) {
        String key = urlstring.trim().toLowerCase(Locale.ENGLISH).replace('\\', '/');
        if (!key.endsWith("/")) {
            key = key + "/";
        }
        if (visited.contains(key)) {
            return true;
        }
        visited.add(key);
        return false;
    }

    /**
     * Recursive routine to search the web. Fetches {@code urlstr}, notes it in
     * {@link #matchList} when the keyword occurs in it, and recurses into
     * every link found on the page.
     *
     * @param urlstr  absolute URL of the page to fetch
     * @param indent  prefix printed before the URL; its length is the depth
     * @param keyword text to look for (compared case-insensitively)
     */
    private void searchWeb(String urlstr, String indent, String keyword) {
        if (urlHasBeenVisited(urlstr)) {
            return;
        }
        // The URL has just been added to 'visited', so the size equals the
        // number of distinct URLs seen so far, this one included.
        if (visited.size() > MAX_SITES) {
            return;
        }

        resultArea.append(indent + urlstr + "\n");

        Reader reader = null;
        try {
            boolean matchfound = false;
            keyword = keyword.toLowerCase();
            URL url = new URL(urlstr);
            System.out.println(urlstr);
            reader = new BufferedReader(new InputStreamReader(url.openStream()));

            // Treat '<' and '>' as separators so that every tag body and every
            // run of text between tags comes back as one word token.
            StreamTokenizer stok = new StreamTokenizer(reader);
            stok.resetSyntax();
            stok.wordChars(' ', '~');
            stok.whitespaceChars('<', '<');
            stok.whitespaceChars('>', '>');

            while (stok.nextToken() != StreamTokenizer.TT_EOF) {
                if (stok.ttype != StreamTokenizer.TT_WORD) {
                    continue;
                }
                if (stok.sval.toLowerCase().indexOf(keyword) >= 0) {
                    matchfound = true;
                }
                String str = stok.sval.trim();
                if (!isHref(str)) {
                    continue;
                }
                String location = getHrefLocation(str);
                // An empty link or a bare "#anchor" points back at this page.
                if (location == null || location.length() == 0 || location.startsWith("#")) {
                    continue;
                }
                System.out.println(str + " [[" + location + "]]"); // output for demo purposes
                location = resolveHTTPReference(url, location);
                System.out.println(" resolved url = " + location);
                if (indent.length() < MAX_DEPTH) {
                    searchWeb(location, indent + "~", keyword);
                }
            }

            if (matchfound) {
                matchList.add(urlstr);
            }
        } catch (MalformedURLException ex) {
            resultArea.append(indent + "Bad URL : " + urlstr + "\n");
        } catch (IOException e) {
            resultArea.append(indent + "IOException : " + e.getMessage() + "\n");
        } finally {
            // Close the stream even when reading failed half way through.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done about a failed close
                }
            }
        }
    }

    /**
     * Starts a search from the URL and keyword typed into the form and lists
     * the pages that contained the keyword.
     */
    private void searchButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_searchButtonActionPerformed
        String urlstr = urlField.getText().trim();
        if (urlstr.length() == 0) {
            return;
        }
        urlstr = LinkParser.normalizeStartUrl(urlstr);

        visited.removeAllElements();
        matchList.removeAllElements();
        resultArea.setText("Searching " + urlstr + " for " + keywordField.getText() + "\n");
        searchWeb(urlstr, "", keywordField.getText());
        resultArea.append("\n\nMatches found @ \n");
        for (int i = 0; i < matchList.size(); i++) {
            resultArea.append(matchList.get(i).toString() + "\n");
        }
    }//GEN-LAST:event_searchButtonActionPerformed

    /** Lets the user pick a local file and copies its path into the URL field. */
    private void browseButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_browseButtonActionPerformed
        JFileChooser jc = new JFileChooser();
        int result = jc.showOpenDialog(this);
        if (result == JFileChooser.APPROVE_OPTION) {
            File f = jc.getSelectedFile();
            urlField.setText(f.getPath());
        }
    }//GEN-LAST:event_browseButtonActionPerformed

    /** Exit the Application */
    private void exitForm(java.awt.event.WindowEvent evt) {//GEN-FIRST:event_exitForm
        System.exit(0);
    }//GEN-LAST:event_exitForm

    /**
     * @param args the command line arguments
     */
    public static void main(String args[]) {
        // Swing components are created and shown on the event dispatch thread.
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                new WebSpider().setVisible(true);
            }
        });
    }

    /**
     * Pure string helpers used by the spider. They are kept apart from the
     * frame so they can be called without creating any Swing component.
     */
    static final class LinkParser {

        private LinkParser() {
        }

        /** See {@link WebSpider#isHref(String)}. */
        static boolean isAnchorHref(String str) {
            if (str == null) {
                return false;
            }
            int nameEnd = 0;
            while (nameEnd < str.length() && !Character.isWhitespace(str.charAt(nameEnd))) {
                nameEnd++;
            }
            String tagName = str.substring(0, nameEnd);
            if (!tagName.equalsIgnoreCase("a") && !tagName.equalsIgnoreCase("area")) {
                return false;
            }
            return equalsAfterHref(str) >= 0;
        }

        /** See {@link WebSpider#getHrefLocation(String)}. */
        static String hrefLocation(String str) {
            if (str == null) {
                return null;
            }
            // Use the '=' that belongs to href; only when the text has no
            // "href" at all fall back to the first '=' in the string.
            int equalIndex = indexOfIgnoreCase(str, "href") >= 0
                    ? equalsAfterHref(str)
                    : str.indexOf('=');
            if (equalIndex < 0) {
                return null;
            }

            int start = equalIndex + 1;
            while (start < str.length() && str.charAt(start) == ' ') {
                start++;
            }
            if (start >= str.length()) {
                return null;
            }

            int end;
            char first = str.charAt(start);
            if (first == '"' || first == '\'') {
                end = str.indexOf(first, start + 1); // matching closing quote
                start++;                             // skip the opening quote
            } else {
                end = str.indexOf(' ', start + 1);   // unquoted: ends at next space
            }
            if (end < 0) {
                end = str.length();
            }
            return str.substring(start, end).trim();
        }

        /** See {@link WebSpider#resolveHTTPReference(URL, String)}. */
        static String resolve(URL url, String location) {
            String resolved;
            if (location.length() > 1 && location.charAt(1) == ':') {
                // drive letter such as C: -> fully qualified local path
                resolved = "file:///" + location;
            } else if (startsWithIgnoreCase(location, "www.")) {
                resolved = "http://" + location;
            } else if (hasScheme(location)) {
                resolved = location;
            } else if (location.startsWith("//")) {
                // protocol-relative link: reuse the page's protocol
                resolved = url.getProtocol() + ":" + location;
            } else {
                String authority = url.getAuthority(); // host plus port, if any
                if (authority == null) {
                    authority = "";
                }
                String root = url.getProtocol() + "://" + authority;
                if (location.startsWith("/") || location.startsWith("\\")) {
                    resolved = root + location;                               // relative to host
                } else {
                    resolved = root + directoryOf(url.getPath()) + location;  // relative to page
                }
            }

            int poundIndex = resolved.indexOf('#');
            if (poundIndex >= 0) {
                resolved = resolved.substring(0, poundIndex); // strip off anchors
            }
            return resolved;
        }

        /**
         * Turns the text typed by the user into a URL string: {@code www.}
         * names get {@code http://}, strings that already start with
         * {@code http://}, {@code https://}, {@code ftp://} or {@code file:}
         * are kept, anything else is taken as a local path and gets
         * {@code file:///} (three slashes are required).
         */
        static String normalizeStartUrl(String text) {
            if (startsWithIgnoreCase(text, "www.")) {
                return "http://" + text;
            }
            if (startsWithIgnoreCase(text, "http://")
                    || startsWithIgnoreCase(text, "https://")
                    || startsWithIgnoreCase(text, "ftp://")
                    || startsWithIgnoreCase(text, "file:")) {
                return text;
            }
            return "file:///" + text;
        }

        /** Returns the path up to and including its last slash or backslash; "/" for an empty path. */
        private static String directoryOf(String path) {
            if (path.length() == 0) {
                return "/";
            }
            if (path.endsWith("/") || path.endsWith("\\")) {
                return path;
            }
            int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
            return path.substring(0, lastSeparator + 1);
        }

        /** Returns the index of the first '=' after the word "href", or -1. */
        private static int equalsAfterHref(String str) {
            int hrefIndex = indexOfIgnoreCase(str, "href");
            if (hrefIndex < 0) {
                return -1;
            }
            return str.indexOf('=', hrefIndex + 1);
        }

        /** Tells whether the text starts with a URL scheme of two or more characters followed by ':'. */
        private static boolean hasScheme(String text) {
            int colon = text.indexOf(':');
            if (colon < 2 || !isAsciiLetter(text.charAt(0))) {
                return false;
            }
            for (int i = 1; i < colon; i++) {
                char c = text.charAt(i);
                boolean schemeChar = isAsciiLetter(c) || (c >= '0' && c <= '9')
                        || c == '+' || c == '-' || c == '.';
                if (!schemeChar) {
                    return false;
                }
            }
            return true;
        }

        private static boolean isAsciiLetter(char c) {
            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        }

        /** Case-insensitive {@code startsWith} that does not depend on the default locale. */
        private static boolean startsWithIgnoreCase(String text, String prefix) {
            return text.regionMatches(true, 0, prefix, 0, prefix.length());
        }

        /** Case-insensitive {@code indexOf} that keeps indices valid for the original string. */
        private static int indexOfIgnoreCase(String text, String needle) {
            int last = text.length() - needle.length();
            for (int i = 0; i <= last; i++) {
                if (text.regionMatches(true, i, needle, 0, needle.length())) {
                    return i;
                }
            }
            return -1;
        }
    }

    // Variables declaration - do not modify//GEN-BEGIN:variables
    private javax.swing.JPanel northPanel;
    private javax.swing.JButton searchButton;
    private javax.swing.JScrollPane centerPanel;
    private javax.swing.JButton browseButton;
    private javax.swing.JTextField keywordField;
    private javax.swing.JLabel errorLabel;
    private javax.swing.JTextField urlField;
    private javax.swing.JLabel keywordLabel;
    private javax.swing.JTextArea resultArea;
    private javax.swing.JLabel urlLabel;
    // End of variables declaration//GEN-END:variables
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -