⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 crawlerweb.java

📁 一个用 Java 编写的小小爬虫,在做实验的时候觉得挺好的,拿来和大家分享一下,看看也没什么损失~
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
						if (file.exists()) {							outputPrintLn("File specified already exists!");							if (LOGGING) {								err_out.write("File specified already exists!");								err_out.newLine();							}						}						else if (!file.canWrite()) {							outputPrintLn("I can't write to the specified file!");							if (LOGGING) {								err_out.write("I can't write to the specified file!");								err_out.newLine();							}						}						err_out.flush();					}				} // if (!binary)				else {					ins = new BufferedInputStream( HTTPconn.getInputStream() );					file_name = getFileName(URLAddress);					URLAddress = remFileName(URLAddress);					if (!makeFilePath(URLAddress, curr_dir).endsWith("/"))						file = new File(makeFilePath(URLAddress, curr_dir) + File.separatorChar + file_name);					else						file = new File(makeFilePath(URLAddress, curr_dir) + file_name);					try {						fileos = new FileOutputStream(file);					}					catch (FileNotFoundException e) {						outputPrintLn(e.toString());						continue;					}					outos = new BufferedOutputStream(fileos);					outputPrintLn("Saving to: " + file.getAbsolutePath());					if (LOGGING) {						err_out.write("Saving to: " + file.getAbsolutePath());						err_out.newLine();						err_out.flush();					}					while ( (r = ins.read()) != -1) {						outos.write(r);					}					outos.flush();					outos.close();					ins.close();				}				err_out.flush();			} // if (urlOK & fileOK)			else {				if (!urlOK) {					outputPrintLn("URL IS NOT VALID: " + URLAddress);					if (LOGGING) {						err_out.write("URL IS NOT VALID: " + URLAddress);						err_out.newLine();					}				}				if (!fileOK) {					outputPrintLn("FILE NOT FOUND: " + URLAddress);					if (LOGGING) {						err_out.write("FILE NOT FOUND: " + URLAddress);						err_out.newLine();					}				}				err_out.flush();			}		}	}	public void setValidChars() {		Character temp_Character = null;		char temp_char = 'a';		boolean ok = false;		VALID_CHARS = new HashSet(74);				while (temp_char <= 'z') {			temp_Character = 
new Character(temp_char);			ok = VALID_CHARS.add(temp_Character);			temp_char++;		}		temp_char = 'A';		while (temp_char <= 'Z') {			temp_Character = new Character(temp_char);			ok = VALID_CHARS.add(temp_Character);			temp_char++;		}		temp_char = '0';		while (temp_char <= '9') {			temp_Character = new Character(temp_char);			ok = VALID_CHARS.add(temp_Character);			temp_char++;		}		temp_Character = new Character(':');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('/');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('@');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('#');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('$');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('%');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('^');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('&');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('-');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('_');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('.');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character(',');		ok = VALID_CHARS.add(temp_Character);		temp_Character = new Character('?');		ok = VALID_CHARS.add(temp_Character);	}	public void setDirTextField(String text) {		dirText.setText(text);	}    public void InitAndStart(Crawlerweb app) throws IOException {		boolean ok;		ThreeStrings threes = null;		Integer temp = null;        FOREIGN_DOMAIN_ALLOWED = foreignCheckBox.isSelected();		LOGGING = logCheckBox.isSelected();        INFINITE = infCheckBox.isSelected();		SILENT = silentCheckBox.isSelected();        STARTING_URL = urlText.getText();        USER_DIR = dirText.getText();		temp = (Integer) depthSpinner.getValue();		DEPTH = temp.intValue();		if (USER_DIR.length() == 0) {			JOptionPane.showMessageDialog(app,										  "You 
must enter absolute path of directory!",										  "Warning!",										  JOptionPane.WARNING_MESSAGE);			return;		}		if (STARTING_URL.length() == 0) {			JOptionPane.showMessageDialog(app,										  "You must enter address of the site you want to retrieve!",										  "Warning!",										  JOptionPane.WARNING_MESSAGE);			return;		}		setValidChars();        DOMAIN = getDomain(STARTING_URL);        if (!StringContains(STARTING_URL, "://"))				STARTING_URL = "http://" + STARTING_URL;		if (IsDirectory(STARTING_URL)) {			if (!STARTING_URL.endsWith("/"))				STARTING_URL += "/";		}		err_output = new FileWriter("output.log");		err_out = new BufferedWriter(err_output);		ROOT_DIR_PATH = mkRootDirName(remFileName(STARTING_URL));		if (!USER_DIR.endsWith(File.separator))			USER_DIR = USER_DIR + File.separator;		ROOT_DIR = new File(USER_DIR);		ok = ROOT_DIR.mkdirs();				if (ROOT_DIR.canWrite()) {			TRIED_FILES = new HashSet(1000);			ok = TRIED_FILES.add(STARTING_URL);			threes = new ThreeStrings(STARTING_URL, ROOT_DIR_PATH, DOMAIN);			queue = new TQueue();			queue.Push(threes);			RWFactory(app);			outputPrintLn("--------------------------------------------------------------------------------");			JOptionPane.showMessageDialog(app,										  "Site retrieval finished!",										  "Information!",										  JOptionPane.INFORMATION_MESSAGE);			err_out.close();			return;		}		else {			JOptionPane.showMessageDialog(app,										  "Can't write to specified directory!",										  "Warning!",										  JOptionPane.WARNING_MESSAGE);			return;		}    }    public Component createComponents(Crawlerweb app) {                		numLabel = new JLabel("Number of links to follow:");        numLabel.setPreferredSize(new Dimension(180, 15));        numLabel.setMinimumSize(new Dimension(170, 15));        numLabel.setHorizontalAlignment(SwingConstants.LEFT);        depthSpinner = new JSpinner(new SpinnerNumberModel(0, 0, 10000, 1));        depthSpinner.setPreferredSize(new Dimension(60, 20));   
     depthSpinner.setMinimumSize(new Dimension(60, 20));		JPanel numPanel = new JPanel();        numPanel.setLayout(new BoxLayout(numPanel, BoxLayout.X_AXIS));        numPanel.add(Box.createHorizontalGlue());        numPanel.add(numLabel, BorderLayout.WEST);        numPanel.add(Box.createRigidArea(new Dimension(5, 0)));        numPanel.add(depthSpinner, BorderLayout.WEST);		numPanel.add(Box.createRigidArea(new Dimension(153, 0)));        numPanel.setBorder(BorderFactory.createEmptyBorder(10,10,10,10));		logCheckBox = new JCheckBox("Log to output.log?", false);		logCheckBox.setPreferredSize(new Dimension(180, 15));        logCheckBox.setMinimumSize(new Dimension(170, 15));		JPanel logPanel = new JPanel();		logPanel.setLayout(new BoxLayout(logPanel, BoxLayout.X_AXIS));		logPanel.add(Box.createHorizontalGlue());		logPanel.add(logCheckBox);		logPanel.add(Box.createRigidArea(new Dimension(215, 0)));		output = new JTextArea();		output.setPreferredSize(new Dimension(500, 150));		output.setMinimumSize(new Dimension(500, 140));		output.setBorder(BorderFactory.createLineBorder(Color.black, 1));		output.setEditable(false);		outputLabel = new JLabel("Output:");		outputLabel.setPreferredSize(new Dimension(70, 15));		outputLabel.setMinimumSize(new Dimension(60, 15));		JPanel temp_outPanel = new JPanel();		temp_outPanel.setLayout(new BoxLayout(temp_outPanel, BoxLayout.X_AXIS));		temp_outPanel.add(outputLabel);		temp_outPanel.add(Box.createRigidArea(new Dimension(435, 0)));		JPanel outputPanel = new JPanel();		outputPanel.setLayout(new BoxLayout(outputPanel, BoxLayout.Y_AXIS));		outputPanel.add(temp_outPanel);		outputPanel.add(Box.createRigidArea(new Dimension(0, 5)));		outputPanel.add(output);		urlLabel = new JLabel("Enter address of site/page you want to retrieve:");        urlLabel.setPreferredSize(new Dimension(200, 15));        urlLabel.setMinimumSize(new Dimension(200, 15));        urlLabel.setHorizontalAlignment(SwingConstants.LEFT);        urlText = new JTextField();    	
urlText.setPreferredSize(new Dimension(200, 20));    	urlText.setMinimumSize(new Dimension(200, 20));    	urlText.setBorder(BorderFactory.createLineBorder(Color.black, 1));		JPanel temp_urlPanel = new JPanel();		temp_urlPanel.setLayout(new BoxLayout(temp_urlPanel, BoxLayout.X_AXIS));		temp_urlPanel.add(urlLabel);		temp_urlPanel.add(Box.createRigidArea(new Dimension(180, 0)));			ImageIcon att_icon = new ImageIcon("images/button.png");        main_button = new JButton(att_icon);        main_button.setHorizontalAlignment(SwingConstants.CENTER);        main_button.setPreferredSize(new Dimension(70, 70));        main_button.setMaximumSize(new Dimension(70, 70));        main_button.setMinimumSize(new Dimension(70, 70));        main_button.addActionListener(new MainListener(app));		main_button.setToolTipText("Start download!");		dirLabel = new JLabel("Directory to store files in:");        dirLabel.setPreferredSize(new Dimension(180, 15));        dirLabel.setMinimumSize(new Dimension(170, 15));        dirLabel.setHorizontalAlignment(SwingConstants.LEFT);		dirText = new JTextField();    	dirText.setPreferredSize(new Dimension(150, 20));    	dirText.setMinimumSize(new Dimension(150, 20));    	dirText.setBorder(BorderFactory.createLineBorder(Color.black, 1));		directory_button = new JButton("Open");		directory_button.setHorizontalAlignment(SwingConstants.CENTER);		directory_button.setPreferredSize(new Dimension(70, 20));        directory_button.setMaximumSize(new Dimension(70, 20));        directory_button.setMinimumSize(new Dimension(70, 20));		directory_button.addActionListener(new DirListener(app));		JPanel temp_dirPanel = new JPanel();		temp_dirPanel.setLayout(new BoxLayout(temp_dirPanel, BoxLayout.X_AXIS));		temp_dirPanel.add(dirLabel);		temp_dirPanel.add(Box.createRigidArea(new Dimension(320, 0)));		JPanel temp_dirPanel2 = new JPanel();		temp_dirPanel2.setLayout(new BoxLayout(temp_dirPanel2, BoxLayout.X_AXIS));		temp_dirPanel2.add(dirText);		
temp_dirPanel2.add(Box.createRigidArea(new Dimension(10, 0)));		temp_dirPanel2.add(directory_button);        JPanel dirPanel = new JPanel();        dirPanel.setLayout(new BoxLayout(dirPanel, BoxLayout.Y_AXIS));        dirPanel.add(Box.createVerticalGlue());        dirPanel.add(temp_dirPanel);        dirPanel.add(Box.createRigidArea(new Dimension(0, 5)));        dirPanel.add(temp_dirPanel2);        dirPanel.add(Box.createRigidArea(new Dimension(0, 15)));        dirPanel.add(temp_urlPanel);        dirPanel.add(Box.createRigidArea(new Dimension(0, 5)));        dirPanel.add(urlText, BorderLayout.WEST);        dirPanel.setBorder(BorderFactory.createEmptyBorder(10,10,10,10));		foreignCheckBox = new JCheckBox("Allow following links to other sites?", false);		JPanel foreignPanel = new JPanel();		foreignPanel.setLayout(new BoxLayout(foreignPanel, BoxLayout.X_AXIS));		foreignPanel.add(Box.createHorizontalGlue());		foreignPanel.add(foreignCheckBox);		foreignPanel.add(Box.createRigidArea(new Dimension(140, 0)));		infCheckBox = new JCheckBox("Retrieve infinite number of links?", false);		JPanel infPanel = new JPanel();		infPanel.setLayout(new BoxLayout(infPanel, BoxLayout.X_AXIS));		infPanel.add(Box.createHorizontalGlue());		infPanel.add(infCheckBox);		infPanel.add(Box.createRigidArea(new Dimension(160, 0)));		silentCheckBox = new JCheckBox("Silent mode on?", true);		JPanel silentPanel = new JPanel();		silentPanel.setLayout(new BoxLayout(silentPanel, BoxLayout.X_AXIS));		silentPanel.add(Box.createHorizontalGlue());		silentPanel.add(silentCheckBox);		silentPanel.add(Box.createRigidArea(new Dimension(269, 0)));        JPanel optionsPanel = new JPanel();        optionsPanel.setLayout(new BoxLayout(optionsPanel, BoxLayout.Y_AXIS));        optionsPanel.add(Box.createHorizontalGlue());        optionsPanel.add(numPanel);        optionsPanel.add(foreignPanel);        optionsPanel.add(infPanel);        optionsPanel.add(silentPanel);		optionsPanel.add(logPanel);        
optionsPanel.setBorder(BorderFactory.createEmptyBorder(10,10,10,10));                JPanel tempPanel = new JPanel();        tempPanel.setLayout(new BoxLayout(tempPanel, BoxLayout.X_AXIS));        tempPanel.add(Box.createHorizontalGlue());        tempPanel.add(main_button);        tempPanel.add(Box.createRigidArea(new Dimension(5, 0)));        tempPanel.add(optionsPanel);        JPanel mainPanel = new JPanel();        mainPanel.setLayout(new BoxLayout(mainPanel, BoxLayout.Y_AXIS));        mainPanel.add(tempPanel, BorderLayout.CENTER);        mainPanel.add(dirPanel, BorderLayout.WEST);		mainPanel.add(Box.createRigidArea(new Dimension(0, 5)));		mainPanel.add(outputPanel);        mainPanel.setBorder(BorderFactory.createEmptyBorder(10,10,10,10));                return mainPanel;    }    public static void main(String[] args) throws Exception {        String first_string = "";        String temp3 = "";        String inputLine = "", temp = "", type = "", temp2 = "";        TwoStrings ts = null;        ThreeStrings threes = null;        boolean ok = false, urlOK = true, fileOK = true;        URL URLconn = null;        Integer temp_depth;                try {            UIManager.setLookAndFeel(                UIManager.getCrossPlatformLookAndFeelClassName());        } catch (Exception e) { }        //Create the top-level container and add contents to it.        JFrame frame = new JFrame(APP_NAME + " " + APP_VERSION);        Crawlerweb app = new Crawlerweb();        Component contents = app.createComponents(app);        frame.getContentPane().add(contents, BorderLayout.CENTER);        //Finish setting up the frame, and show it.        
frame.addWindowListener(new WindowAdapter() {            public void windowClosing(WindowEvent e) {                System.exit(0);            }        });        frame.pack();        frame.setVisible(true);    }}class DirListener implements ActionListener {	Crawlerweb app_ref = null;	JFileChooser fc = null;	DirectoryFilter df = null;	/**		@param app Reference to MainWindow so we can call MainWindow methods.	*/	public DirListener(Crawlerweb app) {		app_ref = app;		fc = new JFileChooser();		df = new DirectoryFilter();		fc.addChoosableFileFilter(df);		fc.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY);	}		/**		Implemented method from ActionListener interface.	*/	public void actionPerformed(ActionEvent e) {		int returnVal = fc.showOpenDialog(app_ref);				if (returnVal == JFileChooser.APPROVE_OPTION) { // Creates new dialog, file chooser			File file = fc.getSelectedFile();			app_ref.setDirTextField(file.getAbsolutePath());		}	}}class MainListener implements ActionListener, Runnable {	Crawlerweb app_ref = null;	public MainListener(Crawlerweb app) {		app_ref = app;	}	private void start_download() {		try {			app_ref.InitAndStart(app_ref);		}		catch (IOException e) {			JOptionPane.showMessageDialog(app_ref,										  e.toString(),										  "Warning!",										  JOptionPane.WARNING_MESSAGE);		}	}	public void run() {		this.start_download();	}	public void actionPerformed(ActionEvent e) {		// Create new thread.		Runnable instance = new MainListener(app_ref);		// Start it.		new Thread(instance).start();	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -