📄 main.java

📁 用java下载资源的代码,代码为初级,涉及URL,REGEX,UTIL
💻 JAVA
字号:


import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small crawler: walks the numbered listing pages of a site, collects the
 * relative PDF links found in them and then downloads every PDF to disk.
 *
 * <p>State is kept in static fields, so the class is not thread-safe.
 */
public class Main {

	/** Listing-page URL prefix; the page number is appended to it. */
	private static final String LIST_PAGE_URL = "http://www.cvh.org.cn/difangzhi/qinling/list.asp?pgno=";

	/** Base URL that the relative PDF paths from the listing pages are appended to. */
	private static final String PDF_BASE_URL = "http://www.cvh.org.cn/difangzhi/qinling/";

	/** Path prefix that downloaded files are saved under. */
	private static final String SAVE_DIR = "C:\\";

	/** Total number of listing pages; pages are requested as 1..PAGE_COUNT inclusive. */
	private static final int PAGE_COUNT = 153;

	/**
	 * Matches one relative PDF link such as {@code page/1(5)/003.pdf}.
	 * The quantifier is reluctant and excludes quotes, angle brackets and line
	 * breaks so that several links on the same line are reported separately
	 * instead of being merged into one match.
	 */
	private static final Pattern PDF_LINK = Pattern.compile(
			"page/[^\"'<>\\r\\n]+?\\.pdf", Pattern.CASE_INSENSITIVE);

	/** Source of fallback file names. */
	private static final Random RANDOM = new Random();

	/** Every PDF link extracted so far, in discovery order. */
	private static final List<String> pdfURLs = new ArrayList<String>();

	/**
	 * Program entry point: runs the crawl and prints the number of links found.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Main m = new Main();
		m.run();
		System.out.println("总的文件个数:" + pdfURLs.size());
	}

	/**
	 * Collects the PDF links from every listing page, then downloads each PDF.
	 * Pauses between requests; if the thread is interrupted during a pause the
	 * run stops early with the interrupt flag restored.
	 */
	public void run() {
		if (!pause(2000)) {
			return;
		}
		System.out.println("开始了");

		// Inclusive upper bound: PAGE_COUNT is the number of pages, so the last page must be visited too.
		for (int page = 1; page <= PAGE_COUNT; page++) {
			String targetUrl = LIST_PAGE_URL + page;
			System.out.println("正在提取目标网址...\n  " + targetUrl);
			this.getPDFURL(this.getContentByURL(targetUrl));
			System.out.println(targetUrl + "\n===提取完毕===");
			if (!pause(1000)) {
				return;
			}
		}

		System.out.println("\n\n\n\n===网址全部提取完毕===\n\n\n\n");
		System.out.println("===正在下载并且保持文件===\n\n\n\n");
		for (String s : pdfURLs) {
			this.saveFile(PDF_BASE_URL + s, SAVE_DIR + this.getFileName(s));
			if (!pause(3000)) {
				return;
			}
		}
	}

	/**
	 * Sleeps for the given time.
	 *
	 * @param millis time to sleep in milliseconds
	 * @return {@code false} if the sleep was interrupted (the interrupt flag is restored)
	 */
	private static boolean pause(long millis) {
		try {
			Thread.sleep(millis);
			return true;
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			return false;
		}
	}

	/**
	 * Extracts the file name, i.e. the text after the last {@code '/'}.
	 *
	 * @param s relative link to take the name from
	 * @return the file name; if none can be extracted (null input, no slash, or
	 *         nothing after the slash) a 5-character fallback made of {@code R}
	 *         followed by four random digits
	 */
	private String getFileName(String s) {
		if (s != null) {
			int slash = s.lastIndexOf('/');
			if (slash >= 0 && slash < s.length() - 1) {
				String name = s.substring(slash + 1);
				System.out.println("名称是:" + slash + "\n====" + name);
				return name;
			}
		}
		// Zero-padded so the result is always exactly 5 characters and never contains a sign.
		return String.format("R%04d", RANDOM.nextInt(10000));
	}

	/**
	 * Finds every relative PDF link (e.g. {@code <a href="page/1(5)/003.pdf"})
	 * in the given HTML and appends it to the shared link list.
	 *
	 * @param content HTML text to scan; {@code null} is treated as empty
	 * @return the shared list holding all links collected so far (not a copy)
	 */
	public List<String> getPDFURL(String content) {
		System.out.println("match");
		if (content == null) {
			return pdfURLs;
		}
		Matcher matcher = PDF_LINK.matcher(content);
		while (matcher.find()) {
			String match = matcher.group();
			pdfURLs.add(match);
			System.out.println(match);
		}
		return pdfURLs;
	}

	/**
	 * Downloads the resource at the given URL and returns it as text.
	 * The bytes are decoded with the platform default charset.
	 *
	 * @param fileWithURL URL of the page to fetch
	 * @return the page content, or an empty string if the download failed
	 */
	public String getContentByURL(String fileWithURL) {
		try {
			URLConnection conn = new URL(fileWithURL).openConnection();
			byte[] body;
			try (InputStream is = conn.getInputStream()) {
				body = readFully(is);
			}
			// getContentLength() is -1 when the server does not announce a length.
			int expected = conn.getContentLength();
			if (expected >= 0 && body.length < expected) {
				System.out.println("download   error");
			} else {
				System.out.println("Get HTML SRC OK:   ");
			}
			return new String(body);
		} catch (IOException | RuntimeException e) {
			// Never propagate: one bad page must not abort the crawl, and must not
			// hand back the content of a previously fetched page either.
			System.err.println("Get HTML SRC FAILED:   " + fileWithURL + " (" + e + ")");
			return "";
		}
	}

	/**
	 * Downloads a file and writes it to disk. The file is only written once the
	 * whole download is in memory, so a failed download leaves no partial file.
	 *
	 * @param fileurl URL of the file to download
	 * @param savepath path to save the file to
	 * @return {@code true} if the file was downloaded and saved, {@code false} otherwise
	 */
	public boolean saveFile(String fileurl, String savepath) {
		if (fileurl == null || savepath == null) {
			return false;
		}
		File savefile = new File(savepath);
		try {
			URLConnection conn = new URL(fileurl).openConnection();
			byte[] data;
			try (InputStream is = conn.getInputStream()) {
				data = readFully(is);
			}
			// Only a known length (>= 0) can be used to detect a truncated download.
			int expected = conn.getContentLength();
			if (expected >= 0 && data.length < expected) {
				System.out.println("download   error");
				return false;
			}
			try (OutputStream fos = new FileOutputStream(savefile)) {
				fos.write(data);
			}
			System.out.println("File Save OK:   " + savefile);
			return true;
		} catch (IOException | RuntimeException e) {
			System.err.println("File Save FAILED:   " + savefile + " (" + e + ")");
			return false;
		}
	}

	/**
	 * Reads the stream until end-of-stream.
	 *
	 * @param in stream to drain; not closed by this method
	 * @return all bytes read
	 * @throws IOException if reading fails
	 */
	private static byte[] readFully(InputStream in) throws IOException {
		ByteArrayOutputStream out = new ByteArrayOutputStream();
		byte[] chunk = new byte[8192];
		int n;
		while ((n = in.read(chunk)) != -1) {
			out.write(chunk, 0, n);
		}
		return out.toByteArray();
	}
}
💿 文件大小 10 K
👤 上传用户 abc171abc171
📂 所属分类 Java编程
🏷️ 相关标签

#java #REGX #UTIL #URL
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -