📄 crawlcomment.java

📁 用于抽取网页文本评论的源程序
💻 JAVA
字号:
package cn.casia.ailab.ldy.cmt;


import java.io.*;

/**
 * Downloads listing pages and product pages and extracts product links from
 * the saved HTML, using plain text files as input and output.
 *
 * <p>All files are read as {@code gb18030} and the same charset name is passed
 * to every page download. Downloading and file writing are delegated to
 * {@code Mql_WebDownloader} and {@code Mql_Write2File}, which are declared
 * elsewhere in the project.
 *
 * @author Ellen
 */
public class CrawlComment {
	/** Charset name used to read every input file and passed to every download. */
	private final String encoding = "gb18030";

	/** Separator line written between downloaded pages inside one output file. */
	private static final String PAGE_SEPARATOR = "===========================================\r\n";

	/** Line fragment after which {@link #extract(String)} expects the page-count line. */
	private static final String PAGE_CHANGER_MARKER = "<span class=\"page-changer\">";

	/** Line fragment identifying a product entry in a saved listing page. */
	private static final String PRODUCT_LINK_MARKER = "<h3 class=\"p-name\"><a href=\"/digital_dc_";

	/**
	 * Lists the entries of a folder.
	 *
	 * @param aFolder path of the folder to list
	 * @return the entry names (names only, not paths), or {@code null} when
	 *         {@code aFolder} is not a directory or cannot be listed
	 */
	public static String[] dirReader(String aFolder) {
		return new File(aFolder).list();
	}

	/**
	 * Creates a folder named {@code newFolderName} next to {@code aFolder}.
	 *
	 * <p>If {@code aFolder} contains the text {@code "txt"} it is treated as a file
	 * path and the new folder is placed in the file's folder. Otherwise
	 * {@code aFolder} is treated as a folder path ending in a backslash and the new
	 * folder is placed in its parent. Only the backslash is recognised as a
	 * separator.
	 *
	 * @param aFolder reference path; a folder path must end with a backslash
	 * @param newFolderName name of the folder to create
	 * @return the path of the new folder followed by a backslash
	 */
	public static String dirCreate(String aFolder, String newFolderName) {
		int lastSep = aFolder.lastIndexOf("\\");
		int parentEnd;
		if (aFolder.contains("txt")) {
			// File path: keep everything up to and including the last separator.
			parentEnd = lastSep + 1;
		} else {
			// Folder path with trailing separator: cut back to the separator before it.
			parentEnd = aFolder.lastIndexOf("\\", lastSep - 1) + 1;
		}
		File fd = new File(aFolder.substring(0, parentEnd) + newFolderName);
		// mkdirs() also creates missing parents; a failure is reported instead of ignored.
		if (!fd.exists() && !fd.mkdirs()) {
			System.err.println("Could not create directory: " + fd.getPath());
		}
		return fd.getPath() + "\\";
	}

	/**
	 * Downloads every listing page named in {@code rFn} together with its
	 * follow-up pages.
	 *
	 * <p>Each line of {@code rFn} must be {@code url<TAB>name}. The page at
	 * {@code url} is written to {@code wFn + name + ".txt"}, preceded by a
	 * {@code <url>} header and a separator line. The page count is then read back
	 * from that file with {@link #extract(String)}, and pages 2..count are fetched
	 * from {@code <url stem>___hits__<page>.html} and appended, each followed by a
	 * separator line. Lines without a tab are reported on stderr and skipped.
	 *
	 * <p>An {@link IOException} raised while processing is printed to stderr and
	 * ends the run; it is not rethrown.
	 *
	 * @param rFn path of the input list file
	 * @param wFn output path prefix; the original documentation describes it as the
	 *            folder holding one file per brand home page
	 * @throws IOException declared for callers; not thrown by the code visible here
	 * @throws InterruptedException presumably raised by the downloader; its
	 *             declaration is not visible in this file
	 */
	public void homePageDownload(String rFn, String wFn) throws IOException, InterruptedException {
		BufferedReader homePageRead = null;
		try {
			homePageRead = new BufferedReader(new InputStreamReader(
					new FileInputStream(rFn), encoding));
			Mql_WebDownloader wd = new Mql_WebDownloader();
			String homePageURL;
			while ((homePageURL = homePageRead.readLine()) != null) {
				String[] s = homePageURL.split("\t");
				if (s.length < 2) {
					// Blank or malformed line: there is no name to build the output file from.
					System.err.println("Skipping malformed line (expected url<TAB>name): " + homePageURL);
					continue;
				}
				String url = s[0];
				String name = s[1];

				String homePageContent = wd.webpageDownload(url, encoding);
				String hpDownWriteFile = wFn + name + ".txt";
				Mql_Write2File wf = new Mql_Write2File(hpDownWriteFile);
				wf.writeWebDownFile("<url>" + url + "</url>\r\n");
				wf.writeWebDownFile(PAGE_SEPARATOR);
				wf.writeWebDownFile(homePageContent);

				// NOTE(review): this reads the file just written; it relies on
				// Mql_Write2File having flushed the content by now - confirm.
				int pageNum = extract(hpDownWriteFile);
				int htmlPos = url.indexOf(".html");
				if (pageNum > 1 && htmlPos < 0) {
					System.err.println("Cannot derive follow-up page URLs (no \".html\"): " + url);
					pageNum = 1;
				}
				for (int pageI = 2; pageI <= pageNum; pageI++) {
					String newURL = url.substring(0, htmlPos) + "___hits__" + pageI + ".html";
					homePageContent = wd.webpageDownload(newURL, encoding);
					System.out.println(name);
					wf.writeWebDownFile(homePageContent);
					wf.writeWebDownFile(PAGE_SEPARATOR);
				}
				System.out.println(name + "is OK");
			}
		} catch (IOException ex) {
			System.err.println(ex);
		} finally {
			closeQuietly(homePageRead);
		}
	}

	/**
	 * Reads the page count from a saved listing page.
	 *
	 * <p>The file is scanned for the first line containing
	 * {@code <span class="page-changer">}. On the line after it, the single
	 * character located three positions before the first {@code '<'} is parsed as
	 * the page count.
	 *
	 * <p>NOTE(review): only one character is read, so a page count above 9 cannot
	 * be represented - confirm against the actual page markup.
	 *
	 * @param r path of the saved listing page
	 * @return the page count, or 0 when the marker or the following line is
	 *         missing, the expected character is absent or not a digit, or the file
	 *         cannot be read
	 * @throws IOException declared for callers; read errors are printed to stderr
	 * @throws InterruptedException declared for callers; not thrown by this method
	 */
	public int extract(String r) throws IOException, InterruptedException {
		int pageNum = 0;
		BufferedReader homePageRead = null;
		try {
			homePageRead = new BufferedReader(
					new InputStreamReader(new FileInputStream(r), encoding));

			boolean markerFound = false;
			String readLine;
			while ((readLine = homePageRead.readLine()) != null) {
				if (readLine.contains(PAGE_CHANGER_MARKER)) {
					markerFound = true;
					break;
				}
			}
			if (!markerFound) {
				return pageNum;
			}

			readLine = homePageRead.readLine();
			if (readLine == null) {
				return pageNum;
			}
			int tagStart = readLine.indexOf("<");
			if (tagStart < 3) {
				// No '<', or too close to the line start for the digit to be there.
				return pageNum;
			}
			try {
				pageNum = Integer.parseInt(readLine.substring(tagStart - 3, tagStart - 2));
			} catch (NumberFormatException ex) {
				System.err.println("Page count not found in " + r + ": " + ex);
			}
		} catch (IOException ex) {
			System.err.println(ex);
		} finally {
			closeQuietly(homePageRead);
		}
		return pageNum;
	}

	/**
	 * Runs {@link #zhextract(String, String)} for every entry of {@code pathRead},
	 * writing the result to an equally named file under {@code pathWrite}.
	 *
	 * @param pathRead folder containing the saved listing pages
	 * @param pathWrite folder receiving the extracted link files
	 * @throws IOException if {@code pathRead} cannot be listed
	 * @throws InterruptedException declared for callers; see {@link #zhextract(String, String)}
	 */
	public void homePageExtract(String pathRead, String pathWrite) throws IOException, InterruptedException {
		String[] fileList = new File(pathRead).list();
		if (fileList == null) {
			throw new IOException("Not a readable directory: " + pathRead);
		}
		for (String fileName : fileList) {
			zhextract(pathRead + "/" + fileName, pathWrite + "/" + fileName);
			System.out.println("抽取产品URL "+fileName+" is ok");
		}
	}

	/**
	 * Extracts product links from one saved listing page.
	 *
	 * <p>For each line containing the product entry marker, the text between the
	 * first {@code '/'} and the first {@code ".html"} is turned into an absolute
	 * {@code http://product.pchome.net} URL, and the text between
	 * {@code "external">} and the next {@code </a>} is taken as the model name.
	 * One {@code url<TAB>name} line is written per product. Marker lines missing
	 * any of these pieces are skipped.
	 *
	 * <p>If the model name contains {@code '?'} or {@code '*'} (the original
	 * comment says these stand for characters that could not be decoded), the
	 * unreadable part is replaced with the output file's base name.
	 *
	 * <p>An {@link IOException} raised while reading is printed to stderr, not
	 * rethrown.
	 *
	 * @param r path of the saved listing page
	 * @param w path of the output file
	 * @throws IOException declared for callers; not thrown by the code visible here
	 * @throws InterruptedException declared for callers; not thrown by the code
	 *             visible here
	 */
	public void zhextract(String r, String w) throws IOException, InterruptedException {
		Mql_Write2File w2f = new Mql_Write2File(w);
		BufferedReader homePageRead = null;
		try {
			homePageRead = new BufferedReader(
					new InputStreamReader(new FileInputStream(r), encoding));

			String readLine;
			while ((readLine = homePageRead.readLine()) != null) {
				if (!readLine.contains(PRODUCT_LINK_MARKER)) {
					continue;
				}
				// The marker itself contains '/', so pathStart is never negative here.
				int pathStart = readLine.indexOf("/");
				int pathEnd = readLine.indexOf(".html");
				if (pathEnd < pathStart) {
					continue;
				}
				String s1 = "http://product.pchome.net" + readLine.substring(pathStart, pathEnd) + ".html";

				String[] temp = readLine.split("\"external\">");
				if (temp.length <= 1) {
					continue;
				}
				int anchorEnd = temp[1].indexOf("</a>");
				if (anchorEnd < 0) {
					continue;
				}
				String s2 = temp[1].substring(0, anchorEnd);
				if (s2.contains("?")) {
					// Keep only the text after the last '?', prefixed with the file's base name.
					s2 = baseNameOf(w) + s2.substring(s2.lastIndexOf("?") + 1);
				}
				if (s2.contains("*")) {
					s2 = baseNameOf(w) + s2.replace("*", "");
				}
				w2f.writeWebDownFile(s1 + "\t" + s2 + "\r\n");
			}
		} catch (IOException ex) {
			System.err.println(ex);
		} finally {
			closeQuietly(homePageRead);
		}
	}

	/**
	 * Downloads the page of every product listed in {@code rFile} into
	 * {@code wFile}.
	 *
	 * <p>Each line of {@code rFile} is {@code url<TAB>model}; lines without a tab
	 * are ignored. For each product a header line
	 * {@code ========<model>=====} is written, with {@code '/'} in the model
	 * replaced by {@code '_'}, followed by the page content.
	 *
	 * @param rFile path of the link file; per the original documentation it is
	 *            named after the brand
	 * @param wFile path of the output file; per the original documentation it is
	 *            named after the brand
	 * @throws IOException if the link file cannot be opened or read
	 * @throws InterruptedException presumably raised by the downloader; its
	 *             declaration is not visible in this file
	 */
	public void productPageDown(String rFile, String wFile) throws IOException, InterruptedException {
		BufferedReader rFileRead = new BufferedReader(new InputStreamReader(
				new FileInputStream(rFile), encoding));
		try {
			Mql_Write2File wr2f = new Mql_Write2File(wFile);
			Mql_WebDownloader webDown = new Mql_WebDownloader();
			String rFileLine;
			while ((rFileLine = rFileRead.readLine()) != null) {
				// urlLine[0] is the product page URL, urlLine[1] the model name.
				String[] urlLine = rFileLine.split("\t");
				if (urlLine.length <= 1) {
					continue;
				}
				String pageContent = webDown.webpageDownload(urlLine[0], encoding);
				String model = urlLine[1].replace("/", "_");
				System.out.println(model);
				wr2f.writeWebDownFile("========" + model + "=====\r\n");
				wr2f.writeWebDownFile(pageContent + "\r\n");
			}
		} finally {
			// Close on every exit path without masking an exception already in flight.
			closeQuietly(rFileRead);
		}
	}

	/**
	 * Entry point; currently performs no work.
	 *
	 * @param args ignored
	 * @throws IOException never thrown by the current body
	 * @throws InterruptedException never thrown by the current body
	 */
	public static void main(String[] args) throws IOException, InterruptedException {
	}

	/**
	 * Returns the file name of {@code path} cut at the first {@code ".txt"} and at
	 * the first {@code '('}. Both {@code '/'} and backslash count as separators.
	 */
	private static String baseNameOf(String path) {
		int lastSep = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
		String name = path.substring(lastSep + 1);
		int ext = name.indexOf(".txt");
		if (ext >= 0) {
			name = name.substring(0, ext);
		}
		int paren = name.indexOf("(");
		if (paren >= 0) {
			name = name.substring(0, paren);
		}
		return name;
	}

	/** Closes {@code resource} if non-null, printing any close failure to stderr. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ex) {
			System.err.println(ex);
		}
	}

}
💿 文件大小 14 K
👤 上传用户 junjie_x
📂 所属分类 Java编程
🏷️ 相关标签

#页 #源程序
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -