📄 urlcommentextractor.java

📁 用于抽取网页文本评论的源程序
💻 JAVA
字号:
package cn.casia.ailab.ldy.cmt;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;




/**
 * Harvests user comments for camera models in three stages, each reading the
 * files written by the previous one:
 * <ol>
 *   <li>{@link #urlExtract(String, String)} scans saved product pages and writes
 *       one "comment-URL TAB model" line per comment link;</li>
 *   <li>{@link #commentCrawl(String, String)} downloads every comment page of
 *       every listed model into one text file per model;</li>
 *   <li>{@link #commentExtract(String, String)} pulls author, date and comment
 *       text out of the downloaded pages.</li>
 * </ol>
 * All files are read, and all pages are downloaded, using the gb18030 encoding.
 *
 * NOTE(review): Mql_Write2File, Mql_WebDownloader and CrawlComment are declared
 * elsewhere; whether the writer needs an explicit flush/close is not visible
 * from this file, so none is issued here.
 */
public class UrlCommentExtractor {
	/** Character encoding used for every file read and every page download. */
	private final String encoding="gb18030";

	/** Prefix of a model separator line; it is exactly eight '=' characters. */
	private static final String MODEL_MARKER="========";

	/**
	 * Extracts comment-page URLs from a brand's saved product pages.
	 * <p>
	 * A line containing eight '=' starts a new model; the model name is the text
	 * between that prefix and the last run of five '='. Every line containing
	 * "http://comments" contributes one output line "url TAB model".
	 * Lines whose marker or URL cannot be delimited are reported on stderr and
	 * skipped instead of aborting the run.
	 *
	 * @param rFile absolute path of the brand's txt file (named after the brand);
	 *              it holds the page content of all models, separated by
	 *              "========brand+model=====" marker lines
	 * @param wFile absolute path of the txt file that receives the comment URLs,
	 *              one "http://...  \tbrand+model" entry per line
	 * @throws IOException declared for callers; I/O failures raised inside this
	 *                     method are caught and printed to stderr
	 * @throws InterruptedException declared for callers; presumably originates
	 *                     from the project's writer class - confirm
	 */
	public void urlExtract(String rFile,String wFile) throws IOException,	InterruptedException{
		BufferedReader pageContent=null;
		try{
			pageContent=new BufferedReader(new InputStreamReader(
					new FileInputStream(rFile),encoding));
			Mql_Write2File wUrl= new Mql_Write2File(wFile);

			String model="";	// model the following comment links belong to
			for(String line=pageContent.readLine();line!=null;line=pageContent.readLine()){
				int markerAt=line.indexOf(MODEL_MARKER);
				if(markerAt>=0){
					int nameStart=markerAt+MODEL_MARKER.length();
					int nameEnd=line.lastIndexOf("=====");
					if(nameEnd>=nameStart){
						model=line.substring(nameStart,nameEnd);
						System.out.println(model+"url is ok");
					}else{
						// e.g. a bare "========" line: there is no name to cut out
						System.err.println("skipping malformed model marker: "+line);
					}
				}

				// Start at the comment link itself, not at the first "http" on the
				// line, so an earlier unrelated link cannot be picked up instead.
				int urlStart=line.indexOf("http://comments");
				if(urlStart>=0){
					int urlEnd=line.indexOf("\"",urlStart);
					if(urlEnd<0){
						System.err.println("skipping comment link without closing quote: "+line);
						continue;
					}
					wUrl.writeWebDownFile(line.substring(urlStart,urlEnd)+"\t");
					wUrl.writeWebDownFile(model+"\r\n");
				}
			}
		}catch (IOException ex){
			System.err.println(ex);
		}finally{
			// also reached when an unchecked exception escapes the loop
			closeQuietly(pageContent);
		}
	}//urlExtract

	/**
	 * Downloads all comment pages for every model listed in a URL file.
	 * <p>
	 * A sub-folder named after the URL file (without ".txt") is created below
	 * {@code wFolder}; each model gets one file "model.txt" in it ('/' in the
	 * model name is replaced by '_'). The first page is downloaded, the page
	 * count is read back from it, and the remaining pages are appended.
	 * Lines that do not have the expected shape are reported and skipped.
	 *
	 * @param rFile   absolute path of the file holding the comment URLs, one
	 *                "http://...  \tbrand+model" entry per line
	 * @param wFolder path of the folder that receives the downloaded comment pages
	 * @throws IOException declared for callers; I/O failures raised inside this
	 *                     method are caught and printed to stderr
	 * @throws InterruptedException declared for callers; presumably originates
	 *                     from the project's downloader/writer classes - confirm
	 */
	public void commentCrawl(String rFile,String wFolder) throws IOException, InterruptedException{
		BufferedReader brUrl=null;
		try{
			brUrl= new BufferedReader(new InputStreamReader(new FileInputStream(rFile),encoding));

			// create the per-brand output folder
			String fileName=brandName(rFile);
			File fd=new File(wFolder+fileName+"\\");
			if(!fd.exists()){
				fd.mkdirs();
			}
			wFolder=wFolder+fileName+"\\";

			Mql_WebDownloader webdown= new Mql_WebDownloader();
			for(String urlComLine=brUrl.readLine();urlComLine!=null;urlComLine=brUrl.readLine()){
				String[] urlLine=urlComLine.split("\t");
				if(urlLine.length<2){
					System.err.println("skipping line without \"url<TAB>model\": "+urlComLine);
					continue;
				}
				// String.replace returns a new string; the result has to be kept,
				// otherwise a '/' in the model name ends up in the file path.
				String model=urlLine[1].replace("/","_");

				StringBuffer pageUrl=toFirstCommentPageUrl(urlLine[0]);
				if(pageUrl==null){
					System.err.println("skipping url that is not in the expected comment format: "+urlLine[0]);
					continue;
				}

				String target=wFolder+model+".txt";
				String comContent=webdown.webpageDownload(pageUrl.toString(), encoding);
				Mql_Write2File wr2f= new Mql_Write2File(target);
				wr2f.writeWebDownFile("========"+model+"=====\r\n");
				wr2f.writeWebDownFile(comContent+"\r\n");

				// the first page tells how many pages there are in total
				int pageNum=readPageCount(target,model);
				for(int pageI=1;pageI<pageNum;pageI++){
					setPageNumber(pageUrl,pageI);
					comContent=webdown.webpageDownload(pageUrl.toString(), encoding);
					wr2f.writeWebDownFile("========"+"第"+(pageI+1)+"页"+"=====\r\n");
					wr2f.writeWebDownFile(comContent+"\r\n");
				}
			}//for(urlComLine)
			System.out.println("抓取  "+fileName+" 的评论网页 OK");
		}catch(IOException ex){
			System.err.println(ex);
		}finally{
			closeQuietly(brUrl);
		}
	}//commentCrawl

	/**
	 * Extracts author, date and text of every comment from the downloaded pages.
	 * <p>
	 * Every regular file in {@code rFolder} is processed and a file of the same
	 * name is written to {@code wFolder}. Both folder arguments are concatenated
	 * directly with the file name, so they must end with a path separator.
	 *
	 * @param rFolder folder named after the brand, holding one downloaded
	 *                comment-page file per model
	 * @param wFolder folder named after the brand, receiving one extracted
	 *                comment file per model
	 * @throws IOException if a page file cannot be read or a result file cannot
	 *                     be written
	 */
	public void commentExtract(String rFolder, String wFolder) throws IOException{
		// File.list() returns null when the path is not a directory or cannot be read
		String[] fileName=new File(rFolder).list();
		if(fileName==null){
			System.err.println("cannot list folder: "+rFolder);
			return;
		}

		for(int fileI=0;fileI<fileName.length;fileI++){
			if(!new File(rFolder+fileName[fileI]).isFile()){
				continue;	// sub-folders cannot be opened as a page file
			}
			extractCommentsFromFile(rFolder+fileName[fileI],wFolder+fileName[fileI]);
			System.out.println("抽取  "+fileName[fileI]+"  的评论内容is OK");
		}//for(fileI)
	}//commentExtract

	/**
	 * Runs the extraction stage for every brand folder below the page root.
	 *
	 * @param args optional: args[0] = root folder of the downloaded comment pages,
	 *             args[1] = root folder for the extracted comments; both must end
	 *             with a path separator. The built-in paths are used when omitted.
	 * @throws IOException if reading a page file or writing a result file fails
	 */
	public static void main(String[] args) throws IOException{
		String pagesRoot= args.length>0 ? args[0] : "D:\\863项目\\crawl~~\\评论网页5\\";
		String commentsRoot= args.length>1 ? args[1] : "D:\\863项目\\crawl~~\\评论内容6\\";

		UrlCommentExtractor uCE= new UrlCommentExtractor();
		String[] commentDir=CrawlComment.dirReader(pagesRoot);
		if(commentDir==null){
			System.err.println("no brand folders found below: "+pagesRoot);
			return;
		}
		for(int commentI=0;commentI<commentDir.length;commentI++){
			File subfolder= new File(commentsRoot+commentDir[commentI]+"\\");
			if(!subfolder.exists()){
				subfolder.mkdirs();
			}
			uCE.commentExtract(pagesRoot+commentDir[commentI]+"\\", commentsRoot+commentDir[commentI]+"\\");
		}//for(commentI)
	}

	/** Returns the file name of {@code rFile} without directories and without a trailing ".txt". */
	private static String brandName(String rFile){
		int nameStart=Math.max(rFile.lastIndexOf('\\'),rFile.lastIndexOf('/'))+1;
		String name=rFile.substring(nameStart);
		return name.endsWith(".txt") ? name.substring(0,name.length()-4) : name;
	}

	/**
	 * Rewrites a comment URL into the URL of its first full comment page:
	 * "index" becomes "more", the "quote=0&amp;" argument is dropped and
	 * "&amp;page=0" is inserted in front of "&amp;title".
	 * <p>
	 * All positions are measured on the untouched URL, exactly as before. When
	 * "index" precedes "quote=0&amp;", the one-character shift caused by the
	 * shorter "more" makes the second replace remove exactly "quote=0&amp;";
	 * 1 + 8 = 9 characters are then gone in front of "&amp;title", hence the -9.
	 * This assumes the order index ... quote=0&amp; ... &amp;title - TODO confirm
	 * against real URLs.
	 *
	 * @return the rewritten URL, or null when a required marker is missing
	 */
	private static StringBuffer toFirstCommentPageUrl(String url){
		int indexAt=url.indexOf("index");
		int quoteAt=url.indexOf("quote");
		int quoteArgAt=url.indexOf("quote=0&");
		int titleAt=url.indexOf("&title");
		if(indexAt<0||quoteAt<1||quoteArgAt<0||titleAt<9){
			return null;
		}
		StringBuffer rewritten=new StringBuffer(url);
		rewritten.replace(indexAt,indexAt+5,"more");
		rewritten.replace(quoteAt-1,quoteArgAt+7,"");
		if(titleAt-9>rewritten.length()){
			return null;
		}
		rewritten.insert(titleAt-9,"&page=0");
		return rewritten;
	}

	/**
	 * Replaces the whole number following the first "page=" with {@code page}.
	 * Replacing the complete digit run (not a single character) keeps the URL
	 * correct once page numbers have more than one digit.
	 */
	private static void setPageNumber(StringBuffer url,int page){
		int start=url.indexOf("page=")+5;
		int end=start;
		while(end<url.length()&&Character.isDigit(url.charAt(end))){
			end++;
		}
		url.replace(start,end,String.valueOf(page));
	}

	/**
	 * Reads the total page count from a freshly written first comment page.
	 * The line after the first "turn-page" line is stripped of spaces and split
	 * at the full-width semicolon; the third field without its first and last
	 * character is the page count.
	 *
	 * @param pageFile file the first comment page was written to
	 * @param model    model name, used in messages only
	 * @return the page count, or 0 when it cannot be determined
	 * @throws IOException if the file cannot be read
	 */
	private int readPageCount(String pageFile,String model) throws IOException{
		BufferedReader brMore=null;
		try{
			brMore= new BufferedReader(new InputStreamReader(new FileInputStream(pageFile),encoding));
			String line=brMore.readLine();
			while(line!=null&&!line.contains("turn-page")){
				line=brMore.readLine();
			}
			if(line==null){
				System.out.println(model+"查找turn-page无结果");
				return 0;
			}
			line=brMore.readLine();
			if(line==null){
				System.err.println(model+": page ends right after turn-page, page count unknown");
				return 0;
			}
			String[] lineInf=line.replace(" ", "").split("；");
			if(lineInf.length<3||lineInf[2].length()<2){
				System.err.println(model+": unexpected paging line: "+line);
				return 0;
			}
			try{
				return Integer.parseInt(lineInf[2].substring(1, lineInf[2].length()-1));
			}catch(NumberFormatException ex){
				System.err.println(model+": page count is not a number: "+lineInf[2]);
				return 0;
			}
		}finally{
			closeQuietly(brMore);
		}
	}

	/**
	 * Extracts all comments of one downloaded page file into one result file.
	 * A "comment-bar-a" line is followed by the author line(s) and the date
	 * line; a "comment-bar-b" line starts the comment text, which runs up to the
	 * next "&lt;/div&gt;". The result ends with the number of comments found.
	 * A page that ends in the middle of a comment terminates the scan early.
	 */
	private void extractCommentsFromFile(String source,String target) throws IOException{
		BufferedReader brComment=null;
		try{
			brComment=new BufferedReader(new InputStreamReader(new FileInputStream(source),encoding));
			Mql_Write2File wr2file=new Mql_Write2File(target);
			wr2file.writeWebDownFile("========\r\n");

			int commentI=0;	// number of comments, written at the end of the file
			for(String line=brComment.readLine();line!=null;line=brComment.readLine()){
				if(line.contains("comment-bar-a")){	// author and date
					commentI++;
					line=brComment.readLine();
					if(line==null){
						break;
					}
					line=line.replace(" ","");	// drop the spaces of the markup
					String author;
					if((line.lastIndexOf(">")+1)==line.length()){
						// line ends with a tag: registered user, name is on the next line
						System.out.println("ok");
						line=brComment.readLine();
						if(line==null){
							break;
						}
						author=innerText(line);
					}else{
						author=line.substring(line.lastIndexOf(">")+1);
					}
					wr2file.writeWebDownFile("("+commentI+")  "+author+"\t");
					line=brComment.readLine();
					if(line==null){
						wr2file.writeWebDownFile("\r\n");	// terminate the partial entry
						break;
					}
					wr2file.writeWebDownFile(innerText(line)+"\r\n");	// date
				}//if
				if(line.contains("comment-bar-b")){	// comment text
					String body;
					if(line.contains("</div>")){
						int from=line.indexOf(">")+1;
						int to=line.indexOf("</div>");
						body= from<=to ? line.substring(from,to) : "";
					}else{
						StringBuffer text=new StringBuffer(line.substring(line.indexOf(">")+1));
						line=brComment.readLine();
						while(line!=null&&!line.contains("</div>")){
							text.append(line);
							line=brComment.readLine();
						}
						if(line!=null){
							text.append(line.substring(0,line.indexOf("<")));
						}
						body=text.toString();
					}
					wr2file.writeWebDownFile(body+"\r\n");
					wr2file.writeWebDownFile("***************************************************\r\n");
					if(line==null){
						break;	// page ended inside the comment text
					}
				}//if
			}//for(line)
			wr2file.writeWebDownFile("========共"+commentI+"条评论\r\n");
		}finally{
			closeQuietly(brComment);
		}
	}

	/** Returns the text between the first '&gt;' and the last '&lt;' of {@code line}, or "" if there is none. */
	private static String innerText(String line){
		int from=line.indexOf(">")+1;
		int to=line.lastIndexOf("<");
		return from<=to ? line.substring(from,to) : "";
	}

	/** Closes {@code reader} if it was opened; a failure while closing a reader is ignored. */
	private static void closeQuietly(BufferedReader reader){
		if(reader!=null){
			try{
				reader.close();
			}catch(IOException ignored){
				// nothing useful can be done when closing a read-only stream fails
			}
		}
	}

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -