⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 update.java

📁 用于抽取网页文本评论的源程序
💻 JAVA
字号:
package cn.casia.ailab.ldy.cmt;
import java.io.*;
/**
 * Updates stored comment data by comparing the comment count currently shown
 * on a comment page with the count recorded by an earlier extraction.
 * <p>
 * All paths are built by plain string concatenation with {@code "\\"}
 * separators, so folder arguments must already end with a separator.
 * @author Ellen
 *
 */
public class Update {
	/** Root working directory; sub-folder names are appended to it directly. */
	private final String crawlDir;
	/** Charset used for every file read and passed to the page downloader. */
	private final String encoding="gb18030";

	/**
	 * @param aCrawlDir root working directory, ending with a path separator
	 */
	public Update(String aCrawlDir){
		crawlDir=aCrawlDir;
	}

	/**
	 * Reads the previously extracted comment file of one product and returns
	 * the comment count recorded in it.
	 * <p>
	 * Every line containing {@code "===="} is parsed by taking the characters
	 * from index 9 up to three characters before the end of the line; the
	 * value of the last such line wins. This matches a trailer of the shape
	 * {@code "========共N条评论"}.
	 * NOTE(review): this reads from the folder "评论内容6" while updateTest()
	 * writes to "评论内容" -- confirm that the difference is intended.
	 *
	 * @param pc camera brand
	 * @param pcStyle camera model
	 * @return the old comment count, or 0 when no line contains "===="
	 * @throws IOException if the file cannot be opened or read
	 * @throws NumberFormatException if a matching line does not hold a number
	 *         at the expected position
	 */
	public int getOldNum(String pc,String pcStyle) throws IOException{
		String path=crawlDir+"评论内容6\\"+pc+"\\"+"comment_"+pcStyle+".txt";
		BufferedReader reader=openReader(path);
		try{
			int oldNum=0;
			for(String line=reader.readLine();line!=null;line=reader.readLine()){
				if(line.contains("====")){
					oldNum=Integer.parseInt(line.substring(9, line.length()-3));
				}
			}
			return oldNum;
		}finally{
			// The original never closed this reader.
			closeQuietly(reader);
		}
	}

	/**
	 * Downloads the comment pages for every entry of a URL list file and
	 * stores them under {@code wFolder + <list file name without ".txt"> + "\\"}.
	 * <p>
	 * Each line of {@code rFile} is {@code url<TAB>name}. For each entry the
	 * first comment page is downloaded and saved, the line following the one
	 * that contains "turn-page" is parsed for the current comment count, and
	 * that count is compared with {@link #getOldNum(String, String)}. When the
	 * count has grown, a marker line
	 * {@code "==========<new>=====*<total>"} is appended and the further pages
	 * are downloaded and appended to the same file.
	 * <p>
	 * An {@link IOException} raised while processing is printed to
	 * {@code System.err} and ends the crawl of this list file; it is not
	 * rethrown.
	 *
	 * @param rFile path of the URL list file (must contain ".txt")
	 * @param wFolder output root folder, ending with a path separator
	 * @throws IOException declared for compatibility; I/O errors are reported
	 *         on {@code System.err} instead
	 * @throws InterruptedException declared for compatibility with existing callers
	 */
	public void updateCrawl(String rFile,String wFolder) throws IOException, InterruptedException{
		BufferedReader brUrl=null;
		try{
			brUrl=openReader(rFile);
			// Create the per-list output folder (and any missing parents).
			String fileName=rFile.substring(rFile.lastIndexOf("\\")+1,rFile.indexOf(".txt"));
			wFolder=wFolder+fileName+"\\";
			File outDir=new File(wFolder);
			if(!outDir.exists()){
				outDir.mkdirs();
			}

			Mql_WebDownloader webdown= new Mql_WebDownloader();
			for(String urlComLine=brUrl.readLine();urlComLine!=null;urlComLine=brUrl.readLine()){
				String[] urlLine=urlComLine.split("\t");
				if(urlLine.length<2){
					// A blank or tab-less line would otherwise abort the whole crawl.
					System.err.println("Skipping malformed URL line (expected url<TAB>name): "+urlComLine);
					continue;
				}
				String name=urlLine[1].replace("/","_");
				StringBuilder pageUrl=toCommentPageUrl(urlLine[0]);
				String pagePath=wFolder+name+".txt";

				// Fetch and store the first comment page.
				String comContent=webdown.webpageDownload(pageUrl.toString(), encoding);
				Mql_Write2File wr2f= new Mql_Write2File(pagePath);
				wr2f.writeNewFile("========"+name+"=====\r\n");
				wr2f.writeWebDownFile(comContent+"\r\n");

				// Re-read the stored page to find the pager information.
				String pagerLine=readLineAfterTurnPage(pagePath);
				if(pagerLine==null){
					System.out.println(name+"查找turn-page无结果");
					continue;
				}
				String[] lineInf=pagerLine.replace(" ", "").split(";");
				// Comment count: first ';'-separated field minus 2 leading and 5 trailing characters.
				int commentNum=Integer.parseInt(lineInf[0].substring(2, lineInf[0].length()-5));

				int oldCommentNum=getOldNum(fileName,name);
				if(commentNum>oldCommentNum){
					int newcommNum=commentNum-oldCommentNum;
					int pageDownNum=newcommNum/10+1;
					wr2f.writeWebDownFile("=========="+newcommNum+"=====*"+commentNum+"\r\n");
					// Width of the page value currently in the URL; starts as the "0"
					// of "&page=0". Tracking it keeps multi-digit page numbers intact
					// (replacing a single character turned "10" into "110" for page 11).
					int pageValueLen=1;
					for(int pageI=1;pageI<pageDownNum;pageI++){
						String pageValue=String.valueOf(pageI);
						int pageValueStart=pageUrl.indexOf("page=")+5;
						pageUrl.replace(pageValueStart, pageValueStart+pageValueLen, pageValue);
						pageValueLen=pageValue.length();
						comContent=webdown.webpageDownload(pageUrl.toString(), encoding);
						wr2f.writeWebDownFile("========"+"第"+(pageI+1)+"页"+"=====\r\n");
						wr2f.writeWebDownFile(comContent+"\r\n");
					}
				}
			}
			System.out.println("抓取  "+fileName+" 的评论网页 OK");
			System.out.println(fileName+"is OK");
		}catch(IOException ex){
			System.err.println(ex);
		}finally{
			closeQuietly(brUrl);
		}
	}

	/**
	 * Extracts the new comments from every downloaded page file in
	 * {@code rFolder} and appends them to {@code wFolder + "comment_" + <file name>}.
	 * <p>
	 * For each file the marker line containing {@code "=========="} (written by
	 * {@link #updateCrawl(String, String)}) supplies the number of new comments
	 * and the total count. Files with new comments are also listed in
	 * {@code crawlDir + "updatelist.txt"}.
	 *
	 * @param rFolder folder holding the downloaded page files, ending with a path separator
	 * @param wFolder folder receiving the extracted comments, ending with a path separator
	 * @throws IOException if {@code rFolder} cannot be listed or a file cannot be read
	 */
	public void updateComment(String rFolder, String wFolder) throws IOException{
		String[] fileName=listDir(rFolder);
		Mql_Write2File w2update= new Mql_Write2File(crawlDir+"updatelist"+".txt");
		for(int fileI=0;fileI<fileName.length;fileI++){
			String pagePath=rFolder+fileName[fileI];
			int[] counts=readUpdateMarker(pagePath);
			int newCommentNum=counts[0];
			int commentNum=counts[1];

			// Each reader is closed per file; the original closed only the last one
			// after the loop and failed with a NullPointerException on an empty folder.
			BufferedReader brComment=openReader(pagePath);
			try{
				Mql_Write2File wr2file=new Mql_Write2File(wFolder+"comment_"+fileName[fileI]);
				if(newCommentNum!=0){
					wr2file.writeWebDownFile("\r\n===============================");
					wr2file.writeWebDownFile("共"+newCommentNum+"条新评价\r\n");
					w2update.writeWebDownFile(fileName[fileI]+"\r\n");
				}
				extractComments(brComment, wr2file, newCommentNum);
				if(newCommentNum!=0){
					wr2file.writeWebDownFile("========共"+commentNum+"条评论");
					System.out.println("抽取  "+fileName[fileI]+"  的评论内容is OK");
				}
			}finally{
				closeQuietly(brComment);
			}
		}
	}

	/**
	 * Runs the whole update: crawls every URL list in {@code crawlDir + "评论URL4\\"}
	 * into {@code crawlDir + "新评论网页\\"}, then extracts the new comments of
	 * every sub-folder created there into {@code crawlDir + "评论内容\\"}.
	 *
	 * @throws IOException if one of the folders cannot be listed or extraction fails
	 * @throws InterruptedException propagated from {@link #updateCrawl(String, String)}
	 */
	public void updateTest() throws IOException, InterruptedException{
		String newComPage=crawlDir+"新评论网页\\";
		String urlDir=crawlDir+"评论URL4\\";

		String[] urlList=listDir(urlDir);
		for(int urlI=0;urlI<urlList.length;urlI++){
			updateCrawl(urlDir+urlList[urlI], newComPage);
		}

		String[] pageDirList=listDir(newComPage);
		for(int pageI=0;pageI<pageDirList.length;pageI++){
			updateComment(newComPage+pageDirList[pageI]+"\\", crawlDir+"评论内容\\"+pageDirList[pageI]+"\\");
		}
	}

	public static void main(String[] args) throws IOException, InterruptedException{
		Update updt=new Update("D:\\863项目\\crawl~~\\");
		updt.updateTest();
	}

	/** Opens {@code path} for line-based reading in the class encoding. */
	private BufferedReader openReader(String path) throws IOException{
		return new BufferedReader(new InputStreamReader(new FileInputStream(path),encoding));
	}

	/** Closes {@code c} if it is non-null, ignoring a failure of the close itself. */
	private static void closeQuietly(Closeable c){
		if(c!=null){
			try{
				c.close();
			}catch(IOException ignored){
				// Nothing useful can be done when closing a reader fails.
			}
		}
	}

	/** Lists {@code dir}, failing with a clear message instead of returning null. */
	private static String[] listDir(String dir) throws IOException{
		String[] names=new File(dir).list();
		if(names==null){
			throw new IOException("Cannot list directory: "+dir);
		}
		return names;
	}

	/**
	 * Rewrites a product URL into the URL of its first comment page:
	 * "index" becomes "more", the "quote=0&amp;" parameter is removed and
	 * "&amp;page=0" is inserted.
	 * <p>
	 * All offsets are looked up in the unmodified {@code url} but applied to
	 * the already edited buffer; the constants -1, +7 and -9 compensate for
	 * the characters removed by the earlier edits. This presumably relies on
	 * "index", "quote=0&amp;" and "&amp;title" appearing in that order -- verify
	 * against the URL list files before changing it.
	 */
	private static StringBuilder toCommentPageUrl(String url){
		StringBuilder sb=new StringBuilder(url);
		sb.replace(url.indexOf("index"),url.indexOf("index")+5, "more");
		sb.replace(url.indexOf("quote")-1,url.indexOf("quote=0&")+7, "");
		sb.insert(url.indexOf("&title")-9, "&page=0");
		return sb;
	}

	/**
	 * Returns the line that follows the first line containing "turn-page" in
	 * {@code path}, or null when there is no such line or nothing follows it.
	 */
	private String readLineAfterTurnPage(String path) throws IOException{
		BufferedReader reader=openReader(path);
		try{
			for(String line=reader.readLine();line!=null;line=reader.readLine()){
				if(line.contains("turn-page")){
					return reader.readLine();
				}
			}
			return null;
		}finally{
			closeQuietly(reader);
		}
	}

	/**
	 * Scans {@code path} for marker lines containing "==========" and returns
	 * {@code {newCommentCount, totalCommentCount}} of the last one, or
	 * {@code {0, 0}} when the file has no marker line.
	 */
	private int[] readUpdateMarker(String path) throws IOException{
		int[] counts={0,0};
		BufferedReader reader=openReader(path);
		try{
			for(String line=reader.readLine();line!=null;line=reader.readLine()){
				if(line.contains("==========")){
					int sep=line.indexOf("=====*");
					counts[0]=Integer.parseInt(line.substring(10,sep));
					counts[1]=Integer.parseInt(line.substring(sep+6));
				}
			}
		}finally{
			closeQuietly(reader);
		}
		return counts;
	}

	/**
	 * Copies at most {@code limit} comments from a downloaded page to {@code out}.
	 * <p>
	 * A line containing "comment-bar-a" starts the author/date header of a
	 * comment; a line containing "comment-bar-b" starts its text, which runs
	 * up to the line containing "&lt;/div&gt;".
	 * NOTE(review): a page that ends in the middle of a comment makes
	 * readLine() return null and this method fail with a
	 * NullPointerException, exactly as the original code did.
	 */
	private void extractComments(BufferedReader reader, Mql_Write2File out, int limit) throws IOException{
		int commentI=0;
		for(String line=reader.readLine();commentI<limit&&line!=null;line=reader.readLine()){
			if(line.contains("comment-bar-a")){
				// Author and date of the comment.
				line=reader.readLine();
				line=line.replace(" ","");// drop the spaces of the markup
				String author;
				if((line.lastIndexOf(">")+1)==line.length()){
					// Line ends with a tag: the author is a registered user and the
					// name sits inside an element on the next line.
					line=reader.readLine();
					author=line.substring(line.indexOf(">")+1, line.lastIndexOf("<"));
				}else{
					author=line.substring(line.lastIndexOf(">")+1, line.length());
				}
				out.writeWebDownFile("("+commentI+")  "+author+"\t");
				line=reader.readLine();
				String date=line.substring(line.indexOf(">")+1,line.lastIndexOf("<"));
				out.writeWebDownFile(date+"\r\n");
			}
			// Deliberately not an else-branch: the header handling above advances
			// the reader, and the line it stops on is checked here as well.
			if(line.contains("comment-bar-b")){
				// Text of the comment.
				commentI++;
				String text;
				if(!line.contains("</div>")){
					StringBuilder sb=new StringBuilder(line.substring(line.indexOf(">")+1, line.length()));
					line=reader.readLine();
					while(!line.contains("</div>")){
						sb.append(line);
						line=reader.readLine();
					}
					sb.append(line.substring(0,line.indexOf("<")));
					text=sb.toString();
				}else{
					text=line.substring(line.indexOf(">")+1,line.indexOf("</div>"));
				}
				out.writeWebDownFile(text+"\r\n");
				out.writeWebDownFile("***************************************************\r\n");
			}
		}
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -