⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 newupdate.java

📁 用于抽取网页文本评论的源程序
💻 JAVA
字号:
package cn.casia.ailab.ldy.cmt;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

/**
 * Updates stored product comments by comparing the timestamp of the most
 * recently stored comment with the timestamps found on freshly downloaded
 * comment pages, and appending only the newer comments.
 *
 * <p>All file locations are hard-coded below; all text files are read using
 * the {@code gb18030} encoding.
 *
 * @author Ellen
 */
public class NewUpdate {
	/** Character encoding used for every file read and passed to the page downloader. */
	private static final String ENCODING="gb18030";

	/** Directory holding one sub-directory of stored comment files per brand. */
	private static final String COMMENT_DIR="D:\\863项目\\crawl~~\\评论内容\\";

	/** Directory where each freshly downloaded comment page is saved before parsing. */
	private static final String NEW_PAGE_DIR="D:\\863项目\\crawl~~\\新评论网页\\";

	/** Directory whose entries are the per-brand URL list files processed by {@link #main}. */
	private static final String URL_LIST_DIR="D:\\863项目\\crawl~~\\评论URL4\\";

	/** Marker line written after every update batch in a stored comment file. */
	private static final String BATCH_MARKER="=======";

	/** Text appended after every extracted comment body. */
	private static final String COMMENT_SEPARATOR="\r\n***************************************************\r\n";

	/**
	 * Number of new comments on one page that triggers fetching the next page.
	 * NOTE(review): presumably the site shows 10 comments per page - confirm.
	 */
	private static final int COMMENTS_PER_PAGE=10;

	public NewUpdate() {

	}

	/**
	 * Opens a buffered reader on {@code path} using {@link #ENCODING}.
	 * The underlying stream is closed if the reader cannot be created.
	 */
	private static BufferedReader openReader(String path) throws IOException{
		FileInputStream in=new FileInputStream(path);
		try{
			return new BufferedReader(new InputStreamReader(in,ENCODING));
		}catch(IOException e){
			in.close();
			throw e;
		}
	}

	/**
	 * Returns the part of {@code text} that follows its second-to-last space,
	 * or the whole string when it contains fewer than two spaces.
	 * NOTE(review): presumably this isolates a trailing "date time" pair - confirm
	 * against the real page format.
	 */
	private static String trailingTimestamp(String text){
		return text.substring(text.lastIndexOf(" ",text.lastIndexOf(" ")-1)+1);
	}

	/**
	 * Reads a stored comment file and returns the timestamp taken from the
	 * first non-marker line that follows the last batch marker line.
	 *
	 * @param pcName  camera brand (name of the sub-directory under the comment directory)
	 * @param pcStyle camera model (file name without the {@code .txt} extension)
	 * @return the extracted timestamp, or an empty string when the file contains
	 *         no marker line followed by a regular line
	 * @throws IOException if the file cannot be opened or read
	 */
	public String getLatestTime(String pcName,String pcStyle) throws IOException{
		BufferedReader brComment=openReader(COMMENT_DIR+pcName+"/"+pcStyle+".txt");
		try{
			String latestTime="";
			boolean afterMarker=false;
			String line;
			while((line=brComment.readLine())!=null){
				if(line.contains(BATCH_MARKER)){
					afterMarker=true;
				}else if(afterMarker){
					// Only the line directly following a run of marker lines is inspected.
					latestTime=trailingTimestamp(line);
					afterMarker=false;
				}
			}
			return latestTime;
		}finally{
			brComment.close();
		}
	}//getLatestTime

	/**
	 * Downloads the comment page at {@code url}, saves it, and returns every
	 * comment on it whose timestamp compares greater than {@code oldLatestTime}.
	 * When a page yields exactly {@link #COMMENTS_PER_PAGE} new comments, the
	 * following page is fetched as well and its result is appended.
	 *
	 * <p>Example URL shape (from the original author's test call):
	 * {@code http://comments.pchome.net/more.php?comment_id=4&type_id=2775&product_id=2775&page=1&title=...}
	 *
	 * <p>Timestamps are compared as plain strings.
	 * NOTE(review): this is only chronological if both sides use the same
	 * zero-padded format - confirm.
	 *
	 * @param url           comment page URL; to follow pagination it must contain
	 *                      {@code page=N} immediately followed by {@code &title}
	 * @param pcStyle       camera model, used as the name of the saved page file
	 * @param oldLatestTime timestamp of the newest comment already stored
	 * @return the new comments, each as "person TAB time", the comment text and a
	 *         separator line; empty when nothing is newer
	 * @throws IOException          if the saved page cannot be read
	 * @throws InterruptedException declared for the download helper
	 */
	public String updateComment(String url, String pcStyle,String oldLatestTime) throws IOException, InterruptedException{
		String pageFile=NEW_PAGE_DIR+pcStyle+".txt";
		Mql_WebDownloader webdown=new Mql_WebDownloader();
		String comPage=webdown.webpageDownload(url, ENCODING);
		Mql_Write2File w2f=new Mql_Write2File(pageFile);
		w2f.writeNewFile(comPage);

		StringBuilder comContent=new StringBuilder();// collects the new comments that are returned
		int countNum;// number of new comments found on this page
		BufferedReader brComPage=openReader(pageFile);
		try{
			countNum=appendNewComments(brComPage,oldLatestTime,comContent);
		}finally{
			brComPage.close();
		}

		if(countNum==COMMENTS_PER_PAGE){
			// Rebuild the URL around the current page number. The previous code replaced
			// the literal "page=0", which never matched after the first page and made the
			// method fetch the same page again without end.
			int pageStart=url.indexOf("page=")+5;
			int pageEnd=url.indexOf("&title");
			int nextPage=Integer.parseInt(url.substring(pageStart,pageEnd))+1;
			String nextUrl=url.substring(0,pageStart)+nextPage+url.substring(pageEnd);
			comContent.append(updateComment(nextUrl,pcStyle,oldLatestTime));
		}
		return comContent.toString();
	}//updateComment

	/**
	 * Scans one saved comment page and appends every comment newer than
	 * {@code oldLatestTime} to {@code out}. Parsing stops quietly if the page
	 * ends in the middle of a comment record.
	 *
	 * @return the number of new comments appended
	 */
	private static int appendNewComments(BufferedReader page,String oldLatestTime,StringBuilder out) throws IOException{
		int added=0;
		boolean awaitingBody=false;// true once a new comment header was emitted and its text is still due
		String line;
		while((line=page.readLine())!=null){
			if(line.contains("comment-bar-a")){// header block: extract commenter and time
				line=page.readLine();
				if(line==null){
					break;
				}
				line=line.replace(" ","");// drop spaces from the markup line
				String person;
				if((line.lastIndexOf(">")+1)==line.length()){
					// Line ends with a tag: the commenter name sits inside an element on the
					// next line (original comment: commenter is a registered user).
					System.out.println("ok");
					line=page.readLine();
					if(line==null){
						break;
					}
					person=line.substring(line.indexOf(">")+1,line.lastIndexOf("<"));
				}else{
					person=line.substring(line.lastIndexOf(">")+1);
				}
				line=page.readLine();
				if(line==null){
					break;
				}
				String time=line.substring(line.indexOf(">")+1,line.lastIndexOf("<"));
				// Keep only comments strictly newer than the newest stored one.
				if(trailingTimestamp(time).compareTo(oldLatestTime)>0){
					awaitingBody=true;
					out.append(person).append("\t").append(time).append("\r\n");
					added++;
				}
				continue;
			}
			if(awaitingBody&&line.contains("comment-bar-b")){// body block: extract comment text
				StringBuilder body=new StringBuilder();
				if(line.contains("</div>")){
					body.append(line.substring(line.indexOf(">")+1,line.indexOf("</div>")));
				}else{
					// The text spans several lines; collect up to the line holding the closing tag.
					body.append(line.substring(line.indexOf(">")+1));
					line=page.readLine();
					while(line!=null&&!line.contains("</div>")){
						body.append(line);
						line=page.readLine();
					}
					if(line!=null){
						body.append(line.substring(0,line.indexOf("<")));
					}
				}
				out.append(body).append(COMMENT_SEPARATOR);
				awaitingBody=false;
				if(line==null){
					break;// page ended inside the comment text
				}
			}
		}
		return added;
	}

	/**
	 * Converts a product comment URL into the first page of the full comment
	 * list: the first {@code index} becomes {@code more}, the first
	 * {@code quote=0&} is removed, and {@code &page=0} is inserted before the
	 * first {@code &title}.
	 *
	 * @throws StringIndexOutOfBoundsException if one of the three markers is missing
	 */
	private static String toCommentListUrl(String productUrl){
		StringBuilder sb=new StringBuilder(productUrl);
		// Each position is looked up in the builder itself, so no manual offset
		// compensation for earlier edits is needed.
		int indexPos=sb.indexOf("index");
		sb.replace(indexPos,indexPos+"index".length(),"more");
		int quotePos=sb.indexOf("quote=0&");
		sb.delete(quotePos,quotePos+"quote=0&".length());
		sb.insert(sb.indexOf("&title"),"&page=0");
		return sb.toString();
	}

	/**
	 * Processes one brand's URL list file. Each line holds a product URL and a
	 * model name separated by a tab. For every model the new comments are
	 * fetched and written, followed by a batch marker line, to the model's
	 * comment file; models that received new comments are also recorded in the
	 * updated-products list file.
	 * NOTE(review): {@code writeWebDownFile} is presumably an append - confirm.
	 *
	 * @param urlTxt path of the URL list file; the brand name is the text between
	 *               the last backslash and {@code .txt}
	 * @throws IOException          if a file cannot be read
	 * @throws InterruptedException declared for the download helper
	 */
	public void update(String urlTxt) throws IOException, InterruptedException{
		String pcName=urlTxt.substring(urlTxt.lastIndexOf("\\")+1,urlTxt.indexOf(".txt"));
		BufferedReader brUrlTxt=openReader(urlTxt);
		try{
			Mql_Write2File w2file=new Mql_Write2File(COMMENT_DIR+"更新产品列表.txt");
			String line;
			while((line=brUrlTxt.readLine())!=null){
				String[] urlLine=line.split("\t");
				if(urlLine.length<2){
					// Blank or malformed entry: skip it instead of failing on urlLine[1].
					System.err.println("Skipping malformed line in "+urlTxt+": "+line);
					continue;
				}
				String model=urlLine[1];
				String oldLatest=getLatestTime(pcName,model);
				String comCnt=updateComment(toCommentListUrl(urlLine[0]),model,oldLatest);
				Mql_Write2File w2f=new Mql_Write2File(COMMENT_DIR+pcName+"\\"+model+".txt");
				w2f.writeWebDownFile(comCnt+"========\r\n");
				// Compare by content; reference comparison against "" is unreliable.
				if(comCnt.length()>0){
					w2file.writeWebDownFile(model+"\r\n");
				}
				System.out.println(model);
			}
		}finally{
			brUrlTxt.close();
		}
	}//update(String urlTxt)

	/**
	 * Runs {@link #update(String)} for every entry of the URL list directory.
	 */
	public static void main(String[] args) throws IOException, InterruptedException{
		NewUpdate newup= new NewUpdate();
		File fl=new File(URL_LIST_DIR);
		String[] dirList=fl.list();
		if(dirList==null){
			// File.list() returns null when the path is not a readable directory.
			System.err.println("Cannot list directory: "+fl);
			return;
		}
		for(int dirI=0;dirI<dirList.length;dirI++){
			newup.update(URL_LIST_DIR+dirList[dirI]);
		}
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -