📄 newupdate.java
字号:
package cn.casia.ailab.ldy.cmt;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
/**
 * Updates locally stored comment files by comparing the time of the newest
 * comment already saved with the times of the comments on the downloaded
 * comment pages, and appending only the newer ones.
 *
 * @author Ellen
 */
public class NewUpdate {
    /** Charset used for every file read in this class and passed to the page downloader. */
    private static final String ENCODING = "gb18030";

    /** Root directory of the saved comment files (one sub-directory per {@code pcName}). */
    private static final String COMMENT_DIR = "D:\\863项目\\crawl~~\\评论内容\\";

    /** Directory where each downloaded comment page is written before it is parsed. */
    private static final String PAGE_CACHE_DIR = "D:\\863项目\\crawl~~\\新评论网页\\";

    /** Directory whose entries are the URL list files processed by {@link #main(String[])}. */
    private static final String URL_LIST_DIR = "D:\\863项目\\crawl~~\\评论URL4\\";

    /** Substring that identifies a batch separator line in a saved comment file. */
    private static final String BATCH_MARKER = "=======";

    /**
     * Number of new comments on one page that triggers fetching the next page.
     * NOTE(review): presumably the site's page size - confirm.
     */
    private static final int FULL_PAGE_COUNT = 10;

    public NewUpdate() {
    }

    /**
     * Returns the time of the newest comment recorded in a saved comment file.
     *
     * <p>The file is scanned to the end; every line that directly follows a
     * separator line (one containing {@code "======="}) and is not itself a
     * separator is treated as a comment header, and the time extracted from the
     * last such header is returned.
     *
     * @param pcName  camera brand (name of the sub-directory under the comment directory)
     * @param pcStyle camera model (name of the comment file, without the {@code .txt} suffix)
     * @return the extracted time text, or an empty string if no header line follows a separator
     * @throws IOException if the comment file cannot be opened or read
     */
    public String getLatestTime(String pcName, String pcStyle) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(COMMENT_DIR + pcName + "/" + pcStyle + ".txt"), ENCODING));
        try {
            String latestTime = "";
            boolean previousWasSeparator = false;
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.contains(BATCH_MARKER)) {
                    // Consecutive separators simply keep the flag set.
                    previousWasSeparator = true;
                } else if (previousWasSeparator) {
                    latestTime = extractTimestamp(line);
                    previousWasSeparator = false;
                }
            }
            return latestTime;
        } finally {
            // Close even when reading fails so the file handle is not leaked.
            reader.close();
        }
    } // getLatestTime

    /**
     * Downloads one comment page, extracts every comment whose time is later than
     * {@code oldLatestTime}, and returns them as text. When a page yields exactly
     * {@link #FULL_PAGE_COUNT} new comments the next page is fetched recursively
     * and its result appended.
     *
     * <p>Each returned comment has the form
     * {@code person \t time \r\n content \r\n ***...*** \r\n}.
     *
     * @param url           comment page URL; must contain {@code page=N} directly followed by
     *                      {@code &title} when a following page has to be requested
     * @param pcStyle       camera model, used as the name of the cached page file
     * @param oldLatestTime time of the newest comment already stored; times are compared
     *                      as strings
     * @return the newly found comments, or an empty string if there are none
     * @throws IOException          if the page cannot be written or read, or ends in the
     *                              middle of a comment entry
     * @throws InterruptedException propagated from the page download
     */
    public String updateComment(String url, String pcStyle, String oldLatestTime)
            throws IOException, InterruptedException {
        String cachePath = PAGE_CACHE_DIR + pcStyle + ".txt";
        Mql_WebDownloader webdown = new Mql_WebDownloader();
        String comPage = webdown.webpageDownload(url, ENCODING);
        Mql_Write2File w2f = new Mql_Write2File(cachePath);
        w2f.writeNewFile(comPage);

        StringBuilder comContent = new StringBuilder(); // accumulated new comments
        int countNum = 0;                               // number of new comments on this page
        boolean sign = false;                           // true while a new header awaits its body
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(cachePath), ENCODING));
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.contains("comment-bar-a")) {
                    // Header section: commenter and comment time.
                    String personLine = readRequiredLine(reader).replace(" ", "");
                    String person;
                    if (personLine.lastIndexOf(">") + 1 == personLine.length()) {
                        // Line ends with a tag: the name is on the following line
                        // (original author's note: commenter is a registered user).
                        System.out.println("ok");
                        personLine = readRequiredLine(reader);
                        person = personLine.substring(
                                personLine.indexOf(">") + 1, personLine.lastIndexOf("<"));
                    } else {
                        person = personLine.substring(personLine.lastIndexOf(">") + 1);
                    }
                    String timeLine = readRequiredLine(reader);
                    String commentTime = timeLine.substring(
                            timeLine.indexOf(">") + 1, timeLine.lastIndexOf("<"));
                    // Only comments strictly newer than the stored newest one are kept.
                    if (extractTimestamp(commentTime).compareTo(oldLatestTime) > 0) {
                        sign = true;
                        comContent.append(person).append("\t").append(commentTime).append("\r\n");
                        countNum++;
                    }
                } else if (sign && line.contains("comment-bar-b")) {
                    // Body section, taken only when its header was accepted above.
                    comContent.append(readCommentBody(line, reader))
                            .append("\r\n***************************************************\r\n");
                    sign = false;
                }
            }
        } finally {
            // Close before recursing: the next page overwrites the same cache file.
            reader.close();
        }

        if (countNum == FULL_PAGE_COUNT) {
            // Splice the incremented page number into the URL in place. Replacing the
            // literal "page=0" would only work on the first page and would then
            // request the same page forever.
            int pageStart = url.indexOf("page=") + 5;
            int pageEnd = url.indexOf("&title");
            int nextPage = Integer.parseInt(url.substring(pageStart, pageEnd)) + 1;
            String nextUrl = url.substring(0, pageStart) + nextPage + url.substring(pageEnd);
            comContent.append(updateComment(nextUrl, pcStyle, oldLatestTime));
        }
        return comContent.toString();
    } // updateComment

    /**
     * Processes one URL list file: for every {@code url \t model} line it fetches the
     * comments newer than those already stored, appends them followed by a
     * {@code "========"} separator line to the model's comment file, and records the
     * model in the updated-products list when anything new was found.
     *
     * @param urlTxt path of the URL list file; the text between the last backslash and
     *               {@code ".txt"} is used as the brand name
     * @throws IOException          if a file cannot be read or written
     * @throws InterruptedException propagated from the page download
     */
    public void update(String urlTxt) throws IOException, InterruptedException {
        String pcName = urlTxt.substring(urlTxt.lastIndexOf("\\") + 1, urlTxt.indexOf(".txt"));
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(urlTxt), ENCODING));
        try {
            Mql_Write2File w2file = new Mql_Write2File(COMMENT_DIR + "更新产品列表.txt");
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] urlLine = line.split("\t");
                if (urlLine.length < 2) {
                    // Blank or tab-less line: there is no model name to work with.
                    if (line.trim().length() > 0) {
                        System.err.println("Skipping malformed URL list line: " + line);
                    }
                    continue;
                }
                String oldLatest = getLatestTime(pcName, urlLine[1]);
                urlLine[0] = toCommentPageUrl(urlLine[0]);
                String comCnt = updateComment(urlLine[0], urlLine[1], oldLatest);
                Mql_Write2File w2f = new Mql_Write2File(
                        COMMENT_DIR + pcName + "\\" + urlLine[1] + ".txt");
                w2f.writeWebDownFile(comCnt + "========\r\n");
                // Compare by content; reference comparison against "" is unreliable.
                if (comCnt.length() > 0) {
                    w2file.writeWebDownFile(urlLine[1] + "\r\n");
                }
                System.out.println(urlLine[1]);
            }
        } finally {
            reader.close();
        }
    } // update(String urlTxt)

    /**
     * Runs {@link #update(String)} on every URL list file in the URL list directory.
     *
     * @param args unused
     * @throws IOException          if the directory cannot be listed or an update fails
     * @throws InterruptedException propagated from the page download
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        NewUpdate newup = new NewUpdate();
        String[] dirList = new File(URL_LIST_DIR).list();
        if (dirList == null) {
            // File.list() returns null when the path is not a readable directory.
            throw new FileNotFoundException(
                    "URL list directory not found or unreadable: " + URL_LIST_DIR);
        }
        for (int dirI = 0; dirI < dirList.length; dirI++) {
            if (dirList[dirI].indexOf(".txt") < 0) {
                // update() derives the brand name from the text before ".txt" and
                // cannot handle entries without it.
                continue;
            }
            newup.update(URL_LIST_DIR + dirList[dirI]);
        }
    }

    /**
     * Returns the part of {@code text} that follows its second-to-last space, or the
     * whole text when it contains fewer than two spaces.
     */
    private static String extractTimestamp(String text) {
        int lastSpace = text.lastIndexOf(" ");
        return text.substring(text.lastIndexOf(" ", lastSpace - 1) + 1);
    }

    /** Reads the next line, failing with an IOException instead of returning null at end of input. */
    private static String readRequiredLine(BufferedReader reader) throws IOException {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Unexpected end of comment page while parsing a comment entry");
        }
        return line;
    }

    /**
     * Extracts the comment text that starts on {@code firstLine} (after its first
     * {@code '>'}) and runs up to the closing {@code </div>}, reading further lines
     * from {@code reader} when the closing tag is not on the first line.
     */
    private static String readCommentBody(String firstLine, BufferedReader reader)
            throws IOException {
        if (firstLine.contains("</div>")) {
            return firstLine.substring(firstLine.indexOf(">") + 1, firstLine.indexOf("</div>"));
        }
        StringBuilder body = new StringBuilder(firstLine.substring(firstLine.indexOf(">") + 1));
        String next = readRequiredLine(reader);
        while (!next.contains("</div>")) {
            body.append(next);
            next = readRequiredLine(reader);
        }
        // On the closing line keep only the text before its first tag.
        body.append(next.substring(0, next.indexOf("<")));
        return body.toString();
    }

    /**
     * Rewrites a product URL into the standard comment page URL: {@code "index"}
     * becomes {@code "more"}, the {@code "quote=0&"} parameter is removed and
     * {@code "&page=0"} is inserted before {@code "&title"}.
     *
     * <p>All offsets are computed on the ORIGINAL string but applied to the buffer
     * after earlier edits have shortened it; the constants (-1, +7, -9) compensate
     * for that. This assumes "index", "quote=0&" and "&title" occur in that order -
     * TODO confirm against the URL list files. Do not reorder these statements.
     */
    private static String toCommentPageUrl(String productUrl) {
        StringBuilder sb = new StringBuilder(productUrl);
        sb.replace(productUrl.indexOf("index"), productUrl.indexOf("index") + 5, "more");
        sb.replace(productUrl.indexOf("quote") - 1, productUrl.indexOf("quote=0&") + 7, "");
        sb.insert(productUrl.indexOf("&title") - 9, "&page=0");
        return sb.toString();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -