// update.java
// (code-viewer residue: "font size" control label)
package cn.casia.ailab.ldy.cmt;
import java.io.*;
/**
 * Updates stored comment data by comparing the comment count currently shown
 * on a product's comment page with the count recorded in an earlier extraction.
 *
 * <p>All paths are built by plain string concatenation with {@code "\\"}, so
 * every folder argument is expected to end with a backslash (Windows-style paths).
 *
 * <p>Relies on the project classes {@code Mql_WebDownloader} and
 * {@code Mql_Write2File}, which are not defined in this file; their exact
 * behaviour (append vs. overwrite, flushing) is assumed from how they are used
 * here - TODO confirm.
 *
 * @author Ellen
 */
public class Update {
    /** Root working folder; must end with a path separator. */
    private final String crawlDir;
    /** Charset used for every file read and passed to the page downloader. */
    private final String encoding="gb18030";

    /**
     * @param aCrawlDir root working folder, ending with a path separator
     */
    public Update(String aCrawlDir){
        crawlDir=aCrawlDir;
    }

    /**
     * Reads the previously extracted comment file of one product and returns
     * the comment count recorded in it.
     *
     * <p>Every line containing {@code "===="} is treated as a candidate count
     * line; the number is taken from character 9 up to the last three
     * characters (matching a footer such as {@code "========共N条评论"}). The
     * last line that parses wins. Candidate lines that do not hold a number at
     * that position are skipped instead of aborting the whole run.
     *
     * <p>NOTE(review): this reads from {@code 评论内容6} while
     * {@link #updateTest()} writes to {@code 评论内容}; confirm which folder is intended.
     *
     * @param pc camera brand (sub-folder name)
     * @param pcStyle camera model (part of the file name)
     * @return the old comment count, or 0 if no count line was found
     * @throws IOException if the old comment file cannot be opened or read
     */
    public int getOldNum(String pc,String pcStyle) throws IOException{
        String dir=crawlDir+"评论内容6\\"+pc+"\\"+"comment_"+pcStyle+".txt";
        BufferedReader dirReader=new BufferedReader(new InputStreamReader(new FileInputStream(dir),encoding));
        try{
            int oldNum=0;
            for(String line=dirReader.readLine();line!=null;line=dirReader.readLine()){
                // substring(9, length-3) needs at least 12 characters to be in range.
                if(line.contains("====")&&line.length()>=12){
                    try{
                        oldNum=Integer.parseInt(line.substring(9,line.length()-3));
                    }catch(NumberFormatException ignored){
                        // A separator line that is not a count footer; keep the previous value.
                    }
                }
            }
            return oldNum;
        }finally{
            closeQuietly(dirReader);
        }
    }

    /**
     * Downloads the current comment pages for every product listed in a URL file.
     *
     * <p>Each line of {@code rFile} is {@code url<TAB>productName}. For every
     * product the first comment page is saved to
     * {@code wFolder + <rFile base name> + "\\" + productName + ".txt"}; when the
     * page reports more comments than {@link #getOldNum} returns, an update
     * marker and the additional pages are written to the same file.
     *
     * <p>An {@link IOException} raised while processing is printed to
     * {@code System.err} and ends the processing of this URL file; it is not rethrown.
     *
     * @param rFile path of the URL list file (expected to end in {@code .txt})
     * @param wFolder parent output folder, ending with a path separator
     * @throws IOException declared for interface compatibility
     * @throws InterruptedException if a page download is interrupted
     */
    public void updateCrawl(String rFile,String wFolder) throws IOException, InterruptedException{
        BufferedReader brUrl=null;
        try{
            brUrl=new BufferedReader(new InputStreamReader(new FileInputStream(rFile),encoding));
            // Base name of rFile: text after the last backslash, up to ".txt" when present.
            int nameStart=rFile.lastIndexOf("\\")+1;
            int nameEnd=rFile.indexOf(".txt",nameStart);
            if(nameEnd<0){
                nameEnd=rFile.length();
            }
            String fileName=rFile.substring(nameStart,nameEnd);
            // One output sub-folder per URL file; mkdirs also creates a missing parent.
            wFolder=wFolder+fileName+"\\";
            File fd=new File(wFolder);
            if(!fd.exists()){
                fd.mkdirs();
            }
            Mql_WebDownloader webdown=new Mql_WebDownloader();
            for(String urlComLine=brUrl.readLine();urlComLine!=null;urlComLine=brUrl.readLine()){
                crawlProduct(webdown,urlComLine,fileName,wFolder);
            }
            System.out.println("抓取 "+fileName+" 的评论网页 OK");
            System.out.println(fileName+"is OK");
        }catch(IOException ex){
            System.err.println(ex);
        }finally{
            closeQuietly(brUrl);
        }
    }

    /**
     * Handles one line of the URL list: downloads the first comment page and,
     * if there are new comments, the following pages.
     */
    private void crawlProduct(Mql_WebDownloader webdown,String urlComLine,String fileName,String wFolder)
            throws IOException, InterruptedException{
        String[] urlLine=urlComLine.split("\t");
        if(urlLine.length<2){
            // Blank or tab-less line: there is no product name to write to.
            System.err.println("skipping malformed URL line: "+urlComLine);
            return;
        }
        urlLine[1]=urlLine[1].replace("/","_");
        // Rewrite the product URL into the standard comment-page URL.
        // All offsets are looked up in the ORIGINAL url; the -1 / +7 / -9 adjustments
        // compensate for the characters already removed from the buffer by the previous
        // step (this appears to assume "index" precedes "quote=0&", which precedes "&title").
        StringBuffer sbUrlLine=new StringBuffer(urlLine[0]);
        sbUrlLine.replace(urlLine[0].indexOf("index"),urlLine[0].indexOf("index")+5,"more");
        sbUrlLine.replace(urlLine[0].indexOf("quote")-1,urlLine[0].indexOf("quote=0&")+7,"");
        sbUrlLine.insert(urlLine[0].indexOf("&title")-9,"&page=0");
        urlLine[0]=sbUrlLine.toString();

        // Save the first comment page, then read it back to find the statistics line.
        String comContent=webdown.webpageDownload(urlLine[0],encoding);
        String pageFile=wFolder+urlLine[1]+".txt";
        Mql_Write2File wr2f=new Mql_Write2File(pageFile);
        wr2f.writeNewFile("========"+urlLine[1]+"=====\r\n");
        wr2f.writeWebDownFile(comContent+"\r\n");

        String statLine=readLineAfterTurnPage(pageFile);
        if(statLine==null){
            System.out.println(urlLine[1]+"查找turn-page无结果");
            return;
        }
        statLine=statLine.replace(" ","");
        String[] lineInf=statLine.split(";");
        // First ';'-separated field: total comment count between a 2-character prefix
        // and a 5-character suffix. (The third field carries the page count; it is not needed.)
        int commentNum=Integer.parseInt(lineInf[0].substring(2,lineInf[0].length()-5));

        int oldCommentNum=getOldNum(fileName,urlLine[1]);
        if(commentNum>oldCommentNum){
            int newcommNum=commentNum-oldCommentNum;
            // Page 0 is already saved; fetch the further pages (10 comments per page assumed).
            int pageDownNum=newcommNum/10+1;
            // Marker read back by updateComment: "==========<new>=====*<total>".
            wr2f.writeWebDownFile("=========="+newcommNum+"=====*"+commentNum+"\r\n");
            for(int pageI=1;pageI<pageDownNum;pageI++){
                setPageNumber(sbUrlLine,pageI);
                urlLine[0]=sbUrlLine.toString();
                comContent=webdown.webpageDownload(urlLine[0],encoding);
                wr2f.writeWebDownFile("========"+"第"+(pageI+1)+"页"+"=====\r\n");
                wr2f.writeWebDownFile(comContent+"\r\n");
            }
        }
    }

    /**
     * Returns the line that follows the first line containing {@code "turn-page"}
     * in the given file, or {@code null} if there is no such line.
     */
    private String readLineAfterTurnPage(String pageFile) throws IOException{
        BufferedReader brMore=new BufferedReader(new InputStreamReader(new FileInputStream(pageFile),encoding));
        try{
            for(String line=brMore.readLine();line!=null;line=brMore.readLine()){
                if(line.contains("turn-page")){
                    return brMore.readLine();
                }
            }
            return null;
        }finally{
            closeQuietly(brMore);
        }
    }

    /**
     * Replaces the whole number after {@code "page="} in the URL buffer.
     * Replacing all digits (not just one character) keeps the URL correct
     * once the page number has more than one digit.
     */
    private static void setPageNumber(StringBuffer url,int page){
        int start=url.indexOf("page=")+5;
        int end=start;
        while(end<url.length()&&Character.isDigit(url.charAt(end))){
            end++;
        }
        url.replace(start,end,String.valueOf(page));
    }

    /**
     * Extracts the new comments from every crawled page file in a folder.
     *
     * <p>For each file in {@code rFolder} the update marker written by
     * {@link #updateCrawl} is read to learn how many comments are new. If that
     * number is non-zero, the first that many comments (author, date, text) are
     * written to {@code wFolder + "comment_" + fileName} and the file name is
     * recorded in {@code crawlDir + "updatelist.txt"}.
     *
     * @param rFolder folder with crawled page files, ending with a path separator
     * @param wFolder output folder, ending with a path separator
     * @throws IOException if {@code rFolder} cannot be listed or a file cannot be read or written
     */
    public void updateComment(String rFolder,String wFolder) throws IOException{
        String[] fileName=new File(rFolder).list();
        if(fileName==null){
            // list() returns null when the path is not a directory or cannot be read.
            throw new FileNotFoundException("cannot list folder: "+rFolder);
        }
        Mql_Write2File w2update=new Mql_Write2File(crawlDir+"updatelist"+".txt");
        for(int fileI=0;fileI<fileName.length;fileI++){
            String pagePath=rFolder+fileName[fileI];
            // First pass: find the update marker (new-comment count and total count).
            int[] counts=readUpdateMarker(pagePath);
            int newCommentNum=counts[0];
            int commentNum=counts[1];

            // Second pass: extract the comments themselves.
            BufferedReader brComment=new BufferedReader(new InputStreamReader(new FileInputStream(pagePath),encoding));
            try{
                Mql_Write2File wr2file=new Mql_Write2File(wFolder+"comment_"+fileName[fileI]);
                if(newCommentNum!=0){
                    wr2file.writeWebDownFile("\r\n===============================");
                    wr2file.writeWebDownFile("共"+newCommentNum+"条新评价\r\n");
                    w2update.writeWebDownFile(fileName[fileI]+"\r\n");
                }
                extractNewComments(brComment,wr2file,newCommentNum);
                if(newCommentNum!=0){
                    wr2file.writeWebDownFile("========共"+commentNum+"条评论");
                    System.out.println("抽取 "+fileName[fileI]+" 的评论内容is OK");
                }
            }finally{
                closeQuietly(brComment);
            }
        }
    }

    /**
     * Scans a crawled page file for marker lines of the form
     * {@code "==========<new>=====*<total>"} and returns
     * {@code {new, total}} from the last one, or {@code {0, 0}} if none exists.
     */
    private int[] readUpdateMarker(String pagePath) throws IOException{
        int[] counts=new int[]{0,0};
        BufferedReader reader=new BufferedReader(new InputStreamReader(new FileInputStream(pagePath),encoding));
        try{
            for(String line=reader.readLine();line!=null;line=reader.readLine()){
                int starAt=line.indexOf("=====*");
                // Require the "=====*" separator after position 10 so that other lines
                // which merely contain a run of '=' are not mistaken for the marker.
                if(line.contains("==========")&&starAt>10){
                    counts[0]=Integer.parseInt(line.substring(10,starAt));
                    counts[1]=Integer.parseInt(line.substring(starAt+6));
                }
            }
        }finally{
            closeQuietly(reader);
        }
        return counts;
    }

    /**
     * Writes up to {@code newCount} comments read from {@code in} to {@code out}.
     * A {@code comment-bar-a} block supplies author and date, a
     * {@code comment-bar-b} block supplies the comment text. Extraction stops
     * quietly if the file ends in the middle of a block.
     */
    private void extractNewComments(BufferedReader in,Mql_Write2File out,int newCount) throws IOException{
        int commentI=0;
        String line=in.readLine();
        while(commentI<newCount&&line!=null){
            if(line.contains("comment-bar-a")){// author and date
                line=in.readLine();
                if(line==null){
                    return;
                }
                line=line.replace(" ","");// drop spaces inside the markup
                String author;
                if((line.lastIndexOf(">")+1)==line.length()){
                    // Line ends with a tag: the author is a registered user and
                    // the name sits inside an element on the next line.
                    line=in.readLine();
                    if(line==null){
                        return;
                    }
                    author=line.substring(line.indexOf(">")+1,line.lastIndexOf("<"));
                }else{
                    author=line.substring(line.lastIndexOf(">")+1,line.length());
                }
                out.writeWebDownFile("("+commentI+") "+author+"\t");
                line=in.readLine();
                if(line==null){
                    return;
                }
                String date=line.substring(line.indexOf(">")+1,line.lastIndexOf("<"));
                out.writeWebDownFile(date+"\r\n");
            }
            if(line.contains("comment-bar-b")){// comment text
                commentI++;
                String text;
                if(!line.contains("</div>")){
                    // Text spans several lines: collect until the closing </div>.
                    StringBuilder collected=new StringBuilder(line.substring(line.indexOf(">")+1));
                    line=in.readLine();
                    while(line!=null&&!line.contains("</div>")){
                        collected.append(line);
                        line=in.readLine();
                    }
                    if(line!=null){
                        collected.append(line.substring(0,line.indexOf("<")));
                    }
                    text=collected.toString();
                }else{
                    text=line.substring(line.indexOf(">")+1,line.indexOf("</div>"));
                }
                out.writeWebDownFile(text+"\r\n");
                out.writeWebDownFile("***************************************************\r\n");
                if(line==null){
                    return;// file ended before the closing </div>
                }
            }
            line=in.readLine();
        }
    }

    /**
     * Runs the full update: crawls every URL file in {@code crawlDir + "评论URL4\\"}
     * into {@code crawlDir + "新评论网页\\"}, then extracts the new comments of
     * every crawled sub-folder into {@code crawlDir + "评论内容\\"}.
     *
     * @throws IOException if the URL folder is missing or extraction fails
     * @throws InterruptedException if a page download is interrupted
     */
    public void updateTest() throws IOException, InterruptedException{
        String newComPage=crawlDir+"新评论网页\\";
        String urlDir=crawlDir+"评论URL4\\";
        String[] urlList=new File(urlDir).list();
        if(urlList==null){
            throw new FileNotFoundException("cannot list folder: "+urlDir);
        }
        for(int urlI=0;urlI<urlList.length;urlI++){
            updateCrawl(urlDir+urlList[urlI],newComPage);
        }
        String[] pageDirList=new File(newComPage).list();
        if(pageDirList==null){
            return;// nothing was crawled, so there is nothing to extract
        }
        for(int pageI=0;pageI<pageDirList.length;pageI++){
            // Only sub-folders hold crawled pages; skip stray files.
            if(!new File(newComPage+pageDirList[pageI]).isDirectory()){
                continue;
            }
            updateComment(newComPage+pageDirList[pageI]+"\\",crawlDir+"评论内容\\"+pageDirList[pageI]+"\\");
        }
    }

    /**
     * Entry point. An optional first argument overrides the root working folder
     * (it must end with a path separator); without it the original hard-coded
     * folder is used.
     */
    public static void main(String args[]) throws IOException, InterruptedException{
        String root=(args!=null&&args.length>0)?args[0]:"D:\\863项目\\crawl~~\\";
        Update updt=new Update(root);
        updt.updateTest();
    }

    /** Closes a resource, ignoring a failure to close; {@code null} is allowed. */
    private static void closeQuietly(Closeable resource){
        if(resource==null){
            return;
        }
        try{
            resource.close();
        }catch(IOException ignored){
            // Only readers are closed here; a failed close loses no data.
        }
    }
}
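// (code-viewer residue, not part of the program) Keyboard shortcuts:
//   copy code            Ctrl + C
//   search code          Ctrl + F
//   full-screen mode     F11
//   toggle theme         Ctrl + Shift + D
//   show shortcuts       ?
//   increase font size   Ctrl + =
//   decrease font size   Ctrl + -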