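// crawlcomment.java
// (The original header line and a "font size:" label were residue from the web code viewer this file was copied from.)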
package cn.casia.ailab.ldy.cmt;
import java.io.*;
/**
 * Downloads listing pages and product pages with {@code Mql_WebDownloader} and stores the raw
 * HTML on disk with {@code Mql_Write2File}, plus helpers that pull the page count and the
 * product links out of the stored HTML.
 *
 * <p>Every file is read, and every page is requested, using the {@code gb18030} encoding.
 * Paths handled by {@link #dirCreate(String, String)} and {@link #zhextract(String, String)}
 * are expected to use the Windows separator {@code "\\"}.
 *
 * @author Ellen
 */
public class CrawlComment {

    /** Encoding used for all file reads and passed to every page download. */
    private final String encoding = "gb18030";

    /** Line written between the sections of a downloaded listing file. */
    private static final String PAGE_SEPARATOR = "===========================================\r\n";

    /**
     * Lists the entries of a folder.
     *
     * @param aFolder path of the folder to list
     * @return the entry names (names only, not full paths) as returned by
     *         {@link File#list()}; {@code null} when {@code aFolder} is not a directory or
     *         cannot be read
     */
    public static String[] dirReader(String aFolder) {
        return new File(aFolder).list();
    }

    /**
     * Creates a folder named {@code newFolderName} next to {@code aFolder}.
     *
     * <p>If {@code aFolder} contains the text {@code "txt"} it is treated as a file path and the
     * new folder is created in the file's directory. Otherwise {@code aFolder} is treated as a
     * folder path that ends with {@code "\\"}, and the new folder is created as its sibling.
     * Only a single directory level is created ({@link File#mkdir()}).
     *
     * @param aFolder       reference path; a folder path must end with {@code "\\"}
     * @param newFolderName name of the folder to create
     * @return the path of the new folder, terminated with {@code "\\"}
     */
    public static String dirCreate(String aFolder, String newFolderName) {
        int lastSep = aFolder.lastIndexOf("\\");
        String parent;
        if (aFolder.contains("txt")) {
            // File path: keep everything up to and including the last separator.
            parent = aFolder.substring(0, lastSep + 1);
        } else {
            // Folder path with trailing separator: cut back to the separator before it.
            parent = aFolder.substring(0, aFolder.lastIndexOf("\\", lastSep - 1) + 1);
        }
        File fd = new File(parent + newFolderName);
        if (!fd.exists()) {
            fd.mkdir();
        }
        return fd.getPath() + "\\";
    }

    /**
     * Downloads every listing page named in {@code rFn} and writes it, together with all of its
     * follow-up pages, to one file per entry under {@code wFn}.
     *
     * <p>Each line of {@code rFn} must have the form {@code url<TAB>name}. Lines with fewer than
     * two fields are reported on {@code System.err} and skipped. The output file is
     * {@code wFn + name + ".txt"}. Follow-up pages 2..N are requested from the URL obtained by
     * replacing {@code ".html"} with {@code "___hits__" + page + ".html"}, where N is the value
     * returned by {@link #extract(String)} for the file just written.
     *
     * <p>An {@link IOException} raised while reading or downloading is printed to
     * {@code System.err} and ends the run; it is not rethrown.
     *
     * @param rFn absolute path of the input list file
     * @param wFn prefix (folder path including trailing separator) for the output files
     * @throws IOException          if closing the input file fails
     * @throws InterruptedException declared for callers; presumably raised by the downloader,
     *                              whose source is not visible here
     */
    public void homePageDownload(String rFn, String wFn) throws IOException, InterruptedException {
        BufferedReader homePageRead = null;
        try {
            homePageRead = new BufferedReader(new InputStreamReader(
                    new FileInputStream(rFn), encoding));
            Mql_WebDownloader wd = new Mql_WebDownloader();
            for (String line = homePageRead.readLine(); line != null; line = homePageRead.readLine()) {
                String[] s = line.split("\t");
                if (s.length < 2) {
                    // Without both fields there is no output file name; skip instead of crashing.
                    System.err.println("Skipping malformed line (expected url<TAB>name): " + line);
                    continue;
                }
                String url = s[0];
                String name = s[1];

                String homePageContent = wd.webpageDownload(url, encoding);
                String hpDownWriteFile = wFn + name + ".txt";
                Mql_Write2File wf = new Mql_Write2File(hpDownWriteFile);
                wf.writeWebDownFile("<url>" + url + "</url>\r\n");
                wf.writeWebDownFile(PAGE_SEPARATOR);
                wf.writeWebDownFile(homePageContent);

                // The page count is read back from the file that was just written.
                int pageNum = extract(hpDownWriteFile);
                int htmlIdx = url.indexOf(".html");
                if (pageNum >= 2 && htmlIdx < 0) {
                    // Follow-up URLs are derived from the ".html" suffix; none can be built here.
                    System.err.println("Cannot build follow-up page URLs, no \".html\" in: " + url);
                    pageNum = 1;
                }
                for (int pageI = 2; pageI <= pageNum; pageI++) {
                    String newURL = url.substring(0, htmlIdx) + "___hits__" + pageI + ".html";
                    homePageContent = wd.webpageDownload(newURL, encoding);
                    System.out.println(name);
                    wf.writeWebDownFile(homePageContent);
                    wf.writeWebDownFile(PAGE_SEPARATOR);
                }
                System.out.println(name + "is OK");
            }
        } catch (IOException ex) {
            System.err.println(ex);
        } finally {
            if (homePageRead != null) {
                homePageRead.close();
            }
        }
    }

    /**
     * Reads the page count from a stored listing page.
     *
     * <p>The method looks for the first line containing {@code <span class="page-changer">},
     * takes the line that follows it, and parses the single character located three positions
     * before that line's first {@code '<'} as an integer.
     *
     * @param r path of the stored listing page
     * @return the parsed page count, or {@code 0} when the marker is missing, the following
     *         line does not have the expected layout, or the file cannot be read
     * @throws IOException          if closing the file fails
     * @throws InterruptedException never thrown by the visible code; kept so existing callers
     *                              continue to compile
     */
    public int extract(String r) throws IOException, InterruptedException {
        int pageNum = 0;
        BufferedReader homePageRead = null;
        try {
            homePageRead = new BufferedReader(
                    new InputStreamReader(new FileInputStream(r), encoding));
            String readLine = homePageRead.readLine();
            while (readLine != null && !readLine.contains("<span class=\"page-changer\">")) {
                readLine = homePageRead.readLine();
            }
            if (readLine == null) {
                return 0; // marker not present in this page
            }
            readLine = homePageRead.readLine();
            if (readLine == null) {
                return 0; // marker was the last line of the file
            }
            int tagStart = readLine.indexOf("<");
            if (tagStart < 3) {
                System.err.println("Unexpected page-changer layout in " + r + ": " + readLine);
                return 0;
            }
            pageNum = Integer.parseInt(readLine.substring(tagStart - 3, tagStart - 2));
        } catch (NumberFormatException ex) {
            System.err.println(ex);
        } catch (IOException ex) {
            System.err.println(ex);
        } finally {
            // The reader was previously never closed.
            if (homePageRead != null) {
                homePageRead.close();
            }
        }
        return pageNum;
    }

    /**
     * Runs {@link #zhextract(String, String)} for every entry of {@code pathRead}, writing each
     * result to a file of the same name under {@code pathWrite}.
     *
     * @param pathRead  folder containing the stored listing pages
     * @param pathWrite folder receiving the extracted link files
     * @throws IOException          if {@code pathRead} cannot be listed, or as thrown by
     *                              {@link #zhextract(String, String)}
     * @throws InterruptedException as declared by {@link #zhextract(String, String)}
     */
    public void homePageExtract(String pathRead, String pathWrite) throws IOException, InterruptedException {
        String[] fileList = new File(pathRead).list();
        if (fileList == null) {
            // File.list() yields null for a non-directory or on an I/O error.
            throw new IOException("Cannot list directory: " + pathRead);
        }
        for (int i = 0; i < fileList.length; i++) {
            zhextract(pathRead + "/" + fileList[i], pathWrite + "/" + fileList[i]);
            System.out.println("抽取产品URL "+fileList[i]+" is ok");
        }
    }

    /**
     * Extracts product links from a stored listing page.
     *
     * <p>For every line containing {@code <h3 class="p-name"><a href="/digital_dc_} one record
     * {@code url<TAB>modelName} is written to {@code w}. The URL is
     * {@code "http://product.pchome.net"} followed by the text from the line's first
     * {@code '/'} up to and including {@code ".html"}. The model name is the text between
     * {@code "external">} and the next {@code </a>}. Lines that lack any of these pieces are
     * skipped.
     *
     * <p>An {@link IOException} raised while reading is printed to {@code System.err} and not
     * rethrown.
     *
     * @param r path of the stored listing page
     * @param w path of the output file; see {@code normalizeModelName} for the layout it must
     *          have when a model name contains {@code '?'} or {@code '*'}
     * @throws IOException          if closing the input file fails
     * @throws InterruptedException never thrown by the visible code; kept so existing callers
     *                              continue to compile
     */
    public void zhextract(String r, String w) throws IOException, InterruptedException {
        Mql_Write2File w2f = new Mql_Write2File(w);
        BufferedReader homePageRead = null;
        try {
            homePageRead = new BufferedReader(
                    new InputStreamReader(new FileInputStream(r), encoding));
            for (String readLine = homePageRead.readLine(); readLine != null;
                    readLine = homePageRead.readLine()) {
                if (!readLine.contains("<h3 class=\"p-name\"><a href=\"/digital_dc_")) {
                    continue;
                }
                String[] temp = readLine.split("\"external\">");
                if (temp.length <= 1) {
                    continue;
                }
                int hrefStart = readLine.indexOf("/");
                int hrefEnd = readLine.indexOf(".html");
                int nameEnd = temp[1].indexOf("</a>");
                if (hrefEnd < hrefStart || nameEnd < 0) {
                    System.err.println("Skipping product line with unexpected layout: " + readLine);
                    continue;
                }
                String s1 = "http://product.pchome.net" + readLine.substring(hrefStart, hrefEnd) + ".html";
                String s2 = normalizeModelName(temp[1].substring(0, nameEnd), w);
                w2f.writeWebDownFile(s1 + "\t" + s2 + "\r\n");
            }
        } catch (IOException ex) {
            System.err.println(ex);
        } finally {
            if (homePageRead != null) {
                homePageRead.close();
            }
        }
    }

    /**
     * Replaces the unreadable part of a model name with the output file's base name.
     *
     * <p>When {@code rawName} contains {@code '?'}, only the text after the last {@code '?'} is
     * kept; any {@code '*'} is removed. If either happened, the result is prefixed once with
     * the file name taken from {@code w}: the text starting two characters after the last
     * {@code "\\"} and ending before the first {@code ".txt"}, cut at the first {@code '('}.
     * {@code w} must therefore contain {@code ".txt"} after that start position.
     */
    private static String normalizeModelName(String rawName, String w) {
        String suffix = rawName;
        boolean needsPrefix = false;
        if (suffix.contains("?")) {
            suffix = suffix.substring(suffix.lastIndexOf("?") + 1);
            needsPrefix = true;
        }
        if (suffix.contains("*")) {
            suffix = suffix.replace("*", "");
            needsPrefix = true;
        }
        if (!needsPrefix) {
            return rawName;
        }
        // "+2" skips the backslash and the "/" that homePageExtract inserts before the name.
        String fileName = w.substring(w.lastIndexOf("\\") + 2, w.indexOf(".txt"));
        if (fileName.contains("(")) {
            fileName = fileName.substring(0, fileName.indexOf("("));
        }
        return fileName + suffix;
    }

    /**
     * Downloads the page of every product listed in {@code rFile} and appends it to
     * {@code wFile}.
     *
     * <p>Each usable line of {@code rFile} has the form {@code url<TAB>modelName}; lines with
     * fewer than two fields are ignored. In {@code wFile} every product is introduced by a line
     * {@code "========" + modelName + "====="}, where {@code '/'} in the model name has been
     * replaced by {@code '_'}.
     *
     * @param rFile absolute path of the file holding the product links
     * @param wFile absolute path of the file receiving the product pages
     * @throws IOException          if the link file cannot be opened, read or closed
     * @throws InterruptedException declared for callers; presumably raised by the downloader,
     *                              whose source is not visible here
     */
    public void productPageDown(String rFile, String wFile) throws IOException, InterruptedException {
        BufferedReader rFileRead = null;
        try {
            rFileRead = new BufferedReader(new InputStreamReader(
                    new FileInputStream(rFile), encoding));
            Mql_Write2File wr2f = new Mql_Write2File(wFile);
            Mql_WebDownloader webDown = new Mql_WebDownloader();
            for (String rFileLine = rFileRead.readLine(); rFileLine != null;
                    rFileLine = rFileRead.readLine()) {
                // urlLine[0] is the product page URL, urlLine[1] the model name.
                String[] urlLine = rFileLine.split("\t");
                if (urlLine.length > 1) {
                    String pageContent = webDown.webpageDownload(urlLine[0], encoding);
                    String model = urlLine[1].replace("/", "_");
                    System.out.println(model);
                    wr2f.writeWebDownFile("========" + model + "=====\r\n");
                    wr2f.writeWebDownFile(pageContent + "\r\n");
                }
            }
        } finally {
            // Close the reader even when a download or write fails.
            if (rFileRead != null) {
                rFileRead.close();
            }
        }
    }

    /**
     * Entry point placeholder; performs no work.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException, InterruptedException {
    }
}
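/*
 * Residue from the web code viewer this file was copied from (keyboard shortcut help):
 *   Copy code:            Ctrl + C
 *   Search code:          Ctrl + F
 *   Full-screen mode:     F11
 *   Toggle theme:         Ctrl + Shift + D
 *   Show shortcuts:       ?
 *   Increase font size:   Ctrl + =
 *   Decrease font size:   Ctrl + -
 */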