catchthread.java
来自「用java实现的一个bbs的portal」· Java 代码 · 共 410 行 · 第 1/2 页
JAVA
410 行
System.err.println(itemPageUrl+"\n此错误页面没有保存");
}
}
}
else{
System.out.println(itemPageUrl+"\n此页面已经不是所要的时间段");
finished=true;
catchPage.setPageOk(false);
return ;
}
}
/**
 * Builds a {@link BBSInfo} from the HTML of one thread page.
 *
 * <p>The first article (title, content, author, alias, date, ip) is extracted with the
 * configured masks and stored through {@code addArticle}. If {@code articleDivider} is
 * non-empty, every occurrence of it from the second one onwards is treated as the start
 * of a reply, which is extracted the same way (without a title) and stored through
 * {@code addRe}.
 *
 * <p>When {@code articleRedirectionMask} is set, the text of each article is not taken from
 * {@code htmlFile} itself but from a second page whose URL is extracted from
 * {@code htmlFile}.
 *
 * @param htmlFile HTML text of the thread page
 * @return the populated {@code BBSInfo}, or {@code null} if any field of the first article
 *         or of a reply could not be extracted
 */
private BBSInfo getXMLFromHTML(String htmlFile){
    boolean followRedirection=articleRedirectionMask!=null&&!articleRedirectionMask.equals("");

    // First article: use the redirected page when one is configured and could be fetched,
    // otherwise parse the thread page itself.
    String itemPage=htmlFile;
    if (followRedirection){
        String redirectedPage=fetchRedirectedArticlePage(htmlFile,0);
        if (redirectedPage!=null){
            itemPage=redirectedPage;
        }
    }

    String titleString;
    String contentString;
    String authorString;
    String aliasString;
    String dateString;
    String ipString;
    try {
        // getItemValue returns null when a field cannot be located; the resulting
        // NullPointerException from trim() is what lands in the catch below.
        titleString=tidyContent(getItemValue(itemPage,0,titleMask,titleBeginMask,titleEndMask).trim());
        contentString=tidyContent(getItemValue(itemPage,0,contentMask,contentBeginMask,contentEndMask,contentEndMask2).trim());
        authorString=getItemValue(itemPage,0,authorMask,authorBeginMask,authorEndMask).trim();
        aliasString=tidyContent(getItemValue(itemPage,0,aliasMask,aliasBeginMask,aliasEndMask).trim());
        dateString=tidyContent(getItemValue(itemPage,0,dateMask,dateBeginMask,dateEndMask).trim());
        ipString=getItemValue(itemPage,0,ipMask,ipBeginMask,ipEndMask).trim();
    } catch (Exception e) {
        System.err.println("exception is happen during changing html to xml");
        return null;
    }

    BBSInfo bbsInfo=new BBSInfo();
    bbsInfo.addArticle(bbsString,boardString,authorString,aliasString,titleString,contentString,dateString,ipString);

    // Replies start at the SECOND divider; the first occurrence is skipped
    // (presumably it introduces the first article handled above -- confirm against the site templates).
    int index=-1;
    if (articleDivider!=null&&!articleDivider.equals("")){
        int firstDivider=htmlFile.indexOf(articleDivider);
        if (firstDivider!=-1){
            index=htmlFile.indexOf(articleDivider,firstDivider+articleDivider.length());
        }
    }

    while(index!=-1){
        String replyPage;
        if (followRedirection){
            replyPage=fetchRedirectedArticlePage(htmlFile,index);
        }
        else{
            replyPage=htmlFile.substring(index);
        }

        // A reply whose redirected page could not be fetched is skipped. Previously the page of
        // the preceding article was silently reused, which stored a duplicate reply.
        if (replyPage!=null){
            String reContentString;
            String reAuthorString;
            String reAliasString;
            String reDateString;
            String reIpString;
            try {
                reContentString=tidyContent(getItemValue(replyPage,0,contentMask,contentBeginMask,contentEndMask,contentEndMask2).trim());
                reAuthorString=getItemValue(replyPage,0,authorMask,authorBeginMask,authorEndMask).trim();
                reAliasString=tidyContent(getItemValue(replyPage,0,aliasMask,aliasBeginMask,aliasEndMask).trim());
                reDateString=tidyContent(getItemValue(replyPage,0,dateMask,dateBeginMask,dateEndMask).trim());
                reIpString=getItemValue(replyPage,0,ipMask,ipBeginMask,ipEndMask).trim();
            } catch (Exception e) {
                System.err.println("exception is happen during changing html to xml");
                return null;
            }
            bbsInfo.addRe(reAuthorString,reAliasString,reContentString,reDateString,reIpString);
        }

        // index is an offset into htmlFile, so the next divider must be searched in htmlFile.
        // Searching the per-reply page here mixed two coordinate systems and could skip
        // replies or never terminate.
        index=htmlFile.indexOf(articleDivider,index+articleDivider.length());
    }
    return bbsInfo;
}

/**
 * Fetches the page an article redirects to and strips the script wrappers around its text.
 *
 * <p>The URL is extracted from {@code htmlFile} using the articleRedirection masks, starting at
 * {@code fromIndex}; a URL that does not start with "http" is prefixed with {@code URLBase}.
 *
 * @param htmlFile  thread page containing the redirection URL
 * @param fromIndex offset in {@code htmlFile} at which to start looking for the URL
 * @return the cleaned page text, or {@code null} if the URL could not be extracted or the
 *         page could not be fetched
 */
private String fetchRedirectedArticlePage(String htmlFile,int fromIndex){
    String redirectionURL=getItemValue(htmlFile,fromIndex,articleRedirectionMask,articleRedirectionBeginMask,articleRedirectionEndMask);
    if (redirectionURL==null){
        System.out.println("此页面没有读出来");
        return null;
    }
    redirectionURL=redirectionURL.trim();
    if (!redirectionURL.startsWith("http")){
        redirectionURL=URLBase+redirectionURL;
    }
    try {
        String page=BotTool.doGet(redirectionURL);
        //for SMTH: remove the document.write('...'); wrapper
        page=page.replaceAll("\\Qdocument.write('\\E","");
        page=page.replaceAll("\\Q');\\E","");
        //for YTHT: remove the <!-- document.write("..."); //--> wrapper
        page=page.replaceAll("\\Q<!--\\E","");
        page=page.replaceAll("\\Qdocument.write(\"\\E","");
        page=page.replaceAll("\\Q\");\\E","");
        page=page.replaceAll("\\Q//-->\\E","");
        // Backslashes are turned into spaces (presumably leftover script-string escapes).
        page=page.replace('\\',' ');
        return page;
    } catch (Exception e) {
        System.out.println("此页面没有读出来");
        return null;
    }
}
private String tidyContent(String contentString){
contentString=contentString.replaceAll(tidyBeginTag+"[^"+tidyBeginTag+tidyEndTag+"]*"+tidyEndTag,"");
contentString=contentString.replaceAll(" "," ");
contentString=contentString.replaceAll("&","&");
contentString=contentString.replaceAll("<","<");
contentString=contentString.replaceAll(">",">");
contentString=contentString.replaceAll("\"",""");
return contentString;
}
/**
 * Extracts the text that lies between {@code itemBeginMask} and {@code itemEndMask}.
 *
 * <p>The search first locates {@code itemMask} at or after {@code index}, then the first
 * {@code itemBeginMask} at or after that position, then the first {@code itemEndMask} after
 * the end of the begin mask. If {@code itemMask} does not occur, the begin mask is searched
 * from the start of the page (a negative start offset is treated as 0 by
 * {@link String#indexOf(String, int)}).
 *
 * @param page          text to search
 * @param index         offset in {@code page} at which the search for {@code itemMask} starts
 * @param itemMask      marker that precedes the item
 * @param itemBeginMask text immediately before the value
 * @param itemEndMask   text immediately after the value
 * @return the value, or {@code null} if any argument is {@code null} or the begin or end
 *         mask cannot be found
 */
private String getItemValue(String page,int index,String itemMask,String itemBeginMask,String itemEndMask){
    if (page==null||itemMask==null||itemBeginMask==null||itemEndMask==null){
        return null;
    }
    int anchorIdx=page.indexOf(itemMask,index);
    int beginIdx=page.indexOf(itemBeginMask,anchorIdx);
    if (beginIdx==-1){
        // Without this check a missing begin mask could still yield an unrelated substring.
        return null;
    }
    // Search for the end mask only after the begin mask, so an end mask that also occurs
    // inside the begin mask (e.g. "<" within an opening tag) cannot match too early.
    int valueStart=beginIdx+itemBeginMask.length();
    int endIdx=page.indexOf(itemEndMask,valueStart);
    if (endIdx==-1){
        return null;
    }
    return page.substring(valueStart,endIdx);
}
/**
 * Extracts the text between {@code itemBeginMask} and an end mask, with a fallback end mask.
 *
 * <p>The search first locates {@code itemMask} at or after {@code index}, then the first
 * {@code itemBeginMask} at or after that position. The value ends at the first
 * {@code itemEndMask} after the begin mask; if that mask does not occur,
 * {@code itemEndMask2} is used instead. If {@code itemMask} does not occur, the begin mask
 * is searched from the start of the page (a negative start offset is treated as 0 by
 * {@link String#indexOf(String, int)}).
 *
 * @param page          text to search
 * @param index         offset in {@code page} at which the search for {@code itemMask} starts
 * @param itemMask      marker that precedes the item
 * @param itemBeginMask text immediately before the value
 * @param itemEndMask   preferred text immediately after the value
 * @param itemEndMask2  alternative end text, consulted only when {@code itemEndMask} is absent
 * @return the value, or {@code null} if a required argument is {@code null} or the begin
 *         mask or both end masks cannot be found
 */
private String getItemValue(String page,int index,String itemMask,String itemBeginMask,String itemEndMask,String itemEndMask2){
    if (page==null||itemMask==null||itemBeginMask==null||itemEndMask==null){
        return null;
    }
    int anchorIdx=page.indexOf(itemMask,index);
    int beginIdx=page.indexOf(itemBeginMask,anchorIdx);
    if (beginIdx==-1){
        // Without this check a missing begin mask could still yield an unrelated substring.
        return null;
    }
    // Search for the end masks only after the begin mask, so an end mask that also occurs
    // inside the begin mask cannot match too early.
    int valueStart=beginIdx+itemBeginMask.length();
    int endIdx=page.indexOf(itemEndMask,valueStart);
    if (endIdx==-1&&itemEndMask2!=null){
        endIdx=page.indexOf(itemEndMask2,valueStart);
    }
    if (endIdx==-1){
        return null;
    }
    return page.substring(valueStart,endIdx);
}
/**
 * Worker loop of the thread.
 *
 * <p>While {@code finished} is false, the loop takes the next URL from {@code catchPage} and
 * passes it to {@code catchItem}. When no URL is available it sleeps for one second and
 * counts the idle poll; processing a URL resets the count. After more than 30 consecutive
 * idle polls the thread marks itself finished and calls {@code catchPage.setPageOk(false)}.
 *
 * <p>If the thread is interrupted while sleeping, the interrupt flag is restored and the
 * loop ends. Previously the interruption was only printed and polling continued.
 */
public void run() {
    final int pollIntervalMillis=1000;
    final int maxIdlePolls=30;

    int time=0;
    while(!finished){
        if(!catchPage.noURL()){
            String itemPageUrl=catchPage.getUrlFromVector();
            System.out.println("线程:"+threadName+"抓取"+itemPageUrl);
            catchItem(itemPageUrl);
            time=0;
            continue;
        }

        try {
            sleep(pollIntervalMillis);
        } catch (InterruptedException e) {
            // Restore the flag for whoever owns this thread and stop polling.
            // NOTE(review): catchPage.setPageOk(false) is not called on this path because its
            // meaning is not visible here -- confirm whether an interrupted worker should signal it.
            Thread.currentThread().interrupt();
            finished=true;
            System.out.println("线程:"+threadName+"被中断结束");
            return;
        }

        System.out.println("线程:"+threadName+"等待中"+"urlVector.size is"+catchPage.urlVector.size());
        time++;
        if (time>maxIdlePolls){
            finished=true;
            catchPage.setPageOk(false);
            System.out.println("线程:"+threadName+"超时结束");
        }
    }
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?