catchthread.java
来自「用java实现的一个bbs的portal」· Java 代码 · 共 410 行 · 第 1/2 页
JAVA
410 行
package BBSSpider;
import java.util.Date;
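/**
* Worker thread of the BBS spider that processes individual article pages.
* <p>
* All parsing masks, output paths and the latest known article date are copied
* from a {@link CatchPage} when the thread is constructed. {@code catchItem}
* downloads one article page, extracts its posting date and, when that date is
* newer than the latest known one, has the article converted and saved.
* <p>
* Created by IntelliJ IDEA.
* User: ADMINISTRATOR
* Date: 2003-7-18
* Time: 14:34:26
*/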
public class CatchThread extends Thread{
/*
* Scraping configuration and per-thread state.
*
* Every field from latestDate down to errPath is copied from the owning
* CatchPage in the constructor. The "*Mask", "*BeginMask" and "*EndMask"
* triples are handed to getItemValue(...) to cut one item out of a page.
* NOTE(review): the exact matching semantics live in getItemValue, which is
* not visible here - confirm before relying on them.
*/

// Articles are only saved when their parsed date is after this date; it is
// also used to build the file name of an error dump when no date could be parsed.
Date latestDate;
// Both strings are part of the error-dump file names.
// NOTE(review): presumably the BBS site and board identifiers - confirm in CatchPage.
String bbsString;
String boardString;
// Prefix added to a redirection URL that does not start with "http".
String URLBase;
// Not used in the visible part of this class.
// NOTE(review): presumably separates articles on a list page - confirm.
String articleDivider;
// When articleRedirectionMask is non-null and non-empty, catchItem extracts a
// redirection URL with these three masks and reads the article from that URL.
String articleRedirectionMask;
String articleRedirectionBeginMask;
String articleRedirectionEndMask;
// Content, tidy, title, author and alias masks are not used in the visible
// part of this class.
// NOTE(review): presumably consumed by getXMLFromHTML - confirm.
String contentMask;
String contentBeginMask;
String contentEndMask;
String contentEndMask2;
String tidyBeginTag;
String tidyEndTag;
String titleMask;
String titleBeginMask;
String titleEndMask;
String authorMask;
String authorBeginMask;
String authorEndMask;
String aliasMask;
String aliasBeginMask;
String aliasEndMask;
// Masks passed to getItemValue to extract the raw text that contains the date.
String dateMask;
String dateBeginMask;
String dateEndMask;
// Delimit the date inside that raw text; an empty end mask means
// "up to the end of the text".
String dateStrBeginMask;
String dateStrEndMask;
// Format pattern passed to BotTool.String2Date when parsing the date.
String dateRepresentForm;
// IP masks are not used in the visible part of this class.
// NOTE(review): presumably the poster's IP address - confirm.
String ipMask;
String ipBeginMask;
String ipEndMask;
// Target passed to BBSInfo.saveToFile for successfully parsed articles.
String dirPath;
// Path prefix for the HTML dumps of pages that could not be parsed.
String errPath;
// The page this thread was created from; it is told about newer article
// dates through updateFirstArticleDate.
CatchPage catchPage;
// Starts as false; nothing in the visible part of this class changes it.
boolean finished=false;
// The name given to the constructor.
String threadName;
/**
 * Creates a worker whose whole scraping configuration is taken from the given page.
 * <p>
 * The constructor only copies values: it stores the page and the name, then copies
 * the latest article date, the output paths, the URL base and every parsing mask
 * from {@code catPage} into the fields of the same meaning.
 *
 * @param catPage the page object that supplies the configuration; it is also kept
 *                as {@code catchPage}
 * @param name    the name stored in {@code threadName}
 */
public CatchThread(CatchPage catPage, String name) {
    // Identity of this worker and its owner.
    this.threadName = name;
    this.catchPage = catPage;
    this.bbsString = catPage.bbsString;
    this.boardString = catPage.boardString;

    // Reference date, output locations and URL prefix.
    this.latestDate = catPage.latestDate;
    this.dirPath = catPage.dirPath;
    this.errPath = catPage.errPath;
    this.URLBase = catPage.URLbase;

    // Article list / redirection handling.
    this.articleDivider = catPage.articleDivider;
    this.articleRedirectionMask = catPage.articleRedirectionMask;
    this.articleRedirectionBeginMask = catPage.articleRedirectionBeginMask;
    this.articleRedirectionEndMask = catPage.articleRedirectionEndMask;

    // Article body.
    this.contentMask = catPage.contentMask;
    this.contentBeginMask = catPage.contentBeginMask;
    this.contentEndMask = catPage.contentEndMask;
    this.contentEndMask2 = catPage.contentEndMask2;
    this.tidyBeginTag = catPage.tidyBeginTag;
    this.tidyEndTag = catPage.tidyEndTag;

    // Title, author and alias.
    this.titleMask = catPage.titleMask;
    this.titleBeginMask = catPage.titleBeginMask;
    this.titleEndMask = catPage.titleEndMask;
    this.authorMask = catPage.authorMask;
    this.authorBeginMask = catPage.authorBeginMask;
    this.authorEndMask = catPage.authorEndMask;
    this.aliasMask = catPage.aliasMask;
    this.aliasBeginMask = catPage.aliasBeginMask;
    this.aliasEndMask = catPage.aliasEndMask;

    // Posting date.
    this.dateMask = catPage.dateMask;
    this.dateBeginMask = catPage.dateBeginMask;
    this.dateEndMask = catPage.dateEndMask;
    this.dateStrBeginMask = catPage.dateStrBeginMask;
    this.dateStrEndMask = catPage.dateStrEndMask;
    this.dateRepresentForm = catPage.dateRepresentForm;

    // IP masks.
    this.ipMask = catPage.ipMask;
    this.ipBeginMask = catPage.ipBeginMask;
    this.ipEndMask = catPage.ipEndMask;
}
public void catchItem(String itemPageUrl){
String itemPage = null;
String itemPageHTML=null;
try {
System.out.println(itemPageUrl);
itemPageHTML = BotTool.doGet(itemPageUrl);
} catch (Exception e) {
System.out.println("此页面没有读出来");
return;
//clover
//setBreakPoint();
}
itemPage=itemPageHTML;
if (articleRedirectionMask!=null&&!articleRedirectionMask.equals("")){
String redirectionURL=getItemValue(itemPageHTML,0,articleRedirectionMask,articleRedirectionBeginMask,articleRedirectionEndMask).trim();
if (!redirectionURL.startsWith("http")){
redirectionURL=URLBase+redirectionURL;
}
try {
itemPage = BotTool.doGet(redirectionURL);
//for SMTH
itemPage=itemPage.replaceAll("\\Qdocument.write('\\E","");
itemPage=itemPage.replaceAll("\\Q');\\E","");
//for YTHT
itemPage=itemPage.replaceAll("\\Q<!--\\E","");
itemPage=itemPage.replaceAll("\\Qdocument.write(\"\\E","");
itemPage=itemPage.replaceAll("\\Q\");\\E","");
itemPage=itemPage.replaceAll("\\Q//-->\\E","");
itemPage=itemPage.replace('\\',' ');
} catch (Exception e) {
System.out.println("此页面没有读出来");
}
}
String dateStr="";
try {
String firstDateString=getItemValue(itemPage,0,dateMask,dateBeginMask,dateEndMask).trim();
int dateStrBegin=firstDateString.indexOf(dateStrBeginMask);
int dateStrEnd=0;
if (!dateStrEndMask.equals("")){
dateStrEnd=firstDateString.indexOf(dateStrEndMask,dateStrBegin+1);
}
else{
dateStrEnd=firstDateString.length();
}
dateStr = firstDateString.substring(dateStrBegin+dateStrBeginMask.length(),dateStrEnd).trim();
} catch (Exception e) {
System.err.println("error during parsing the dateString");
}
dateStr=dateStr.replaceAll(" "," ");
dateStr=dateStr.replaceAll(" ","-");
dateStr=dateStr.replaceAll("--","-");
Date date=BotTool.String2Date(dateStr,dateRepresentForm);
if (date == null){
System.err.println(itemPageUrl+"\n此页面的时间表示没有解析出来!");
try {
BotTool.saveToFile(itemPage,errPath+bbsString+"_"+boardString
+BotTool.Date2String(latestDate,"yyyy-MM-dd_HHmmss")+"_err"
+".html");
} catch (Exception e) {
System.err.println(itemPageUrl+"\n此错误页面没有保存");
}
return;
}
System.out.println("date = " + BotTool.Date2String(date,"yyyy-MM-dd HH:mm:ss"));
System.out.println("latestdate = " + BotTool.Date2String(latestDate,"yyyy-MM-dd HH:mm:ss"));
if (date.after(latestDate)){
catchPage.updateFirstArticleDate(date);
BBSInfo bbsInfo=getXMLFromHTML(itemPageHTML);
if (bbsInfo!=null){
bbsInfo.saveToFile(dirPath);
}
else{
System.err.println(itemPageUrl+"\n此页面没有解析出来");
try {
BotTool.saveToFile(itemPage,errPath+bbsString+"_"+boardString
+BotTool.Date2String(date,"yyyy-MM-dd_HHmmss")+"_err"
+".html");
} catch (Exception e) {
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?