/*
 * catchthread.java
 *
 * From "a BBS portal implemented in Java" · Java code · 410 lines in total · page 1 of 2
 */
package BBSSpider;

import java.util.Date;

/**
 * Created by IntelliJ IDEA.
 * User: ADMINISTRATOR
 * Date: 2003-7-18
 * Time: 14:34:26
 * To change this template use Options | File Templates.
 */
public class CatchThread extends Thread{
    // ---------------------------------------------------------------------
    // State of one fetch worker. Every field except `finished` and
    // `threadName` is copied from the owning CatchPage by the constructor.
    // ---------------------------------------------------------------------

    /** The CatchPage this worker was created from; catchItem reports newer article dates back to it. */
    CatchPage catchPage;

    /** Name passed to the constructor. */
    String threadName;

    /** Starts out false; NOTE(review): where it is set is not visible in this part of the file. */
    boolean finished = false;

    /** Reference date: catchItem only processes an article whose parsed date is after this one. */
    Date latestDate;

    /** Used (together with boardString) to build the file name of saved error pages. */
    String bbsString;
    String boardString;

    /** Prefix prepended to a redirection URL that does not start with "http". */
    String URLBase;

    // NOTE(review): not referenced in the visible part of the file.
    String articleDivider;

    /**
     * Passed to getItemValue to extract the redirection URL from the fetched
     * page. When articleRedirectionMask is null or empty, no redirection is
     * followed.
     */
    String articleRedirectionMask;
    String articleRedirectionBeginMask;
    String articleRedirectionEndMask;

    // The content/tidy, title, author, alias and ip groups below are not
    // referenced in the visible part of the file; presumably they drive the
    // field extraction done by getXMLFromHTML -- TODO confirm.
    String contentMask;
    String contentBeginMask;
    String contentEndMask;
    String contentEndMask2;
    String tidyBeginTag;
    String tidyEndTag;

    String titleMask;
    String titleBeginMask;
    String titleEndMask;

    String authorMask;
    String authorBeginMask;
    String authorEndMask;

    String aliasMask;
    String aliasBeginMask;
    String aliasEndMask;

    String ipMask;
    String ipBeginMask;
    String ipEndMask;

    /** Passed to getItemValue to extract the raw date text from the article page. */
    String dateMask;
    String dateBeginMask;
    String dateEndMask;

    /**
     * Delimit the date inside the raw date text. An empty dateStrEndMask means
     * the date runs to the end of that text.
     */
    String dateStrBeginMask;
    String dateStrEndMask;

    /** Second argument handed to BotTool.String2Date when parsing the date. */
    String dateRepresentForm;

    /** Passed to BBSInfo.saveToFile for successfully parsed articles. */
    String dirPath;

    /** Path prefix for the ".html" dumps of pages that could not be parsed. */
    String errPath;

    public CatchThread(CatchPage catPage,String name){
        bbsString=catPage.bbsString;
        boardString=catPage.boardString;
        threadName=name;
        catchPage=catPage;
        latestDate=catPage.latestDate;
        dirPath=catPage.dirPath;
        errPath=catPage.errPath;

        URLBase=catPage.URLbase;
        articleDivider=catPage.articleDivider;

        articleRedirectionMask=catPage.articleRedirectionMask;
        articleRedirectionBeginMask=catPage.articleRedirectionBeginMask;
        articleRedirectionEndMask=catPage.articleRedirectionEndMask;

        contentMask=catPage.contentMask;
        contentBeginMask=catPage.contentBeginMask;
        contentEndMask=catPage.contentEndMask;
        contentEndMask2=catPage.contentEndMask2;
        tidyBeginTag=catPage.tidyBeginTag;
        tidyEndTag=catPage.tidyEndTag;


        titleMask=catPage.titleMask;
        titleBeginMask=catPage.titleBeginMask;
        titleEndMask=catPage.titleEndMask;

        authorMask=catPage.authorMask;
        authorBeginMask=catPage.authorBeginMask;
        authorEndMask=catPage.authorEndMask;

        aliasMask=catPage.aliasMask;
        aliasBeginMask=catPage.aliasBeginMask;
        aliasEndMask=catPage.aliasEndMask;

        dateMask=catPage.dateMask;
        dateBeginMask=catPage.dateBeginMask;
        dateEndMask=catPage.dateEndMask;
        dateStrBeginMask=catPage.dateStrBeginMask;
        dateStrEndMask=catPage.dateStrEndMask;
        dateRepresentForm=catPage.dateRepresentForm;

        ipMask=catPage.ipMask;
        ipBeginMask=catPage.ipBeginMask;
        ipEndMask=catPage.ipEndMask;

    }

    public void catchItem(String itemPageUrl){
        String itemPage = null;
        String itemPageHTML=null;
        try {
            System.out.println(itemPageUrl);
            itemPageHTML = BotTool.doGet(itemPageUrl);
        } catch (Exception e) {
            System.out.println("此页面没有读出来");
            return;
            //clover
            //setBreakPoint();
        }
        itemPage=itemPageHTML;
        if (articleRedirectionMask!=null&&!articleRedirectionMask.equals("")){
            String redirectionURL=getItemValue(itemPageHTML,0,articleRedirectionMask,articleRedirectionBeginMask,articleRedirectionEndMask).trim();
            if (!redirectionURL.startsWith("http")){
                redirectionURL=URLBase+redirectionURL;
            }
            try {
                itemPage = BotTool.doGet(redirectionURL);
                //for SMTH
                itemPage=itemPage.replaceAll("\\Qdocument.write('\\E","");
                itemPage=itemPage.replaceAll("\\Q');\\E","");

                //for YTHT
                itemPage=itemPage.replaceAll("\\Q<!--\\E","");
                itemPage=itemPage.replaceAll("\\Qdocument.write(\"\\E","");
                itemPage=itemPage.replaceAll("\\Q\");\\E","");
                itemPage=itemPage.replaceAll("\\Q//-->\\E","");
                itemPage=itemPage.replace('\\',' ');

            } catch (Exception e) {
                System.out.println("此页面没有读出来");
            }
        }

        String dateStr="";
        try {
            String firstDateString=getItemValue(itemPage,0,dateMask,dateBeginMask,dateEndMask).trim();
            int dateStrBegin=firstDateString.indexOf(dateStrBeginMask);
            int dateStrEnd=0;
            if (!dateStrEndMask.equals("")){
                dateStrEnd=firstDateString.indexOf(dateStrEndMask,dateStrBegin+1);
            }
            else{
                dateStrEnd=firstDateString.length();
            }
            dateStr = firstDateString.substring(dateStrBegin+dateStrBeginMask.length(),dateStrEnd).trim();
        } catch (Exception e) {
            System.err.println("error during parsing the dateString");
        }
        dateStr=dateStr.replaceAll("&nbsp;"," ");
        dateStr=dateStr.replaceAll(" ","-");
        dateStr=dateStr.replaceAll("--","-");
        Date date=BotTool.String2Date(dateStr,dateRepresentForm);
        if (date == null){
            System.err.println(itemPageUrl+"\n此页面的时间表示没有解析出来!");
            try {
                BotTool.saveToFile(itemPage,errPath+bbsString+"_"+boardString
                        +BotTool.Date2String(latestDate,"yyyy-MM-dd_HHmmss")+"_err"
                        +".html");
            } catch (Exception e) {
                System.err.println(itemPageUrl+"\n此错误页面没有保存");
            }
            return;
        }
        System.out.println("date = " + BotTool.Date2String(date,"yyyy-MM-dd HH:mm:ss"));
        System.out.println("latestdate = " + BotTool.Date2String(latestDate,"yyyy-MM-dd HH:mm:ss"));

        if (date.after(latestDate)){
            catchPage.updateFirstArticleDate(date);

            BBSInfo bbsInfo=getXMLFromHTML(itemPageHTML);
            if (bbsInfo!=null){
                bbsInfo.saveToFile(dirPath);
            }
            else{
                System.err.println(itemPageUrl+"\n此页面没有解析出来");
                try {
                    BotTool.saveToFile(itemPage,errPath+bbsString+"_"+boardString
                            +BotTool.Date2String(date,"yyyy-MM-dd_HHmmss")+"_err"
                            +".html");
                } catch (Exception e) {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?