catchthread.java

来自「用java实现的一个bbs的portal」· Java 代码 · 共 410 行 · 第 1/2 页

JAVA
410
字号
                    System.err.println(itemPageUrl+"\n此错误页面没有保存");
                }
            }
        }
        else{
            System.out.println(itemPageUrl+"\n此页面已经不是所要的时间段");
            finished=true;
            catchPage.setPageOk(false);
            return ;
        }

    }

    /**
     * Builds a {@link BBSInfo} from one fetched thread page.
     *
     * <p>The first article is read from the start of the page (or from the page its
     * redirection link points to, when {@code articleRedirectionMask} is configured)
     * and stored with {@code addArticle}. Every further section introduced by
     * {@code articleDivider} is stored with {@code addRe}.
     *
     * <p>Note: extracting the redirection link happens outside any try block, so a
     * link that cannot be extracted surfaces as a NullPointerException from
     * {@code trim()} rather than as a {@code null} return.
     *
     * @param htmlFile the HTML text of the thread page
     * @return the populated {@code BBSInfo}, or {@code null} when any field of the
     *         article or of one of its replies cannot be extracted
     */
    private BBSInfo getXMLFromHTML(String htmlFile){
        boolean viaRedirection=articleRedirectionMask!=null&&!articleRedirectionMask.equals("");

        String itemPage=htmlFile;
        if (viaRedirection){
            itemPage=fetchRedirectedArticle(htmlFile,0,itemPage);
        }

        String titleString;
        String contentString;
        String authorString;
        String aliasString;
        String dateString;
        String ipString;
        try {
            // getItemValue returns null for a value it cannot extract; the resulting
            // NullPointerException from trim() is what routes a bad page to the catch.
            titleString = getItemValue(itemPage,0,titleMask,titleBeginMask,titleEndMask).trim();
            titleString=tidyContent(titleString);
            contentString = getItemValue(itemPage,0,contentMask,contentBeginMask,contentEndMask,contentEndMask2).trim();
            contentString=tidyContent(contentString);
            authorString = getItemValue(itemPage,0,authorMask,authorBeginMask,authorEndMask).trim();
            aliasString = getItemValue(itemPage,0,aliasMask,aliasBeginMask,aliasEndMask).trim();
            aliasString=tidyContent(aliasString);
            dateString = getItemValue(itemPage,0,dateMask,dateBeginMask,dateEndMask).trim();
            dateString=tidyContent(dateString);
            ipString = getItemValue(itemPage,0,ipMask,ipBeginMask,ipEndMask).trim();
        } catch (Exception e) {
            System.err.println("exception is happen during changing html to xml");
            return null;
        }

        BBSInfo bbsInfo=new BBSInfo();
        bbsInfo.addArticle(bbsString,boardString,authorString,aliasString,titleString,contentString,dateString,ipString);

        // Replies start at the second divider; the first occurrence is skipped
        // (presumably it precedes the article parsed above -- confirm against the
        // board configuration).
        int index=-1;
        if (articleDivider!=null&&!articleDivider.equals("")){
            index=htmlFile.indexOf(articleDivider);
            index=htmlFile.indexOf(articleDivider,index+articleDivider.length());
        }
        while(index!=-1){
            if (viaRedirection){
                // On a failed fetch the previously loaded page is reused, as before.
                itemPage=fetchRedirectedArticle(htmlFile,index,itemPage);
            }
            else{
                itemPage=htmlFile.substring(index);
            }

            String reContentString;
            String reAuthorString;
            String reAliasString;
            String reDateString;
            String reIpString;
            try {
                reContentString = getItemValue(itemPage,0,contentMask,contentBeginMask,contentEndMask,contentEndMask2).trim();
                reContentString=tidyContent(reContentString);
                reAuthorString = getItemValue(itemPage,0,authorMask,authorBeginMask,authorEndMask).trim();
                reAliasString = getItemValue(itemPage,0,aliasMask,aliasBeginMask,aliasEndMask).trim();
                reAliasString=tidyContent(reAliasString);
                reDateString = getItemValue(itemPage,0,dateMask,dateBeginMask,dateEndMask).trim();
                reDateString=tidyContent(reDateString);
                reIpString = getItemValue(itemPage,0,ipMask,ipBeginMask,ipEndMask).trim();
            } catch (Exception e) {
                System.err.println("exception is happen during changing html to xml");
                return null;
            }
            bbsInfo.addRe(reAuthorString,reAliasString,reContentString,reDateString,reIpString);

            // index is an offset into htmlFile, so the next divider must be looked up
            // in htmlFile. Searching itemPage (a substring or a different page) with
            // this offset skipped replies or ended the loop early.
            index=htmlFile.indexOf(articleDivider,index+articleDivider.length());
        }
        return bbsInfo;
    }

    /**
     * Follows the article redirection link found in {@code listPage} at or after
     * {@code fromIndex} and returns the fetched page with its script wrappers removed.
     *
     * @param listPage     page text that contains the redirection link
     * @param fromIndex    offset in {@code listPage} from which the link is searched
     * @param fallbackPage value returned when fetching the link throws
     * @return the cleaned-up page, or {@code fallbackPage} when the fetch fails; if a
     *         clean-up step throws, the page as processed up to that step is returned
     */
    private String fetchRedirectedArticle(String listPage,int fromIndex,String fallbackPage){
        String redirectionURL=getItemValue(listPage,fromIndex,articleRedirectionMask,articleRedirectionBeginMask,articleRedirectionEndMask).trim();
        if (!redirectionURL.startsWith("http")){
            redirectionURL=URLBase+redirectionURL;
        }
        String page=fallbackPage;
        try {
            page = BotTool.doGet(redirectionURL);
            //for SMTH
            page=page.replaceAll("\\Qdocument.write('\\E","");
            page=page.replaceAll("\\Q');\\E","");

            //for YTHT
            page=page.replaceAll("\\Q<!--\\E","");
            page=page.replaceAll("\\Qdocument.write(\"\\E","");
            page=page.replaceAll("\\Q\");\\E","");
            page=page.replaceAll("\\Q//-->\\E","");
            page=page.replace('\\',' ');
        } catch (Exception e) {
            System.out.println("此页面没有读出来");
        }
        return page;
    }
    /**
     * Strips tag-like spans from {@code contentString} and escapes the remaining
     * text with XML entities.
     *
     * <p>The strip pattern is assembled as a regular expression from
     * {@code tidyBeginTag} and {@code tidyEndTag}: a begin tag, any run of characters
     * from the negated character class built out of both tags, then an end tag.
     * The substitutions that follow run in a fixed order: {@code &nbsp;} becomes a
     * space before {@code &} is escaped, and {@code &} is escaped before the
     * entities for {@code <}, {@code >} and {@code "} are inserted.
     *
     * @param contentString the raw extracted text
     * @return the stripped and escaped text
     */
    private String tidyContent(String contentString){
        String tagPattern=tidyBeginTag+"[^"+tidyBeginTag+tidyEndTag+"]*"+tidyEndTag;
        String tidied=contentString.replaceAll(tagPattern,"");

        // {regex, replacement} pairs, applied top to bottom.
        String[][] substitutions={
            {"&nbsp;"," "},
            {"&","&amp;"},
            {"<","&lt;"},
            {">","&gt;"},
            {"\"","&quot;"}
        };
        for (int i=0;i<substitutions.length;i++){
            tidied=tidied.replaceAll(substitutions[i][0],substitutions[i][1]);
        }
        return tidied;
    }

    /**
     * Extracts the text enclosed by a begin and an end marker that follow an anchor.
     *
     * <p>The anchor {@code itemMask} is searched from {@code index}, the begin marker
     * from the anchor, and the end marker from the first character after the begin
     * marker. A marker that is not found is reported as {@code null} instead of
     * letting the -1 position leak into the next search, which could return
     * unrelated text from the start of the page.
     *
     * @param page          text to search
     * @param index         offset at which the search for {@code itemMask} starts
     * @param itemMask      anchor that precedes the value
     * @param itemBeginMask marker immediately before the value
     * @param itemEndMask   marker immediately after the value
     * @return the text between the two markers, or {@code null} when the anchor or
     *         either marker cannot be found
     */
    private String getItemValue(String page,int index,String itemMask,String itemBeginMask,String itemEndMask){
        int anchorIdx=page.indexOf(itemMask,index);
        if (anchorIdx==-1){
            return null;
        }
        int beginIdx=page.indexOf(itemBeginMask,anchorIdx);
        if (beginIdx==-1){
            return null;
        }
        // Look for the end marker after the begin marker, so identical or
        // overlapping markers (e.g. a pair of quotes) still delimit a value.
        int valueStart=beginIdx+itemBeginMask.length();
        int endIdx=page.indexOf(itemEndMask,valueStart);
        if (endIdx==-1){
            return null;
        }
        return page.substring(valueStart,endIdx);
    }

    /**
     * Extracts the text enclosed by a begin marker and one of two alternative end
     * markers that follow an anchor.
     *
     * <p>The anchor {@code itemMask} is searched from {@code index}, the begin marker
     * from the anchor, and the end marker from the first character after the begin
     * marker. {@code itemEndMask2} is consulted only when {@code itemEndMask} is not
     * found. A marker that is not found is reported as {@code null} instead of
     * letting the -1 position leak into the next search, which could return
     * unrelated text from the start of the page.
     *
     * @param page          text to search
     * @param index         offset at which the search for {@code itemMask} starts
     * @param itemMask      anchor that precedes the value
     * @param itemBeginMask marker immediately before the value
     * @param itemEndMask   preferred marker immediately after the value
     * @param itemEndMask2  fallback end marker; may be {@code null} for "no fallback"
     * @return the text between the begin marker and the end marker that was found,
     *         or {@code null} when the anchor, the begin marker or both end markers
     *         are missing
     */
    private String getItemValue(String page,int index,String itemMask,String itemBeginMask,String itemEndMask,String itemEndMask2){
        int anchorIdx=page.indexOf(itemMask,index);
        if (anchorIdx==-1){
            return null;
        }
        int beginIdx=page.indexOf(itemBeginMask,anchorIdx);
        if (beginIdx==-1){
            return null;
        }
        // Look for the end marker after the begin marker, so identical or
        // overlapping markers still delimit a value.
        int valueStart=beginIdx+itemBeginMask.length();
        int endIdx=page.indexOf(itemEndMask,valueStart);
        if (endIdx==-1&&itemEndMask2!=null){
            endIdx=page.indexOf(itemEndMask2,valueStart);
        }
        if (endIdx==-1){
            return null;
        }
        return page.substring(valueStart,endIdx);
    }

    /**
     * Worker loop of this thread.
     *
     * <p>While {@code finished} is false, the loop takes the next URL from
     * {@code catchPage} and passes it to {@code catchItem}. When no URL is
     * available it sleeps for one second and polls again. After more than 30
     * consecutive idle polls it marks itself finished and calls
     * {@code catchPage.setPageOk(false)}.
     *
     * <p>An interrupt during the idle sleep stops the loop and leaves the thread's
     * interrupt flag set, so the worker can be cancelled from outside.
     */
    public void run() {
        int idlePolls=0;
        while(!finished){
            if(!catchPage.noURL()){
                String itemPageUrl=catchPage.getUrlFromVector();
                System.out.println("线程:"+threadName+"抓取"+itemPageUrl);
                catchItem(itemPageUrl);
                idlePolls=0;
                continue;
            }

            try {
                sleep(1000);
            } catch (InterruptedException e) {
                // Swallowing the interrupt would make this worker impossible to
                // cancel; restore the flag and leave the loop instead.
                // NOTE(review): catchPage.setPageOk(false) is not called on this
                // path -- confirm whether an interrupted worker should signal it too.
                Thread.currentThread().interrupt();
                System.err.println("thread "+threadName+" interrupted, stopping");
                finished=true;
                break;
            }

            System.out.println("线程:"+threadName+"等待中"+"urlVector.size is"+catchPage.urlVector.size());
            idlePolls++;
            if (idlePolls>30){
                finished=true;
                catchPage.setPageOk(false);
                System.out.println("线程:"+threadName+"超时结束");
            }
        }
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?