⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 testpageparsermanager1.java

📁 用来为垂直搜索引擎抓取数据的采集系统
💻 JAVA
字号:
/*
 * *****************************************************
 * Copyright (c) 2005 IIM Lab. All  Rights Reserved.
 * Created by xuehao at 2005-10-12
 * Contact: zxuehao@mail.ustc.edu.cn
 * *****************************************************
 */

package org.indigo.tests.parser;

import java.util.ArrayList;

import junit.framework.TestCase;

import org.indigo.pages.CollectedIdsPage;
import org.indigo.pages.CollectedPage;
import org.indigo.pages.VisitPage;
import org.indigo.parser.PageParserManager;
import org.indigo.parser.Parser;

/**
 * Manual smoke test that drives {@link PageParserManager} over a paged listing on
 * www.ahnw.gov.cn and prints every collected URL and every extracted item to standard output.
 *
 * <p>Nothing is asserted about the extracted content. The test fails only when a collaborator
 * throws or when no id list is returned for a visited listing page.
 * NOTE(review): this appears to require live network access to the target site - confirm.
 */
public class TestPageParserManager1 extends TestCase
{
    /**
     * Walks the listing pages exposed by a {@link VisitPage}, resolves each id found on a page
     * to a collected URL, and prints every item the {@link PageParserManager} yields for it.
     */
    public void testPageParserManager1()
    {
        String beginUrl = "http://www.ahnw.gov.cn/scxx/schq/?datetime=&page=1&zl=&diqu=&chanpin=&dl=&NewDay=0";

        VisitPage vPage = new VisitPage("page");
        vPage.setBeginUrl(beginUrl);
        // NOTE(review): presumably (first page, last page, step) - confirm against VisitPage.
        vPage.setParameters(1, 3, 1);

        CollectedPage cPage = new CollectedPage("page");
        cPage.setBeginUrl(beginUrl);

        CollectedIdsPage idsPage = new CollectedIdsPage();
        idsPage.setVisitPage(vPage);

        Parser parser = new Parser();
        PageParserManager pageMag = new PageParserManager(true);
        pageMag.setParser(parser);
        registerFields(pageMag);

        // The listing-page cursor gets its own variable so it can never be confused with the
        // per-record URL computed inside the inner loop.
        String pageUrl = vPage.getCurrentLink();
        while (pageUrl != null)
        {
            idsPage.setUrl(pageUrl);
            // Raw ArrayList kept on purpose: the file predates generics.
            ArrayList ids = idsPage.getIds();
            // Fail with a message naming the page instead of a bare NullPointerException.
            assertNotNull("no id list returned for " + pageUrl, ids);

            for (int i = 0; i < ids.size(); i++)
            {
                String id = (String) ids.get(i);
                String collectedUrl = cPage.getCollectedUrl(id);
                System.out.println(collectedUrl);
                printItems(pageMag, collectedUrl);
            }
            pageUrl = vPage.getNextVisitLink();
        }

        System.out.println("TestPageParserManager1 over.");
    }

    /**
     * Registers the five start/end marker pairs with the manager, in the same order as the
     * table cells they match. Every field ends at the closing cell tag.
     *
     * @param pageMag manager that receives the field definitions
     */
    private void registerFields(PageParserManager pageMag)
    {
        String endStr = "</td>";
        String[] startStrs = {
            "<td class=\"z\" width=\"24%\" height=20 style=\"border-right:1 solid #FFFFFF;border-bottom: 1 solid #FFFFFF\">&nbsp;",
            "<td width=\"11%\" class=\"z\" style=\"border-right:1 solid #FFFFFF;border-bottom: 1 solid #FFFFFF\">&nbsp;",
            "<td width=\"12%\" class=\"z\" style=\"border-right:1 solid #FFFFFF;border-bottom: 1 solid #FFFFFF\">&nbsp;",
            "<td width=\"45%\" class=\"z\" style=\"border-right:1 solid #FFFFFF;border-bottom: 1 solid #FFFFFF\">&nbsp;",
            "<td width=\"8%\" class=\"z\" align=\"center\" style=\"border-right:1 solid #FFFFFF;border-bottom: 1 solid #FFFFFF\" nowrap>"
        };
        for (int i = 0; i < startStrs.length; i++)
        {
            pageMag.addField(startStrs[i], endStr);
        }
    }

    /**
     * Opens the manager on one collected URL and prints items until {@code getAItem()}
     * returns {@code null}. The manager is closed even if extraction throws.
     *
     * @param pageMag      manager already configured with a parser and fields
     * @param collectedUrl URL to extract items from
     */
    private void printItems(PageParserManager pageMag, String collectedUrl)
    {
        pageMag.setCollectedUrl(collectedUrl);
        pageMag.open();
        try
        {
            String aItem = pageMag.getAItem();
            while (aItem != null)
            {
                System.out.println(aItem);
                aItem = pageMag.getAItem();
            }
        }
        finally
        {
            // Close on every path so a failure mid-extraction does not leave the manager open.
            pageMag.close();
        }
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -