⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 model.java

📁 用来为垂直搜索引擎抓取数据的采集系统
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
				itsCollectedPage.setIdStrategy( s );
			} catch (ClassNotFoundException e) 
			{
				e.printStackTrace();
			}catch (InstantiationException e1) 
			{
				e1.printStackTrace();
			} catch (IllegalAccessException e1)
			{
				e1.printStackTrace();
			}
        }

        idFactory=new CollectedIdPageFactory();
        
//        if(  itsIdFront==null || itsIdBack==null || itsIdFront.equals("") || itsIdBack.equals("") )
//        {
//            itsIdsPage = new CollectedIdsPage();
//        }else
//        {
//            itsIdsPage = new CollectedIdsPage( itsIdFront, itsIdBack );
//        }
        itsIdsPage=idFactory.getCollectedIdsPage(itsIdFront, itsIdBack, itsIdBodyFront, itsIdBodyBack);
        itsIdsPage.setVisitPage( itsVisitPage );
        itsIdsPage.setIsByPost(IsIncludeJscript);
        itsIdsPage.setItsEncoding(itsEncoding);
        
        Parser parser = new Parser();
        parser.setItsEncoding(itsEncoding);
        itsPageMag = new PageParserManager( itsFilterFlag );       
        itsPageMag.setParser(parser);
        itsPageMag.setIsByPost(IsIncludeJscript);
        itsPageMag.setIsByLogin(IsByLogin);
        itsPageMag.setPairList( itsList );
        
    }
    
    /**
     * Stores the view that this model should work with.
     *
     * @param view the view instance to remember; kept as-is, no validation is done
     */
    public void setView( IView view )
    {
        this.itsView = view;
    }
    /**
     * Runs one complete collection pass.
     *
     * <p>The method reloads the task properties, re-initialises the page helpers and
     * then walks the visit links supplied by {@code itsVisitPage}. For every visit link
     * it asks {@code itsIdsPage} for the list of ids, turns each id into a collected URL,
     * opens that page through {@code itsPageMag} and pulls items from it one by one.
     * Every accepted item is pushed to the registered observers with
     * {@code notifyObservers(aItem)}.
     *
     * <p>The walk stops when any of the following happens:
     * <ul>
     *   <li>there is no further visit link;</li>
     *   <li>the item (or, with {@code useUrlMark}, the URL) recorded as "last" by the
     *       previous run is met again ({@code bLastReached});</li>
     *   <li>the very first item / URL of this run is seen a second time;</li>
     *   <li>{@code itsPageMag.open()} has failed at least "ValveDate" times
     *       (default 5) during this run.</li>
     * </ul>
     *
     * <p>Afterwards the new "LastItem" / "LastItemUrl" markers are written back to the
     * task properties (only when {@code bLastItemAvailable}), a one-line summary with
     * the reason the run stopped is written to the file loggers, and a
     * {@code LogEntity} is executed unless the "IsTest" configuration property is set
     * to something other than "false".
     */
    public void doStrategy()
    {
        boolean bExit = false;
        boolean bNotGet = false;
        String url, aItem, tmpLastItem = null, firstItem = null, tmpUrl = null, firstUrl = null;
        readProperties();
        init();

        // Give up once this many page opens have failed ("ValveDate" config key).
        int valveData = 5;
        String valveStr = MainConfig.getInstance().getProperty("ValveDate");
        if( valveStr!=null && !valveStr.equals("") )
        {
            try
            {
                valveData = Integer.parseInt( valveStr.trim() );
            } catch (NumberFormatException ignored)
            {
                // A malformed threshold must not abort the whole run; keep the default of 5.
            }
        }

        int notGetCount = 0;
        int count = 0;
        ArrayList ids = null;
        boolean bGetFlag = false;

        url = itsVisitPage.getCurrentLink();
        while( url!=null && !bLastReached && !bExit )
        {
            itsIdsPage.setUrl( url );
            ids = itsIdsPage.getIds();
            if( existNest )
            {
                // Nested listing: convert the raw ids through the collected page first.
                ids = itsCollectedPage.getIDs( ids, childFront, childBack );
            }

            for( int i=0; i<ids.size() && !bLastReached && !bExit; i++ )
            {
                String id = (String) ids.get(i);
                if( id==null || id.equals("") )
                {
                    continue;
                }

                itsCollectedPage.setIsByPost( IsIncludeJscript );
                if( isIdFilter )
                {
                    itsPageMag.clearTable();
                    id = IdFilter.Filter( id, itsPageMag );
                }
                url = itsCollectedPage.getCollectedUrl( id );

                if( IsIncludeJscript )
                {
                    // Form-posted page: the URL travels as a form value, the request goes to formUrl.
                    this.ModifyPair( keyParam, url );
                    itsPageMag.setValuePair( data );
                    itsPageMag.setCollectedUrl( formUrl );
                } else if( IsByLogin )
                {
                    itsPageMag.setValuePair( data );
                    itsPageMag.setCollectedFromUrl( formUrl );
                    itsPageMag.setCollectedUrl( url );
                } else
                {
                    itsPageMag.setCollectedUrl( url );
                }

                bGetFlag = itsPageMag.open();
                if( !bGetFlag )
                {
                    // Failures are counted over the whole run, not per page.
                    notGetCount++;
                    if( notGetCount>=valveData )
                    {
                        bExit = true;
                        bNotGet = true;
                    }
                    continue;
                }

                do
                {
                    // Optionally narrow the page to the configured body markers before parsing.
                    if( itsInfoBodyFront!=null && itsInfoBodyBack!=null
                            && !itsInfoBodyFront.equals("") && !itsInfoBodyBack.equals("") )
                    {
                        itsPageMag.setItsPageStr( itsInfoBodyFront, itsInfoBodyBack );
                    }

                    aItem = itsPageMag.getAItem();
                    if( aItem != null )
                    {
                        // Drop whatever follows the last separator; an item that carries no
                        // separator at all is used whole instead of raising an index error.
                        int j = aItem.lastIndexOf( ItemParser.GAP_TOKEN );
                        String itemKey = ( j>=0 ) ? aItem.substring( 0, j ) : aItem;

                        if( useUrlMark )
                        {
                            // URL-based markers: detect a wrap-around and the previous run's end.
                            if( firstUrl!=null )
                            {
                                if( firstUrl.equals(url) )
                                {
                                    bExit = true;
                                    break;
                                }
                            } else
                            {
                                firstUrl = url;
                            }

                            // lastItemUrl may be unset; a null marker simply never matches.
                            if( lastItemUrl!=null && lastItemUrl.equals(url) )
                            {
                                bLastReached = true;
                                break;
                            }

                            if( tmpUrl==null )
                            {
                                tmpUrl = url;
                            }
                        }

                        if( firstItem!=null )
                        {
                            // Seeing the first item again means the listing has looped.
                            if( firstItem.equals(itemKey) )
                            {
                                bExit = true;
                                break;
                            }
                        } else
                        {
                            firstItem = itemKey;
                        }

                        itemKey = getAShorterItem( itemKey );

                        if( bLastItemAvailable && itemKey.equals( itsLastItem ) )
                        {
                            bLastReached = true;
                            break;
                        }

                        // The first accepted item of this run becomes the next run's "last item".
                        if( tmpLastItem==null )
                        {
                            tmpLastItem = itemKey;
                        }

                        count++;
                        setChanged();
                        notifyObservers( aItem );
                    }
                } while( aItem != null );

                itsPageMag.close();
            }
            url = itsVisitPage.getNextVisitLink();
        }

        // Persist the markers the next run will use to know where to stop.
        if( bLastItemAvailable )
        {
            TaskProperties props = new TaskProperties( itsPropertyFile );
            if( tmpLastItem!=null )
            {
                itsLastItem = tmpLastItem;
                props.changeProperty( "LastItem", itsLastItem );
            }
            if( tmpUrl!=null )
            {
                props.changeProperty( "LastItemUrl", tmpUrl );
            }
        }

        String summary = itsPropertyFile + ": Get Total Items Count=" + count;
        LogEntity log = new LogEntity();
        log.setPATH( itsPropertyFile );
        log.setRePath( itsPropertyFile );
        log.setTOTAL( count+"" );

        boolean fetchFailed = bNotGet || !CollectedIdsPage.isOpen;
        String reason = describeStopReason( fetchFailed, ids, firstItem!=null, bGetFlag );
        String summaryWithReason = summary + "  " + reason;

        FileLoggerParam.getInstance().info( summaryWithReason );
        if( fetchFailed )
        {
            CollectedIdsPage.isOpen = false;
        }
        if( 0==count )
        {
            // Runs that collected nothing are additionally reported to the zero-count log.
            FileLoggerNumZero.getInstance().info( summaryWithReason );
        }

        log.setWHYCODE( reason );
        if( 0==count )
        {
            log.setZeroCode( reason );
        }
        log.setWHYSTRING( summaryWithReason );

        String isTest = MainConfig.getInstance().getProperty("IsTest");
        if( isTest==null || isTest.equalsIgnoreCase("false") )
        {
            log.Execute();
        }
    }

    /**
     * Picks the message that explains why the current run ended.
     *
     * <p>The checks run in a fixed priority order: failed fetch, empty id list,
     * previous run's last item reached, nothing parsed although the last open
     * succeeded, nothing parsed and the last open failed, and finally "items were
     * collected but the last item was never reached".
     *
     * @param fetchFailed       true when the failure threshold was hit or
     *                          {@code CollectedIdsPage.isOpen} is false
     * @param ids               the id list of the last visited page, or {@code null}
     *                          when no page was visited at all
     * @param anyItemSeen       true when at least one item was parsed in this run
     * @param lastOpenSucceeded result of the most recent {@code itsPageMag.open()}
     * @return the reason text; never {@code null}
     */
    private String describeStopReason( boolean fetchFailed, ArrayList ids,
            boolean anyItemSeen, boolean lastOpenSucceeded )
    {
        if( fetchFailed )
        {
            return "获取ids时网络链接失败!";
        }
        // ids is null when the visit loop never executed; that is not an "empty list".
        if( ids!=null && ids.size()==0 )
        {
            return "获取ids字段配置错误!";
        }
        if( bLastReached )
        {
            return "已经到达上次采集数据!";
        }
        if( !anyItemSeen )
        {
            if( lastOpenSucceeded )
            {
                return " 根据模板无法采集到数据!在第"+itsPageMag.getErrorMark()+
                    "字段处: "+itsPageMag.getErrorField();
            }
            return "网络连接失败!";
        }
        return " 模板中设置的结束页面小于最后一条所在的页面!";
    }
    /**
     * Replaces, in the {@code data} array, every pair whose name equals {@code key}
     * with a new pair carrying {@code value}.
     *
     * @param key   name of the pair(s) to replace
     * @param value new value for the matching pair(s)
     * @return the {@code data} array itself, after the in-place replacement
     */
    public NameValuePair[] ModifyPair(String key,String value)
    {
        final int total = data.length;
        int idx = 0;
        while( idx < total )
        {
            String currentName = data[idx].getName();
            if( currentName.equals(key) )
            {
                data[idx] = new NameValuePair( key, value );
            }
            idx++;
        }
        return data;
    }
    /**
     * Builds a shortened copy of an item in which every field is cut to at most
     * {@code itsMaxSize} characters.
     *
     * <p>The item is split with a {@link StringTokenizer} on
     * {@code ItemParser.GAP_TOKEN}, so every character of that token acts as a
     * delimiter and empty fields are skipped. The remaining fields are truncated and
     * joined again with {@code ItemParser.GAP_TOKEN}, without a trailing separator.
     *
     * @param item the separator-delimited item text; must not be {@code null}
     * @return the re-joined, truncated item, or an empty string when the item
     *         contains no fields
     */
    public String getAShorterItem( String item )
    {
        StringTokenizer tok = new StringTokenizer( item, ItemParser.GAP_TOKEN );
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        while( tok.hasMoreTokens() )
        {
            String field = tok.nextToken();
            if( field.length()>itsMaxSize )
            {
                field = field.substring( 0, itsMaxSize );
            }
            // Put the separator between fields rather than after each one, so nothing
            // has to be stripped afterwards whatever the separator's length is.
            if( !first )
            {
                joined.append( ItemParser.GAP_TOKEN );
            }
            joined.append( field );
            first = false;
        }
        return joined.toString();
    }
}



⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -