⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 xmltablereader.java

📁 用来为垂直搜索引擎抓取数据的采集系统
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 * *****************************************************
 * Copyright (c) 2005 IIM Lab. All  Rights Reserved.
 * Created by xuehao at Dec 1, 2005
 * Contact: zxuehao@mail.ustc.edu.cn
 * *****************************************************
 */
package org.indigo.xml;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;

import org.indigo.db.*;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.indigo.util.*;
/**
 * 此类完成从xml文件中读取采集到的数据
 * 并根据xml中内容分别封装到不同的对象中,
 * 为后来的插入数据库做准备。
 * @author wbz
 *
 */
public class XmlTableReader
{
    // Parsed JDOM document. Stays null when the file does not exist or
    // parsing failed; readAll() returns immediately in that case.
    private Document itsDoc=null;
    // Path of the XML file being read; null when the file does not exist.
    private String itsXmlName=null;
    // Receives the "tabletype" text and the colN entries found under the
    // <Definition> element. Only allocated when the file exists.
    private HashMap itsMap=null;
    // Per-table-type holders. Each one is created lazily inside readAll()
    // when the table type read from the XML matches:
    //   agri_news       -> itsNewsInfo
    //   agri_newsmarket -> itsMarketNewsInfo
    //   agri_tech       -> itsTechInfo
    //   job_caiji       -> itsJobInfo
    //   agri_jghq       -> itsPriceInfo
    //   agri_gqxx       -> itsSplDemInfo
    //   question_caiji  -> itsQuestionInfo
    //   company_caiji   -> itsCompanyInfo
    private NewsInfo itsNewsInfo=null;
    private NewsInfo itsMarketNewsInfo=null;
    private TechInfo itsTechInfo=null;
    private JobInfo itsJobInfo=null;
    private PriceInfo itsPriceInfo=null;
    private SplDemInfo itsSplDemInfo=null;
    private QuestionInfo itsQuestionInfo=null;
    private CompanyInfo itsCompanyInfo=null;
    // Key prefixes. readAll() prepends the prefix to the lower-cased column
    // element name and looks the result up through
    // DBConfig.getInstance().getProperty(...); the property value is parsed
    // as an integer position that selects which setter receives the text.
    // NOTE(review): several names are misspelled ("PRIFIX", "SPlDEM"); they
    // are left untouched because code outside this view may reference them.
    private static final String PRICE_PREFIX = "agriprice_tbl.";    
    private static final String SPlDEM_PREFIX = "gqxx_tbl.";
    private static final String NEWS_PREFIX = "news_tbl.";
    private static final String TECH_PRIFIX = "tech_tbl.";
    private static final String MARKET_PRIFIX = "market_tbl.";
    // NOTE(review): the three prefixes below are not used in the visible
    // part of readAll(); presumably they serve the job/question/company
    // branches further down -- confirm.
    private static final String JOB_PRIFIX="job_tbl.";
    private static final String QUESTION_PRIFIX="question_tbl.";
    private static final String COMPANY_PRIFIX="company_tbl.";
    /**
     * Creates a reader for the given XML file and parses it immediately.
     * <p>
     * If the file does not exist, a message is printed to standard output
     * and the instance is left without a document. If the file cannot be
     * opened or parsed, the stack trace is printed and the document stays
     * null as well. In both cases {@link #readAll()} does nothing.
     *
     * @param xmlName path of the XML file to read
     */
    public XmlTableReader( String xmlName )
    {
        File file = new File( xmlName );
        if( !file.exists() )
        {
            System.out.println( xmlName + " doesnot exists!" );
            itsXmlName = null;
            return;
        }

        itsXmlName = xmlName;
        itsMap = new HashMap();

        SAXBuilder builder = new SAXBuilder();

        // Keep a handle on the stream so it can be released in the finally
        // block; previously it was never closed and leaked a file handle,
        // in particular when build() failed part-way through.
        FileInputStream in = null;
        try
        {
            in = new FileInputStream( itsXmlName );
            // NOTE(review): InputStreamReader without an explicit charset
            // decodes with the platform default, so an encoding declared in
            // the XML prolog is not honoured. Kept as-is to preserve current
            // behaviour -- confirm this matches how the files are written.
            itsDoc = builder.build( new InputStreamReader( in ) );
        } catch (FileNotFoundException e1)
        {
            e1.printStackTrace();
        } catch (JDOMException e1)
        {
            e1.printStackTrace();
        } catch (IOException e1)
        {
            e1.printStackTrace();
        } finally
        {
            if( in!=null )
            {
                try
                {
                    in.close();
                } catch (IOException e2)
                {
                    // Closing is best effort; report it like the other I/O errors.
                    e2.printStackTrace();
                }
            }
        }
    }
    public void readAll()
    {
        if( itsDoc==null )
            return;
        
        // 1. do with the definition.
        Element definition = itsDoc.getRootElement().getChild("Definition");
        
        String tableType="unknown";

        int ii=0;
        Element em = null;
        em = definition.getChild( "tabletype" );
        if( em!=null )
        {
            tableType = em.getText();
            itsMap.put( "tabletype", tableType );
        }
        
        int i,j;
        List list = definition.getChildren();
        for( i=1; i<list.size(); i++ )
        {
            em = definition.getChild( "col"+i );
            itsMap.put( em.getName(), em.getText() );
        }

        // 2. do with the data.
        Element data = itsDoc.getRootElement().getChild("Data");
        Element rows = data.getChild("rows");
        
        
        String posValue;
        int pos;
        List listRow = rows.getChildren();
        for(  i=0; i<listRow.size(); i++ )
        {
            Element row = (Element) listRow.get(i);
            List listCol = row.getChildren();
            // do with the columns of a row
            for( j=0; j<listCol.size(); j++ )
            {

                em = (Element)listCol.get(j);
                String name,text;
                name = em.getName().trim().toLowerCase();
                text = em.getText().trim().toLowerCase();

                if( itsNewsInfo==null && tableType.equalsIgnoreCase("agri_news") )
                {
                    itsNewsInfo = new NewsInfo();
                }
                if( itsMarketNewsInfo==null && tableType.equalsIgnoreCase("agri_newsmarket") )
                {
                	itsMarketNewsInfo = new NewsInfo();
                }
                if( itsTechInfo==null && tableType.equalsIgnoreCase("agri_tech") )
                {
                    itsTechInfo = new TechInfo();
                }
                if( itsPriceInfo==null && tableType.equalsIgnoreCase("agri_jghq") )
	            {
                    itsPriceInfo = new PriceInfo();
	            }else
	            if( itsSplDemInfo==null && tableType.equalsIgnoreCase("agri_gqxx") )
	            {
	                itsSplDemInfo = new SplDemInfo();
	            }
                if( itsJobInfo==null && tableType.equalsIgnoreCase("job_caiji") )
	            {
	                itsJobInfo = new JobInfo();
	            }
                if(itsQuestionInfo==null&&tableType.equalsIgnoreCase("question_caiji"))
                {
                	itsQuestionInfo=new QuestionInfo();
                }
                if(itsCompanyInfo==null&&tableType.equalsIgnoreCase("company_caiji"))
                {
                	itsCompanyInfo=new CompanyInfo();
                }
	            if( tableType.equalsIgnoreCase("agri_news") )
	            {
	                name = this.NEWS_PREFIX + name;
	                posValue = DBConfig.getInstance().getProperty( name );
	                if( !(posValue==null || posValue.equalsIgnoreCase("")) )
	                {
	                    pos = Integer.parseInt( posValue );
//		                System.out.println( name + "=>" + text + ", pos=" + pos );
		                switch( pos )
		                {
	                	case 1:
	                	    itsNewsInfo.setTitle( text );
	                	    break;
	                	case 2:
	                	    itsNewsInfo.setContent( text );
	                	    break;
	                	case 3:
	                	    itsNewsInfo.setTypeId( Integer.parseInt(text) );
	                	    break;
	                	case 4:
	                	    itsNewsInfo.setPublisher( text );
	                	    break;
	                	case 5:
	                		itsNewsInfo.setItsUrl(text);
	                	    break;
		                }
	                }   
	            }else
	            if( tableType.equalsIgnoreCase("agri_newsmarket") )
		            {
		                name = this.MARKET_PRIFIX + name;
		                posValue = DBConfig.getInstance().getProperty( name );
		                if( !(posValue==null || posValue.equalsIgnoreCase("")) )
		                {
		                    pos = Integer.parseInt( posValue );
//			                System.out.println( name + "=>" + text + ", pos=" + pos );
			                switch( pos )
			                {
		                	case 1:
		                		itsMarketNewsInfo.setTitle( text );
		                	    break;
		                	case 2:
		                		itsMarketNewsInfo.setContent( text );
		                	    break;
		                	case 3:
		                		itsMarketNewsInfo.setTypeId( Integer.parseInt(text) );
		                	    break;
		                	case 4:
		                		itsMarketNewsInfo.setPublisher( text );
		                	    break;
		                	case 5:
		                		itsMarketNewsInfo.setItsUrl(text);
		                		break;
			                }
		                }   
		        }else
	            if( tableType.equalsIgnoreCase("agri_tech") )
	            {
	                name = this.TECH_PRIFIX + name;
	                posValue = DBConfig.getInstance().getProperty( name );
	                if( !(posValue==null || posValue.equalsIgnoreCase("")) )
	                {
	                    pos = Integer.parseInt( posValue );
	                    switch( pos )
	                    {
	                    case 1:
	                        itsTechInfo.setTitle( text );
	                        break;
	                    case 2:
//	                    	System.out.println( "content="+text );
	                        itsTechInfo.setContent( text );
	                        break;
	                    case 3:
//	                    	System.out.println( "type="+text );
	                        itsTechInfo.setTypeId( text );
	                        break;
	                    case 4:
	                        itsTechInfo.setPublisher( text );
	                        break;
	                    case 5:
	                    	itsTechInfo.setItsUrl(text);
	                        break;
	                    }
	                }
	            }else
	            if( tableType.equalsIgnoreCase("agri_jghq") )
	            {
	                name = this.PRICE_PREFIX + name;
	                posValue = DBConfig.getInstance().getProperty( name );
	                if( !(posValue==null || posValue.equalsIgnoreCase("")) )
	                {
	                    pos = Integer.parseInt( posValue );
//		                System.out.println( name + "=>" + text + ", pos=" + pos );
		                switch( pos )
		                {
	                	case 1:
	                	    itsPriceInfo.setAgriClass( text );
	                	    break;
	                	case 2:
	                	    itsPriceInfo.setAgriCategory( text );
	                	    break;
	                	case 3:
	                	    itsPriceInfo.setAgriMarket( text );
//	                	    System.out.println( text );
	                	    break;
	                	case 4:
	                	    itsPriceInfo.setAgriPrice( text );

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -