📄 miningcsvstream.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
        // first try both ways
        MiningDataSpecification mds_create = recognize(MiningCsvStream.COLUMN_NAME_CREATE);
        MiningDataSpecification mds_first = recognize(MiningCsvStream.COLUMN_NAME_FIRST_LINE);

        if(mds_create==null || mds_first==null || mds_create.getAttributesNumber()!=mds_first.getAttributesNumber()) {
          // something really went wrong
          reset();
          usedColumnNameType = MiningCsvStream.COLUMN_NAME_CREATE;
          return mds_create;
        }

        for(int i=0; i < mds_create.getAttributesNumber() && i < mds_first.getAttributesNumber(); i++) {
          MiningAttribute att_c = mds_create.getMiningAttribute(i);
          MiningAttribute att_f = mds_first.getMiningAttribute(i);

          // find a clue for using the first line for column names
          if(NumericAttribute.class.isInstance(att_f) && CategoricalAttribute.class.isInstance(att_c)) {
            reset();
            usedColumnNameType = MiningCsvStream.COLUMN_NAME_FIRST_LINE;
            return mds_first;
          }
        }
        reset();
        usedColumnNameType = MiningCsvStream.COLUMN_NAME_CREATE;
        return mds_create;

      }

      reset();
      return recognize(usedColumnNameType);
  }

  /**
   * Gets new meta data using the first {@link #nLines} lines.
   * Called by {@link #recognize}.
   *
   * @param m_usedColumnNameType Specifies either to use the first line of the input file as the attributes names or to create them.
   * @return the recognized meta data
   * @throws MiningException could not recognize file
   */
  protected MiningDataSpecification recognize(short m_usedColumnNameType) throws MiningException {

    reset();

    final boolean namesFromFirstLine = (m_usedColumnNameType==MiningCsvStream.COLUMN_NAME_FIRST_LINE);
    String columnNames[] = null;

    // attrs.get(j) is the attribute inferred so far for column j; null while still undecided.
    Vector attrs = new Vector();

    // nLines == -1 makes this loop unbounded, so reaching end of input must terminate it.
    for(int i=0; this.nLines==-1 || i < this.nLines; i++) {
      try
      {
        if(curLine==0 && namesFromFirstLine) {
          // Get column names and continue with second line
          String headerLine = inReader.readLine();
          if(headerLine==null)
            break; // empty input: nothing to recognize
          parse.parseString(headerLine);
          curLine++;
          columnNames = new String[parse.countTokens()];
          for(int j=0; j < columnNames.length; j++) {
            String name = parse.getToken(j+1);
            // An empty header cell falls back to the generated name.
            columnNames[j] = name.equals("") ? "field"+ String.valueOf(j+1) : name;
          }
        }
        String dataLine = inReader.readLine();
        if(dataLine==null)
          break; // readLine() signals end of stream with null
        parse.parseString(dataLine);
        curLine++;
      }
      catch (java.io.IOException ex)
      {
        // Best effort: build the meta data from whatever was read so far.
        break;
      }

      final int nTokens = parse.countTokens();
      if(nTokens<1)
        continue;

      for(int j=0; j < nTokens; j++) {
        // A column that failed numeric parsing once stays categorical, so skip it.
        if(attrs.size() > j && attrs.get(j) instanceof CategoricalAttribute)
          continue;

        final String token = parse.getToken(j+1);
        boolean numeric;
        try
        {
          Double.parseDouble(token);
          numeric = true;
        }
        catch (NumberFormatException ex)
        {
          numeric = false;
        }

        if(!numeric) {
          String trimmed = token.trim();
          // A missing value says nothing about the column type: leave the attribute untouched.
          if(trimmed.equals("") || isMissingValue(trimmed))
            continue;
        }

        if(attrs.size() <= j)
          attrs.setSize(j+1); // Expand vector

        String attName = recognizedAttributeName(namesFromFirstLine, columnNames, j);
        if(numeric)
          attrs.setElementAt(new NumericAttribute(attName), j);
        else
          attrs.setElementAt(newRecognizedCategorical(attName), j);
      }
    }

    MiningDataSpecification m_metaData = new MiningDataSpecification();
    for(int i=0; i < attrs.size(); i++) {
      if(attrs.elementAt(i)==null) {
        // No value decided this column's type: fall back to a CategoricalAttribute (most secure).
        attrs.setElementAt(newRecognizedCategorical(recognizedAttributeName(namesFromFirstLine, columnNames, i)), i);
      }
      m_metaData.addMiningAttribute((MiningAttribute) attrs.elementAt(i));
    }
    if(this.fileName!=null)
      m_metaData.setRelationName(this.fileName);

    return m_metaData;
  }

  /**
   * Chooses the name of the attribute for the column at the given 0-based index:
   * the header name when header names are in use and one exists for that column,
   * otherwise the generated name "field" followed by the 1-based column number.
   */
  private String recognizedAttributeName(boolean namesFromFirstLine, String[] columnNames, int index) {
    if(namesFromFirstLine && columnNames!=null && columnNames.length > index)
      return columnNames[index];
    return "field"+ String.valueOf(index+1);
  }

  /**
   * Creates a categorical attribute with the given name, flagged as unstored or
   * unbounded depending on this stream's categoriesType.
   */
  private CategoricalAttribute newRecognizedCategorical(String attName) throws MiningException {
    CategoricalAttribute catt = new CategoricalAttribute(attName);
    if(this.categoriesType==CATEGORIES_UNSTORED)
      catt.setUnstoredCategories(true);
    else
      catt.setUnboundedCategories(true);
    return catt;
  }

  // -----------------------------------------------------------------------
  //  Methods of cursor positioning
  // -----------------------------------------------------------------------
  /**
   * Places the cursor before the first row.
   * This is done by closing and reopening the file reader.
   *
   * @throws MiningException if the stream cannot be reset
   */
  public void reset() throws MiningException
  {
      // Let the superclass reset its own state before anything here is touched.
      super.reset();

      // Wrap the underlying reader in a fresh buffer for line-wise reading.
      inReader = new BufferedReader(reader);

      // No line consumed yet, and the cursor sits before the first row.
      cursorPosition = -1;
      curLine = 0;
  }

  /**
   * Advances the cursor by one position.
   *
   * @return true if a next vector exists, else false
   * @exception MiningException if the meta data does not match the current line or is invalid.
   */
  public boolean next() throws MiningException {
    cursorVector = null;
    if(inReader==null)
      return false;

    // Meta data is recognized lazily on first use.
    if(metaData==null)
      this.metaData = this.recognize();

    // Read until a line yields at least one token. This is a loop rather than a
    // recursive call so that a long run of empty lines cannot exhaust the stack.
    do {
      String nextLine;
      try
      {
        if(curLine==0 && usedColumnNameType==MiningCsvStream.COLUMN_NAME_FIRST_LINE) {
          // The first line carries the column names, not data: skip it.
          inReader.readLine();
          curLine++;
        }

        nextLine = inReader.readLine();
        curLine++;
      }
      catch (java.io.IOException ex)
      {
        // Best effort: a read failure is reported and treated like end of data.
        ex.printStackTrace();
        return false;
      }
      if(nextLine==null)
        return false; // end of stream

      parse.parseString(nextLine);
    } while(parse.countTokens() < 1);

    final int attributeCount = metaData.getAttributesNumber();
    final int tokenCount = parse.countTokens();
    double[] instance = new double[ attributeCount ];

    // Get values for all attributes.
    for (int i = 0; i < attributeCount; i++)
    {
        if(i >= tokenCount) {
          // File contains fewer columns than attributes defined in metaData
          instance[i] = Category.MISSING_VALUE;
          continue;
        }

        String token = parse.getToken(i+1);
        MiningAttribute attribute = metaData.getMiningAttribute( i );
        if(attribute instanceof CategoricalAttribute)
        {
            CategoricalAttribute categorical = (CategoricalAttribute) attribute;
            if(isMissingValue(token)) {
              instance[i] = Category.MISSING_VALUE;
            }
            else {
              // Look the category up; register it if the attribute has no key for it yet.
              Category cat = new Category( token );
              double key = categorical.getKey(cat);
              if (Category.isMissingValue(key))
                key = categorical.addCategory(cat);
              instance[i] = key;
            }
        }
        else if(attribute instanceof NumericAttribute) {
          if(isMissingValue(token)) {
            instance[i] = Category.MISSING_VALUE;
          }
          else {
            try {
              instance[i] = Double.parseDouble(token);
            }
            catch (NumberFormatException ex) {
              // A non-numeric value in a numeric column is stored as missing
              // instead of aborting the read.
              instance[i] = Category.MISSING_VALUE;
            }
          }
        }
        else {
          throw new MiningException("Unknown attribute type");
        }
    }

    // Publish the parsed line as the current vector and advance the cursor.
    cursorVector = new MiningVector(instance);
    cursorVector.setMetaData( metaData );

    cursorPosition++;

    return true;
  }

  // -----------------------------------------------------------------------
  //  Methods of reading from the stream
  // -----------------------------------------------------------------------
  /**
   * Reads current data vector.
   *
   * @return data vector at current cursor position
   * @exception MiningException never thrown.
   */
  public MiningVector read() throws MiningException
  {
    // next() prepares the vector (or clears it to null); this only hands it out.
    final MiningVector current = cursorVector;
    return current;
  }

  // -----------------------------------------------------------------------
  //  Methods of writing into the stream
  // -----------------------------------------------------------------------
  /**
   * Sets new meta data to this stream.
   *
   * @param metaData new meta data of stream
   * @exception MiningException if an error occurs
   */
  public void updateSetMetaData(MiningDataSpecification metaData) throws MiningException
  {
    // Reject a missing specification before looking inside it.
    final boolean specificationMissing = (null == metaData);
    if(specificationMissing) {
      throw new MiningException("Invalid MiningDataSpecification: null");
    }

    // A specification must describe at least one attribute.
    final int attributeCount = metaData.getAttributesNumber();
    if(!(attributeCount >= 1)) {
      throw new MiningException("Invalid MiningDataSpecification: Specification does not contain attribute(s)");
    }

    this.metaData = metaData;
  }

  /**
   * Removes all mining vectors from this stream. Note that metadata is not
   * affected by this operation since it is fixed for any stream.
   *
   * @exception MiningException if an error occurs
   */
  public void updateRemoveAllVectors() throws MiningException {
    // Removing vectors is not implemented for this stream; always fails.
    final String reason = "not supported yet";
    throw new MiningException(reason);
  }

  /**
   * Appends new mining vector to this stream.
   *
   * @param vector new mining vector to append
   * @exception MiningException if an error occurs
   */
  public void updateAppendVector(MiningVector vector) throws MiningException {
    // Appending is not implemented for this stream; the argument is ignored.
    final String reason = "not supported yet";
    throw new MiningException(reason);
  }

  //<<Frank J. Xu, 16/02/2005
  //Add method to reset the categorical attributes' type of csv stream.
  /**
   * Re-reads the whole stream and, for every categorical attribute of the meta
   * data, collects the categories that occur, ordered by display value. Each
   * such attribute then receives the collected values; when their number is
   * positive and below {@code MiningInputStream.CATEGORICAL_ATTRIBUTE_BOUND},
   * the attribute is additionally marked as not unbounded.
   *
   * A MiningException raised while reading rows or updating the attributes is
   * printed and not propagated; the meta data may then be partially updated.
   *
   * @throws MiningException if resetting the stream or obtaining the meta data fails
   */
  public void updateCategoricalAttrsType()throws MiningException{
    reset();
    MiningDataSpecification spec = this.getMetaData();
    MiningAttribute[] attrs = spec.getAttributesArray();

    // Pick out the categorical attributes, keeping their order in the meta data.
    Vector categoricalAttrs = new Vector();
    for(int i = 0; i < attrs.length; i++){
      if(attrs[i] instanceof CategoricalAttribute){
        categoricalAttrs.add(attrs[i]);
      }
    }
    if(categoricalAttrs.isEmpty())
      return;

    // One value list per categorical attribute. They are allocated up front so
    // that a stream without any rows yields empty lists instead of null entries.
    Vector[] categoricalVal = new Vector[categoricalAttrs.size()];
    for(int j = 0; j < categoricalVal.length; j++){
      categoricalVal[j] = new Vector();
    }

    try{
      while (this.next() ) {
        MiningVector mv = this.read();
        for(int j = 0; j < categoricalVal.length; j++){
          Category catObj = mv.getValueCategory((CategoricalAttribute)(categoricalAttrs.get(j)));
          // The order of categorical values affects the correctness of
          // assessment operations, so keep them ordered while importing.
          // insertCatValByOrder ignores null and already-present categories.
          insertCatValByOrder(categoricalVal[j], catObj);
        }
      }

      // update categorical attributes type based on the threshold.
      for(int j = 0; j < categoricalVal.length; j++)
      {
        CategoricalAttribute catAttr = (CategoricalAttribute)(categoricalAttrs.get(j));
        int distinctCount = categoricalVal[j].size();
        if((distinctCount > 0) && (distinctCount < MiningInputStream.CATEGORICAL_ATTRIBUTE_BOUND))
          catAttr.setUnboundedCategories(false);
        catAttr.setValues(new ArrayList(categoricalVal[j]));
      }

      // attrs still holds the very same attribute objects that were updated
      // above, so the array can be handed back without copying entries around.
      spec.setAttributesArray(attrs);
      this.metaData = spec;
    }
    catch (MiningException ex){
      ex.printStackTrace();
    }
  }

  /**
   * Inserts a category into a list that is kept ordered by display value.
   * A null category is ignored, and so is a category for which an element
   * reporting equality is met before the insertion point.
   *
   * @param categoricalVal list of Category objects to insert into
   * @param catObj category to insert, may be null
   */
  private void insertCatValByOrder(Vector categoricalVal, Category catObj)
  {
    if(catObj == null)
      return;

    final String newName = catObj.getDisplayValue();
    final int count = categoricalVal.size();

    // Walk forward until the first element that must come after the new one.
    int position = 0;
    while(position < count)
    {
      Object existing = categoricalVal.get(position);
      if(existing.equals(catObj))
        return; // already present, nothing to insert

      String existingName = ((Category) existing).getDisplayValue();
      if(newName.compareTo(existingName) < 0)
        break;

      position++;
    }

    // position is either the slot before a greater element or the end of the list.
    categoricalVal.insertElementAt(catObj, position);
  }
  //Frank J. Xu, 16/02/2005>>

  // -----------------------------------------------------------------------
  //  Test
  // -----------------------------------------------------------------------
  /**
   * Test of CSV stream.
   *
   * @param args arguments (ignored)
   */
  public static void main(String[] args) {
    try
    {
      final MiningCsvStream stream = new MiningCsvStream("data/csv/vowel.csv");

      // Configure before opening: header names come from the first line and
      // 20 lines are used for testing. A separator or quotation mark could be
      // set here as well via setSeparator / setQuotationMark.
      stream.setColumnNameType(MiningCsvStream.COLUMN_NAME_FIRST_LINE);
      stream.setNumberTestLines(20);
      stream.open();

      // Print the meta data first, then every vector of the stream.
      System.out.println( stream.getMetaData() );
      for(boolean hasRow = stream.next(); hasRow; hasRow = stream.next()) {
        MiningVector row = stream.read();
        System.out.println(row);
      }
    }
    catch (MiningException failure)
    {
      failure.printStackTrace();
    }
  }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -