📄 stringtowordvector.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
    throws Exception {
    super.setInputFormat(instanceInfo);
    m_FirstBatchDone = false;
    return false;
  }

  /**
   * Feeds a single instance to the filter. While the first batch is still
   * being collected the instance is only buffered; once the first batch
   * has been finished, the instance is converted immediately.
   *
   * @param instance the input instance.
   * @return true if the instance was converted right away and may be
   * collected with output(); false if it was merely buffered.
   * @exception IllegalStateException if no input structure has been defined.
   */
  public boolean input(Instance instance) throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    // The first instance of a new batch resets the queue.
    if (m_NewBatch) {
      resetQueue();
      m_NewBatch = false;
    }

    // Before the first batch is finished, instances can only be stored.
    if (!m_FirstBatchDone) {
      bufferInput(instance);
      return false;
    }

    convertInstance(instance);
    return true;
  }

  /**
   * Signals that the current batch of input is complete. On the first
   * batch the dictionary is determined before anything is converted.
   * If document length normalization is enabled and this is the first
   * batch, the buffered instances are converted without normalization,
   * each one is scaled to unit Euclidean length over its word attributes,
   * and then rescaled by the average document length of the batch.
   * Otherwise every buffered instance is simply converted.
   *
   * @return true if there are instances pending output.
   * @exception IllegalStateException if no input structure has been defined.
   */
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    // The dictionary is determined only while the first batch is open.
    if (!m_FirstBatchDone) {
      determineDictionary();
    }

    if (m_normalizeDocLength && !m_FirstBatchDone) {
      // First batch with normalization: the average length is needed
      // before any instance can be finalized, hence the separate passes.
      Instances buffered = getInputFormat();
      FastVector converted = new FastVector();
      int firstWordIndex = 0;
      avgDocLength = 0;
      for (int n = 0; n < buffered.numInstances(); n++) {
        firstWordIndex = convertInstancewoDocNorm(buffered.instance(n), converted);
      }

      // Pass 1: scale each document to unit length and sum up the lengths.
      for (int d = 0; d < converted.size(); d++) {
        Instance doc = (Instance) converted.elementAt(d);

        double sumOfSquares = 0;
        for (int k = 0; k < doc.numValues(); k++) {
          if (doc.index(k) >= firstWordIndex) {
            double value = doc.valueSparse(k);
            sumOfSquares += value * value;
          }
        }
        double docLength = Math.sqrt(sumOfSquares);
        avgDocLength += docLength;

        int pos = 0;
        while (pos < doc.numValues()) {
          if (doc.index(pos) < firstWordIndex) {
            pos++;
            continue;
          }
          double scaled = doc.valueSparse(pos) / docLength;
          doc.setValueSparse(pos, scaled);
          if (scaled == 0) {
            // Same position is visited again -- presumably because a sparse
            // instance drops entries that are set to zero (TODO confirm).
            System.err.println("setting value "+doc.index(pos)+" to zero.");
          } else {
            pos++;
          }
        }
      }
      avgDocLength /= buffered.numInstances();

      // Pass 2: rescale by the average document length and emit.
      for (int d = 0; d < converted.size(); d++) {
        Instance doc = (Instance) converted.elementAt(d);
        int pos = 0;
        while (pos < doc.numValues()) {
          if (doc.index(pos) < firstWordIndex) {
            pos++;
            continue;
          }
          double rescaled = doc.valueSparse(pos) * avgDocLength;
          doc.setValueSparse(pos, rescaled);
          if (rescaled == 0) {
            System.err.println("setting value "+doc.index(pos)+" to zero.");
          } else {
            pos++;
          }
        }
        push(doc);
      }
    } else {
      // No normalization requested, or a later batch: convert directly.
      for (int n = 0; n < getInputFormat().numInstances(); n++) {
        convertInstance(getInputFormat().instance(n));
      }
    }
    flushInput();

    m_NewBatch = true;
    m_FirstBatchDone = true;
    return numPendingOutput() != 0;
  }

  /**
   * Provides a short description of this filter.
   *
   * @return a description of the filter suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    final String purpose =
        "Converts String attributes into a set of attributes representing "
        + "word occurrence information from the text contained in the "
        + "strings. ";
    final String dictionaryNote =
        "The set of words (attributes) is determined by the first "
        + "batch filtered (typically training data).";
    return purpose + dictionaryNote;
  }
  
  /**
   * Reports whether output instances hold word counts rather than a
   * 0/1 word presence indicator.
   *
   * @return true if word counts should be output.
   */
  public boolean getOutputWordCounts() {
    return this.m_OutputCounts;
  }

  /**
   * Chooses whether output instances hold word counts rather than a
   * 0/1 word presence indicator.
   *
   * @param outputWordCounts true if word counts should be output.
   */
  public void setOutputWordCounts(boolean outputWordCounts) {
    this.m_OutputCounts = outputWordCounts;
  }

  /**
   * Returns the tip text for the outputWordCounts property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String outputWordCountsTipText() {
      // The first literal ends with a space so that the parenthesised
      // remark does not run into the preceding "1" in the displayed text.
      return "Output word counts rather than boolean 0 or 1 "+
             "(indicating presence or absence of a word).";
  }

  /**
   * Returns the delimiters currently configured.
   *
   * @return Value of delimiters.
   */
  public String getDelimiters() {
    return this.delimiters;
  }
    
  /**
   * Replaces the configured delimiters.
   *
   * @param newDelimiters Value to assign to delimiters.
   */
  public void setDelimiters(String newDelimiters) {
    this.delimiters = newDelimiters;
  }

  /**
   * Returns the tip text for the delimiters property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String delimitersTipText() {
      final String description =
          "Set of delimiter characters to use in tokenizing ";
      final String defaults = "(default: \" \\n\\t.,:'\\\"()?!\"). ";
      final String caveat =
          "This option is ignored if onlyAlphabeticTokens option is set to"
          + " true.";
      return description + defaults + caveat;
  }

  /**
   * Returns the currently stored selected range.
   *
   * @return Value of m_SelectedRange.
   */
  public Range getSelectedRange() {
    return this.m_SelectedRange;
  }
    
  /**
   * Replaces the selected range with a new Range built from the given
   * string.
   *
   * @param newSelectedRange string from which the new m_SelectedRange
   * is constructed.
   */
  public void setSelectedRange(String newSelectedRange) {
    Range parsed = new Range(newSelectedRange);
    m_SelectedRange = parsed;
  }

  /**
   * Returns the attribute name prefix.
   *
   * @return The current attribute name prefix.
   */
  public String getAttributeNamePrefix() {
    return this.m_Prefix;
  }
    
  /**
   * Replaces the attribute name prefix.
   *
   * @param newPrefix String to use as the attribute name prefix.
   */
  public void setAttributeNamePrefix(String newPrefix) {
    this.m_Prefix = newPrefix;
  }

  /**
   * Returns the tip text for the attributeNamePrefix property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeNamePrefixTipText() {
      final String description = "Prefix for the created attribute names. ";
      final String defaults = "(default: \"\")";
      return description + defaults;
  }

  /**
   * Returns the number of words (per class if there is a class attribute
   * assigned) that the filter attempts to keep.
   *
   * @return the target number of words in the output vector (per class if
   * assigned).
   */
  public int getWordsToKeep() {
    return this.m_WordsToKeep;
  }
  
  /**
   * Stores the number of words (per class if there is a class attribute
   * assigned) that the filter attempts to keep.
   *
   * @param newWordsToKeep the target number of words in the output
   * vector (per class if assigned).
   */
  public void setWordsToKeep(int newWordsToKeep) {
    this.m_WordsToKeep = newWordsToKeep;
  }
  
  /**
   * Returns the tip text for the wordsToKeep property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String wordsToKeepTipText() {
      final String subject =
          "The number of words (per class if there is a class attribute ";
      final String remainder = "assigned) to attempt to keep.";
      return subject + remainder;
  }

  /**
   * Gets whether the word frequencies should be transformed into
   * log(1+fij), where fij is the frequency of word i in
   * document (instance) j.
   *
   * @return true if word frequencies are to be transformed.
   */
  public boolean getTFTransform() {
      return m_TFTransform;
  }
  
  /**
   * Sets whether the word frequencies should be transformed into
   * log(1+fij), where fij is the frequency of word i in
   * document (instance) j.
   *
   * @param TFTransform true if word frequencies are to be transformed.
   */
  public void setTFTransform(boolean TFTransform) {
      m_TFTransform = TFTransform;
  }
  
  /**
   * Returns the tip text for the TFTransform property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String TFTransformTipText() {
      final String summary =
          "Sets whether if the word frequencies should be transformed into:\n ";
      final String formula = "   log(1+fij) \n";
      final String legend =
          "       where fij is the frequency of word i in document (instance) j.";
      return summary + formula + legend;
  }
  
  /**
   * Gets whether the word frequencies in a document should be transformed
   * into: <br>
   * fij*log(num of Docs/num of Docs with word i) <br>
   *      where fij is the frequency of word i in document (instance) j.
   *
   * @return true if the word frequencies are to be transformed.
   */
  public boolean getIDFTransform() {
      return m_IDFTransform;
  }
  
  /**
   * Sets whether the word frequencies in a document should be transformed
   * into: <br>
   * fij*log(num of Docs/num of Docs with word i) <br>
   *      where fij is the frequency of word i in document (instance) j.
   *
   * @param IDFTransform true if the word frequencies are to be transformed.
   */
  public void setIDFTransform(boolean IDFTransform) {
      m_IDFTransform = IDFTransform;
  }
  
  /**
   * Returns the tip text for the IDFTransform property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String IDFTransformTipText() {
      final String summary =
          "Sets whether if the word frequencies in a document should be "
          + "transformed into: \n";
      final String formula =
          "   fij*log(num of Docs/num of Docs with word i) \n";
      final String legend =
          "      where fij is the frequency of word i in document (instance) j.";
      return summary + formula + legend;
  }


  /**
   * Gets whether the word frequencies for a document (instance) should
   * be normalized or not.
   *
   * @return true if word frequencies are to be normalized.
   */
  public boolean getNormalizeDocLength() {
      return m_normalizeDocLength;
  }
  
  /**
   * Sets whether the word frequencies for a document (instance) should
   * be normalized or not.
   *
   * @param normalizeDocLength true if word frequencies are to be normalized.
   */
  public void setNormalizeDocLength(boolean normalizeDocLength) {
      m_normalizeDocLength = normalizeDocLength;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String normalizeDocLengthTipText() {
      return "Sets whether if the word frequencies for a document (instance) "+
             "should be normalized or not.";
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -