📄 stringtowordvector.java

📁 代码是一个分类器的实现,其中使用了部分weka的源代码。可以将项目导入eclipse运行
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
    setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));    String nString = Utils.getOption('N', options);    if (nString.length() != 0) {      setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER));    } else {      setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER));    }        setLowerCaseTokens(Utils.getFlag('L', options));        setOnlyAlphabeticTokens(Utils.getFlag('A', options));        setUseStoplist(Utils.getFlag('S', options));        String stemmerString = Utils.getOption("stemmer", options);    if (stemmerString.length() == 0) {      setStemmer(null);    }    else {      String[] stemmerSpec = Utils.splitOptions(stemmerString);      if (stemmerSpec.length == 0)        throw new Exception("Invalid stemmer specification string");      String stemmerName = stemmerSpec[0];      stemmerSpec[0] = "";      Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance();      if (stemmer instanceof OptionHandler)        ((OptionHandler) stemmer).setOptions(stemmerSpec);      setStemmer(stemmer);    }  }  /**   * Gets the current settings of the filter.   
*   * @return an array of strings suitable for passing to setOptions   */  public String [] getOptions() {    String [] options = new String [22];    int current = 0;    options[current++] = "-D";     options[current++] = getDelimiters();    if (getSelectedRange() != null) {      options[current++] = "-R";       m_SelectedRange.setUpper(getInputFormat().numAttributes() - 1);      options[current++] = getSelectedRange().getRanges();    }    if (!"".equals(getAttributeNamePrefix())) {      options[current++] = "-P";       options[current++] = getAttributeNamePrefix();    }    options[current++] = "-W";     options[current++] = String.valueOf(getWordsToKeep());    if (getOutputWordCounts()) {      options[current++] = "-C";    }    if(getTFTransform())        options[current++] = "-T";        if(getIDFTransform())        options[current++] = "-I";        options[current++] = "-N"; options[current++] = "" + m_filterType;        if(this.getLowerCaseTokens())        options[current++] = "-L";        if(this.getOnlyAlphabeticTokens())        options[current++] = "-A";        if(this.getUseStoplist())        options[current++] = "-S";        if (getStemmer() != null) {      options[current++] = "-stemmer";      String spec = getStemmer().getClass().getName();      if (getStemmer() instanceof OptionHandler)        spec += " " + Utils.joinOptions(                          ((OptionHandler) getStemmer()).getOptions());      options[current++] = spec.trim();    }    options[current++] = "-M";     options[current++] = String.valueOf(getMinTermFreq());        if(this.getDoNotOperateOnPerClassBasis())      options[current++] = "-O";        while (current < options.length) {      options[current++] = "";    }    return options;  }  /**   * Default constructor. Targets 1000 words in the output.   */  public StringToWordVector() {  }  /**   * Constructor that allows specification of the target number of words   * in the output.   
*   * @param wordsToKeep the number of words in the output vector (per class   * if assigned).   */  public StringToWordVector(int wordsToKeep) {    m_WordsToKeep = wordsToKeep;  }    /**    * Used to store word counts for dictionary selection based on    * a threshold.   */  private class Count     implements Serializable {    /** for serialization */    static final long serialVersionUID = 2157223818584474321L;        /** the counts */    public int count, docCount;        /**     * the constructor     *      * @param c the count     */    public Count(int c) {       count = c;     }  }  /**    * Returns the Capabilities of this filter.   *   * @return            the capabilities of this object   * @see               Capabilities   */  public Capabilities getCapabilities() {    Capabilities result = super.getCapabilities();    // attributes    result.enableAllAttributes();    result.enable(Capability.MISSING_VALUES);        // class    result.enableAllClasses();    result.enable(Capability.MISSING_CLASS_VALUES);    result.enable(Capability.NO_CLASS);        return result;  }  /**   * Sets the format of the input instances.   *   * @param instanceInfo an Instances object containing the input    * instance structure (any instances contained in the object are    * ignored - only the structure is required).   * @return true if the outputFormat may be collected immediately   * @throws Exception if the input format can't be set    * successfully   */  public boolean setInputFormat(Instances instanceInfo)     throws Exception {    super.setInputFormat(instanceInfo);    avgDocLength = -1;    numInstances = -1;    return false;  }  /**   * Input an instance for filtering. Filter requires all   * training instances be read before producing output.   *   * @param instance the input instance.   * @return true if the filtered instance may now be   * collected with output().   * @throws IllegalStateException if no input structure has been defined.   
*/  public boolean input(Instance instance) throws Exception {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    if (m_NewBatch) {      resetQueue();      m_NewBatch = false;    }    if (isFirstBatchDone()) {      FastVector fv = new FastVector();      int firstCopy = convertInstancewoDocNorm(instance, fv);      Instance inst = (Instance)fv.elementAt(0);      if (m_filterType != FILTER_NONE) {	normalizeInstance(inst, firstCopy);      }      push(inst);      return true;    } else {      bufferInput(instance);      return false;    }  }  /**   * Signify that this batch of input to the filter is finished.    * If the filter requires all instances prior to filtering,   * output() may now be called to retrieve the filtered instances.   *   * @return true if there are instances pending output.   * @throws IllegalStateException if no input structure has been defined.   */  public boolean batchFinished() throws Exception {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    // We only need to do something in this method    // if the first batch hasn't been processed. Otherwise    // input() has already done all the work.    
if (!isFirstBatchDone()) {      // Determine the dictionary from the first batch (training data)      determineDictionary();      // Convert all instances w/o normalization      FastVector fv = new FastVector();      int firstCopy=0;      for(int i=0; i < numInstances; i++) {	firstCopy = convertInstancewoDocNorm(getInputFormat().instance(i), fv);      }            // Need to compute average document length if necessary      if (m_filterType != FILTER_NONE) {	avgDocLength = 0;	for(int i=0; i<fv.size(); i++) {	  Instance inst = (Instance) fv.elementAt(i);	  double docLength = 0;	  for(int j=0; j<inst.numValues(); j++) {	    if(inst.index(j)>=firstCopy) {	      docLength += inst.valueSparse(j) * inst.valueSparse(j);	    }	  }        	  avgDocLength += Math.sqrt(docLength);	}	avgDocLength /= numInstances;      }      // Perform normalization if necessary.      if (m_filterType == FILTER_NORMALIZE_ALL) {	for(int i=0; i<fv.size(); i++) {	  normalizeInstance((Instance) fv.elementAt(i), firstCopy);	}      }      // Push all instances into the output queue      for(int i=0; i<fv.size(); i++) {	push((Instance) fv.elementAt(i));      }    }    // Flush the input    flushInput();    m_NewBatch = true;    m_FirstBatchDone = true;    return (numPendingOutput() != 0);  }  /**   * Returns a string describing this filter   * @return a description of the filter suitable for   * displaying in the explorer/experimenter gui   */    public String globalInfo() {    return "Converts String attributes into a set of attributes representing "+           "word occurrence information from the text contained in the "+           "strings. The set of words (attributes) is determined by the first "+           "batch filtered (typically training data).";  }      /**   * Gets whether output instances contain 0 or 1 indicating word   * presence, or word counts.   *   * @return true if word counts should be output.   
*/  public boolean getOutputWordCounts() {    return m_OutputCounts;  }  /**   * Sets whether output instances contain 0 or 1 indicating word   * presence, or word counts.   *   * @param outputWordCounts true if word counts should be output.   */  public void setOutputWordCounts(boolean outputWordCounts) {    m_OutputCounts = outputWordCounts;  }  /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String outputWordCountsTipText() {      return "Output word counts rather than boolean 0 or 1"+             "(indicating presence or absence of a word).";  }  /**   * Get the value of delimiters.   *   * @return Value of delimiters.   */  public String getDelimiters() {    return delimiters.replaceAll("\"", "\\\\\"").replaceAll("'", "\\\\'");  }      /**   * Set the value of delimiters.   *   * @param newDelimiters Value to assign to delimiters.   */  public void setDelimiters(String newDelimiters) {    delimiters = newDelimiters.replaceAll("\\\\\"", "\"").replaceAll("\\\\'", "'");  }  /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String delimitersTipText() {      return "Set of delimiter characters to use in tokenizing "+             "(default: \" \\n\\t.,:'\\\"()?!\"). "+             "This option is ignored if onlyAlphabeticTokens option is set to"+             " true.";  }  /**   * Get the value of m_SelectedRange.   *   * @return Value of m_SelectedRange.   */  public Range getSelectedRange() {    return m_SelectedRange;  }      /**   * Set the value of m_SelectedRange.   *   * @param newSelectedRange Value to assign to m_SelectedRange.   */  public void setSelectedRange(String newSelectedRange) {    m_SelectedRange = new Range(newSelectedRange);  }  /**   * Get the attribute name prefix.   *   * @return The current attribute name prefix.   
*/
  public String getAttributeNamePrefix() {
    return this.m_Prefix;
  }

  /**
   * Stores the prefix used for attribute names.
   *
   * @param newPrefix String to use as the attribute name prefix.
   */
  public void setAttributeNamePrefix(String newPrefix) {
    this.m_Prefix = newPrefix;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeNamePrefixTipText() {
    String tip = "Prefix for the created attribute names. "+
                 "(default: \"\")";
    return tip;
  }

  /**
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -