stringtowordvector.java

来自「Weka」· Java 代码 · 共 1,638 行 · 第 1/4 页

JAVA
1,638
字号
    value = Utils.getOption('W', options);    if (value.length() != 0)      setWordsToKeep(Integer.valueOf(value).intValue());    else      setWordsToKeep(1000);    value = Utils.getOption('M', options);    if (value.length() != 0)      setMinTermFreq(Integer.valueOf(value).intValue());    else      setMinTermFreq(1);        setOutputWordCounts(Utils.getFlag('C', options));    setTFTransform(Utils.getFlag('T',  options));    setIDFTransform(Utils.getFlag('I',  options));        setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options));    String nString = Utils.getOption('N', options);    if (nString.length() != 0)      setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER));    else      setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER));        setLowerCaseTokens(Utils.getFlag('L', options));        setUseStoplist(Utils.getFlag('S', options));        String stemmerString = Utils.getOption("stemmer", options);    if (stemmerString.length() == 0) {      setStemmer(null);    }    else {      String[] stemmerSpec = Utils.splitOptions(stemmerString);      if (stemmerSpec.length == 0)        throw new Exception("Invalid stemmer specification string");      String stemmerName = stemmerSpec[0];      stemmerSpec[0] = "";      Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance();      if (stemmer instanceof OptionHandler)        ((OptionHandler) stemmer).setOptions(stemmerSpec);      setStemmer(stemmer);    }    value = Utils.getOption("stopwords", options);    if (value.length() != 0)      setStopwords(new File(value));    else      setStopwords(null);    String tokenizerString = Utils.getOption("tokenizer", options);    if (tokenizerString.length() == 0) {      setTokenizer(new WordTokenizer());    }    else {      String[] tokenizerSpec = Utils.splitOptions(tokenizerString);      if (tokenizerSpec.length == 0)        throw new Exception("Invalid tokenizer specification string");      String tokenizerName = 
tokenizerSpec[0];      tokenizerSpec[0] = "";      Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerName).newInstance();      if (tokenizer instanceof OptionHandler)        ((OptionHandler) tokenizer).setOptions(tokenizerSpec);      setTokenizer(tokenizer);    }  }  /**   * Gets the current settings of the filter.   *   * @return an array of strings suitable for passing to setOptions   */  public String[] getOptions() {    Vector        result;    result = new Vector();    result.add("-R");     result.add(getSelectedRange().getRanges());    if (getInvertSelection())      result.add("-V");    if (!"".equals(getAttributeNamePrefix())) {      result.add("-P");       result.add(getAttributeNamePrefix());    }    result.add("-W");     result.add(String.valueOf(getWordsToKeep()));    if (getOutputWordCounts())      result.add("-C");    if (getTFTransform())      result.add("-T");    if (getIDFTransform())      result.add("-I");    result.add("-N");     result.add("" + m_filterType);    if (getLowerCaseTokens())      result.add("-L");    if (getUseStoplist())      result.add("-S");    if (getStemmer() != null) {      result.add("-stemmer");      String spec = getStemmer().getClass().getName();      if (getStemmer() instanceof OptionHandler)	spec += " " + Utils.joinOptions(	    ((OptionHandler) getStemmer()).getOptions());      result.add(spec.trim());    }    result.add("-M");     result.add(String.valueOf(getMinTermFreq()));    if (getDoNotOperateOnPerClassBasis())      result.add("-O");    if (!getStopwords().isDirectory()) {      result.add("-stopwords");      result.add(getStopwords().getAbsolutePath());    }    result.add("-tokenizer");    String spec = getTokenizer().getClass().getName();    if (getTokenizer() instanceof OptionHandler)      spec += " " + Utils.joinOptions(	  ((OptionHandler) getTokenizer()).getOptions());    result.add(spec.trim());    return (String[]) result.toArray(new String[result.size()]);  }  /**   * Constructor that allows 
specification of the target number of words   * in the output.   *   * @param wordsToKeep the number of words in the output vector (per class   * if assigned).   */  public StringToWordVector(int wordsToKeep) {    m_WordsToKeep = wordsToKeep;  }    /**    * Used to store word counts for dictionary selection based on    * a threshold.   */  private class Count     implements Serializable {    /** for serialization */    static final long serialVersionUID = 2157223818584474321L;        /** the counts */    public int count, docCount;        /**     * the constructor     *      * @param c the count     */    public Count(int c) {       count = c;     }  }  /**    * Returns the Capabilities of this filter.   *   * @return            the capabilities of this object   * @see               Capabilities   */  public Capabilities getCapabilities() {    Capabilities result = super.getCapabilities();    // attributes    result.enableAllAttributes();    result.enable(Capability.MISSING_VALUES);        // class    result.enableAllClasses();    result.enable(Capability.MISSING_CLASS_VALUES);    result.enable(Capability.NO_CLASS);        return result;  }  /**   * Sets the format of the input instances.   *   * @param instanceInfo an Instances object containing the input    * instance structure (any instances contained in the object are    * ignored - only the structure is required).   * @return true if the outputFormat may be collected immediately   * @throws Exception if the input format can't be set    * successfully   */  public boolean setInputFormat(Instances instanceInfo)     throws Exception {    super.setInputFormat(instanceInfo);    m_SelectedRange.setUpper(instanceInfo.numAttributes() - 1);    m_AvgDocLength = -1;    m_NumInstances = -1;    return false;  }  /**   * Input an instance for filtering. Filter requires all   * training instances be read before producing output.   *   * @param instance the input instance.   
* @return true if the filtered instance may now be   * collected with output().   * @throws IllegalStateException if no input structure has been defined.   */  public boolean input(Instance instance) throws Exception {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    if (m_NewBatch) {      resetQueue();      m_NewBatch = false;    }    if (isFirstBatchDone()) {      FastVector fv = new FastVector();      int firstCopy = convertInstancewoDocNorm(instance, fv);      Instance inst = (Instance)fv.elementAt(0);      if (m_filterType != FILTER_NONE) {	normalizeInstance(inst, firstCopy);      }      push(inst);      return true;    } else {      bufferInput(instance);      return false;    }  }  /**   * Signify that this batch of input to the filter is finished.    * If the filter requires all instances prior to filtering,   * output() may now be called to retrieve the filtered instances.   *   * @return true if there are instances pending output.   * @throws IllegalStateException if no input structure has been defined.   */  public boolean batchFinished() throws Exception {    if (getInputFormat() == null) {      throw new IllegalStateException("No input instance format defined");    }    // We only need to do something in this method    // if the first batch hasn't been processed. Otherwise    // input() has already done all the work.    
if (!isFirstBatchDone()) {      // Determine the dictionary from the first batch (training data)      determineDictionary();      // Convert all instances w/o normalization      FastVector fv = new FastVector();      int firstCopy=0;      for(int i=0; i < m_NumInstances; i++) {	firstCopy = convertInstancewoDocNorm(getInputFormat().instance(i), fv);      }            // Need to compute average document length if necessary      if (m_filterType != FILTER_NONE) {	m_AvgDocLength = 0;	for(int i=0; i<fv.size(); i++) {	  Instance inst = (Instance) fv.elementAt(i);	  double docLength = 0;	  for(int j=0; j<inst.numValues(); j++) {	    if(inst.index(j)>=firstCopy) {	      docLength += inst.valueSparse(j) * inst.valueSparse(j);	    }	  }        	  m_AvgDocLength += Math.sqrt(docLength);	}	m_AvgDocLength /= m_NumInstances;      }      // Perform normalization if necessary.      if (m_filterType == FILTER_NORMALIZE_ALL) {	for(int i=0; i<fv.size(); i++) {	  normalizeInstance((Instance) fv.elementAt(i), firstCopy);	}      }      // Push all instances into the output queue      for(int i=0; i<fv.size(); i++) {	push((Instance) fv.elementAt(i));      }    }    // Flush the input    flushInput();    m_NewBatch = true;    m_FirstBatchDone = true;    return (numPendingOutput() != 0);  }  /**   * Returns a string describing this filter   * @return a description of the filter suitable for   * displaying in the explorer/experimenter gui   */    public String globalInfo() {    return         "Converts String attributes into a set of attributes representing "      + "word occurrence (depending on the tokenizer) information from the "      + "text contained in the strings. The set of words (attributes) is "      + "determined by the first batch filtered (typically training data).";  }      /**   * Gets whether output instances contain 0 or 1 indicating word   * presence, or word counts.   *   * @return true if word counts should be output.   
*/
  public boolean getOutputWordCounts() {
    return m_OutputCounts;
  }

  /**
   * Sets whether output instances contain 0 or 1 indicating word
   * presence, or word counts.
   *
   * @param outputWordCounts true if word counts should be output.
   */
  public void setOutputWordCounts(boolean outputWordCounts) {
    m_OutputCounts = outputWordCounts;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String outputWordCountsTipText() {
    // The first fragment ends with a space so the two halves do not run
    // together as "0 or 1(indicating" in the displayed tip.
    return "Output word counts rather than boolean 0 or 1 "
      + "(indicating presence or absence of a word).";
  }

  /**
   * Gets the range of attributes the filter operates on.
   *
   * @return the Range object currently held in m_SelectedRange.
   */
  public Range getSelectedRange() {
    return m_SelectedRange;
  }

  /**
   * Replaces the selected range with a new Range built from the given
   * range string.
   *
   * @param newSelectedRange range string used to construct the new Range.
   */
  public void setSelectedRange(String newSelectedRange) {
    m_SelectedRange = new Range(newSelectedRange);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "Specify range of attributes to act on."
      + " This is a comma separated list of attribute indices, with"
      + " \"first\" and \"last\" valid values. Specify an inclusive"
      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
  }

  /**
   * Gets the current range selection.
   *
   * @return a string containing a comma separated list of ranges
   */
  public String getAttributeIndices() {
    return m_SelectedRange.getRanges();
  }

  /**
   * Sets which attributes are to be worked on. Unlike setSelectedRange,
   * this updates the existing Range object instead of replacing it.
   *
   * @param rangeList a string representing the list of attributes. Since
   * the string will typically come from a user, attributes are indexed from
   * 1. <br>
   * eg: first-3,5,6-last
   * @throws IllegalArgumentException if an invalid range list is supplied
   */
  public void setAttributeIndices(String rangeList) {
    m_SelectedRange.setRanges(rangeList);
  }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?