📄 stringtowordvector.java

📁 代码是一个分类器的实现,其中使用了部分weka的源代码。可以将项目导入eclipse运行
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
   * Gets the number of words (per class if there is a class attribute   * assigned) to attempt to keep.   *   * @return the target number of words in the output vector (per class if   * assigned).   */  public int getWordsToKeep() {    return m_WordsToKeep;  }    /**   * Sets the number of words (per class if there is a class attribute   * assigned) to attempt to keep.   *   * @param newWordsToKeep the target number of words in the output    * vector (per class if assigned).   */  public void setWordsToKeep(int newWordsToKeep) {    m_WordsToKeep = newWordsToKeep;  }    /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String wordsToKeepTipText() {      return "The number of words (per class if there is a class attribute "+             "assigned) to attempt to keep.";  }  /** Gets whether if the word frequencies should be transformed into   *  log(1+fij) where fij is the frequency of word i in document(instance) j.   *   * @return true if word frequencies are to be transformed.   */  public boolean getTFTransform() {      return this.m_TFTransform;  }    /** Sets whether if the word frequencies should be transformed into   *  log(1+fij) where fij is the frequency of word i in document(instance) j.   *   * @param TFTransform true if word frequencies are to be transformed.   
*/
  public void setTFTransform(boolean TFTransform) {
    m_TFTransform = TFTransform;
  }

  /**
   * Supplies the tip text for the TFTransform property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String TFTransformTipText() {
    final String tip =
        "Sets whether if the word frequencies should be transformed into:\n "
        + "   log(1+fij) \n"
        + "       where fij is the frequency of word i in document (instance) j.";
    return tip;
  }

  /**
   * Reports whether the word frequencies in a document are transformed
   * into: <br>
   * fij*log(num of Docs/num of Docs with word i) <br>
   * where fij is the frequency of word i in document (instance) j.
   *
   * @return true if the word frequencies are to be transformed
   */
  public boolean getIDFTransform() {
    return m_IDFTransform;
  }

  /**
   * Chooses whether the word frequencies in a document are transformed
   * into: <br>
   * fij*log(num of Docs/num of Docs with word i) <br>
   * where fij is the frequency of word i in document (instance) j.
   *
   * @param IDFTransform true if the word frequencies are to be transformed
   */
  public void setIDFTransform(boolean IDFTransform) {
    m_IDFTransform = IDFTransform;
  }

  /**
   * Supplies the tip text for the IDFTransform property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String IDFTransformTipText() {
    final String tip =
        "Sets whether if the word frequencies in a document should be "
        + "transformed into: \n"
        + "   fij*log(num of Docs/num of Docs with word i) \n"
        + "      where fij is the frequency of word i in document (instance) j.";
    return tip;
  }

  /**
   * Gets the setting that controls whether the word frequencies for a
   * document (instance) are normalized.
   *
   * @return the current filter type wrapped in a SelectedTag over
   *         TAGS_FILTER
*/  public SelectedTag getNormalizeDocLength() {    return new SelectedTag(m_filterType, TAGS_FILTER);  }    /** Sets whether if the word frequencies for a document (instance) should   *  be normalized or not.   *   * @param newType the new type.   */  public void setNormalizeDocLength(SelectedTag newType) {        if (newType.getTags() == TAGS_FILTER) {      m_filterType = newType.getSelectedTag().getID();    }  }  /**   * Returns the tip text for this property   *   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String normalizeDocLengthTipText() {      return "Sets whether if the word frequencies for a document (instance) "+             "should be normalized or not.";  }    /** Gets whether if the tokens are to be formed only from contiguous    *  alphabetic sequences. The delimiter string is ignored if this is true.   *   * @return true if tokens are to be formed from contiguous alphabetic    * characters.   */  public boolean getOnlyAlphabeticTokens() {      return m_onlyAlphabeticTokens;  }      /** Sets whether if tokens are to be formed only from contiguous alphabetic   * character sequences. The delimiter string is ignored if this option is    * set to true.   *   * @param tokenizeOnlyAlphabeticSequences should be set to true if only alphabetic    * tokens should be formed.   */  public void setOnlyAlphabeticTokens(boolean tokenizeOnlyAlphabeticSequences) {      m_onlyAlphabeticTokens = tokenizeOnlyAlphabeticSequences;  }  /**   * Returns the tip text for this property.   *   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String onlyAlphabeticTokensTipText() {      return "Sets whether if the word tokens are to be formed only from "+             "contiguous alphabetic sequences (The delimiter string is "+             "ignored if this option is set to true).";  }    /** Gets whether if the tokens are to be downcased or not.   
*   * @return true if the tokens are to be downcased.   */  public boolean getLowerCaseTokens() {      return this.m_lowerCaseTokens;  }    /** Sets whether if the tokens are to be downcased or not. (Doesn't affect   * non-alphabetic characters in tokens).   *   * @param downCaseTokens should be true if only lower case tokens are    * to be formed.   */  public void setLowerCaseTokens(boolean downCaseTokens) {      this.m_lowerCaseTokens = downCaseTokens;  }  /**   * Returns the tip text for this property.   *   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String doNotOperateOnPerClassBasisTipText() {      return "If this is set, the maximum number of words and the "	+ "minimum term frequency is not enforced on a per-class "	+ "basis but based on the documents in all the classes "	+  "(even if a class attribute is set).";  }  /**   * Get the DoNotOperateOnPerClassBasis value.   * @return the DoNotOperateOnPerClassBasis value.   */  public boolean getDoNotOperateOnPerClassBasis() {    return m_doNotOperateOnPerClassBasis;  }  /**   * Set the DoNotOperateOnPerClassBasis value.   * @param newDoNotOperateOnPerClassBasis The new DoNotOperateOnPerClassBasis value.   */  public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) {    this.m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;  }  /**   * Returns the tip text for this property.   *   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String minTermFreqTipText() {      return "Sets the minimum term frequency. This is enforced "	+ "on a per-class basis.";  }  /**   * Get the MinTermFreq value.   * @return the MinTermFreq value.   */  public int getMinTermFreq() {    return m_minTermFreq;  }  /**   * Set the MinTermFreq value.   * @param newMinTermFreq The new MinTermFreq value.   
*/
  public void setMinTermFreq(int newMinTermFreq) {
    m_minTermFreq = newMinTermFreq;
  }

  /**
   * Supplies the tip text for the lowerCaseTokens property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String lowerCaseTokensTipText() {
    final String tip =
        "If set then all the word tokens are converted to lower case "
        + "before being added to the dictionary.";
    return tip;
  }

  /**
   * Reports whether the words on the stoplist are to be ignored (the
   * stoplist is in weka.core.StopWords).
   *
   * @return true if the words on the stoplist are to be ignored
   */
  public boolean getUseStoplist() {
    return this.m_useStoplist;
  }

  /**
   * Chooses whether the words that are on a stoplist are to be ignored (the
   * stoplist is in weka.core.StopWords).
   *
   * @param useStoplist true if the tokens that are on a stoplist are to be
   *        ignored
   */
  public void setUseStoplist(boolean useStoplist) {
    this.m_useStoplist = useStoplist;
  }

  /**
   * Supplies the tip text for the useStoplist property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String useStoplistTipText() {
    final String tip =
        "Ignores all the words that are on the stoplist, if set to true.";
    return tip;
  }

  /**
   * Sets the stemming algorithm to use. Passing null means no stemming at
   * all: a new NullStemmer is stored in that case.
   *
   * @param value     the configured stemming algorithm, or null
   * @see             NullStemmer
   */
  public void setStemmer(Stemmer value) {
    m_Stemmer = (value != null) ? value : new NullStemmer();
  }

  /**
   * Returns the current stemming algorithm.
   * NOTE(review): setStemmer never stores null, so a null result is only
   * possible if the field's initial value is null - confirm against the
   * field declaration.
   *
   * @return          the current stemming algorithm
   */
  public Stemmer getStemmer() {
    final Stemmer current = m_Stemmer;
    return current;
  }

  /**
   * Returns the tip text for this property.
*
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String stemmerTipText() {
    return "The stemming algorithm to use on the words.";
  }

  /**
   * Sorts an array of ints into ascending order, in place.
   *
   * The previous hand-rolled shell sort used 1-based index arithmetic
   * (i starting at h + 1, inner loop guarded by j &gt; h) on a 0-based
   * array, so element 0 was never compared or moved; for example
   * {3, 1, 2} was returned unchanged. Delegating to the standard library
   * sorts every element.
   *
   * @param array the array to sort
   */
  private static void sortArray(int [] array) {
    // Fully qualified so that no additional import is required.
    java.util.Arrays.sort(array);
  }

  /**
   * Determines the selected range of attributes to convert.
   *
   * If no range has been set, every STRING attribute of the input format
   * is selected. The range is then narrowed so that it only contains
   * attributes that are both in the range and of type STRING.
   */
  private void determineSelectedRange() {

    Instances inputFormat = getInputFormat();

    // Calculate the default set of fields to convert
    if (m_SelectedRange == null) {
      StringBuffer fields = new StringBuffer();
      for (int j = 0; j < inputFormat.numAttributes(); j++) {
        if (inputFormat.attribute(j).type() == Attribute.STRING) {
          // The range string is built from j + 1, i.e. 1-based positions.
          fields.append((j + 1) + ",");
        }
      }
      m_SelectedRange = new Range(fields.toString());
    }
    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);

    // Prevent the user from converting non-string fields
    StringBuffer fields = new StringBuffer();
    for (int j = 0; j < inputFormat.numAttributes(); j++) {
      if (m_SelectedRange.isInRange(j)
          && inputFormat.attribute(j).type() == Attribute.STRING) {
        fields.append((j + 1) + ",");
      }
    }
    m_SelectedRange.setRanges(fields.toString());
    // The upper limit is set again after the range string was replaced.
    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);

    // System.err.println("Selected Range: " + getSelectedRange().getRanges());
  }

  /**
   * determines the dictionary
   */
  private void determineDictionary() {
    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
      values = 
getInputFormat().attribute(classInd).numValues();    }    //TreeMap dictionaryArr [] = new TreeMap[values];    TreeMap [] dictionaryArr = new TreeMap[values];    for (int i = 0; i < values; i++) {      dictionaryArr[i] = new TreeMap();    }    // Make sure we know which fields to convert    determineSelectedRange();    // Tokenize all training text into an orderedMap of "words".    for (int i = 0; i < getInputFormat().numInstances(); i++) {      Instance instance = getInputFormat().instance(i);      int vInd = 0;      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {	vInd = (int)instance.classValue();      }      // Iterate through all relevant string attributes of the current instance      Hashtable h = new Hashtable();      for (int j = 0; j < instance.numAttributes(); j++) {         if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {	  // Get tokenizer          Enumeration st;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -