📄 stringtowordvector.java
字号:
* Gets the number of words (per class if there is a class attribute * assigned) to attempt to keep. * * @return the target number of words in the output vector (per class if * assigned). */ public int getWordsToKeep() { return m_WordsToKeep; } /** * Sets the number of words (per class if there is a class attribute * assigned) to attempt to keep. * * @param newWordsToKeep the target number of words in the output * vector (per class if assigned). */ public void setWordsToKeep(int newWordsToKeep) { m_WordsToKeep = newWordsToKeep; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String wordsToKeepTipText() { return "The number of words (per class if there is a class attribute "+ "assigned) to attempt to keep."; } /** Gets whether if the word frequencies should be transformed into * log(1+fij) where fij is the frequency of word i in document(instance) j. * * @return true if word frequencies are to be transformed. */ public boolean getTFTransform() { return this.m_TFTransform; } /** Sets whether if the word frequencies should be transformed into * log(1+fij) where fij is the frequency of word i in document(instance) j. * * @param TFTransform true if word frequencies are to be transformed. */ public void setTFTransform(boolean TFTransform) { this.m_TFTransform = TFTransform; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String TFTransformTipText() { return "Sets whether if the word frequencies should be transformed into:\n "+ " log(1+fij) \n"+ " where fij is the frequency of word i in document (instance) j."; } /** Sets whether if the word frequencies in a document should be transformed * into: <br> * fij*log(num of Docs/num of Docs with word i) <br> * where fij is the frequency of word i in document(instance) j. 
* * @return true if the word frequencies are to be transformed. */ public boolean getIDFTransform() { return this.m_IDFTransform; } /** Sets whether if the word frequencies in a document should be transformed * into: <br> * fij*log(num of Docs/num of Docs with word i) <br> * where fij is the frequency of word i in document(instance) j. * * @param IDFTransform true if the word frequecies are to be transformed */ public void setIDFTransform(boolean IDFTransform) { this.m_IDFTransform = IDFTransform; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String IDFTransformTipText() { return "Sets whether if the word frequencies in a document should be "+ "transformed into: \n"+ " fij*log(num of Docs/num of Docs with word i) \n"+ " where fij is the frequency of word i in document (instance) j."; } /** Gets whether if the word frequencies for a document (instance) should * be normalized or not. * * @return true if word frequencies are to be normalized. */ public SelectedTag getNormalizeDocLength() { return new SelectedTag(m_filterType, TAGS_FILTER); } /** Sets whether if the word frequencies for a document (instance) should * be normalized or not. * * @param newType the new type. */ public void setNormalizeDocLength(SelectedTag newType) { if (newType.getTags() == TAGS_FILTER) { m_filterType = newType.getSelectedTag().getID(); } } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String normalizeDocLengthTipText() { return "Sets whether if the word frequencies for a document (instance) "+ "should be normalized or not."; } /** Gets whether if the tokens are to be formed only from contiguous * alphabetic sequences. The delimiter string is ignored if this is true. * * @return true if tokens are to be formed from contiguous alphabetic * characters. 
*/
public boolean getOnlyAlphabeticTokens() {
  return this.m_onlyAlphabeticTokens;
}

/**
 * Sets whether tokens are to be formed only from contiguous alphabetic
 * character sequences. The delimiter string is ignored if this option is
 * set to true.
 *
 * @param tokenizeOnlyAlphabeticSequences should be set to true if only
 *        alphabetic tokens should be formed.
 */
public void setOnlyAlphabeticTokens(boolean tokenizeOnlyAlphabeticSequences) {
  this.m_onlyAlphabeticTokens = tokenizeOnlyAlphabeticSequences;
}

/**
 * Returns the tip text for the onlyAlphabeticTokens property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String onlyAlphabeticTokensTipText() {
  final String tip =
    "Sets whether if the word tokens are to be formed only from "
    + "contiguous alphabetic sequences (The delimiter string is "
    + "ignored if this option is set to true).";
  return tip;
}

/**
 * Gets whether the tokens are to be downcased or not.
 *
 * @return true if the tokens are to be downcased.
 */
public boolean getLowerCaseTokens() {
  return m_lowerCaseTokens;
}

/**
 * Sets whether the tokens are to be downcased or not. (Doesn't affect
 * non-alphabetic characters in tokens).
 *
 * @param downCaseTokens should be true if only lower case tokens are
 *        to be formed.
 */
public void setLowerCaseTokens(boolean downCaseTokens) {
  m_lowerCaseTokens = downCaseTokens;
}

/**
 * Returns the tip text for the doNotOperateOnPerClassBasis property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String doNotOperateOnPerClassBasisTipText() {
  final String tip =
    "If this is set, the maximum number of words and the "
    + "minimum term frequency is not enforced on a per-class "
    + "basis but based on the documents in all the classes "
    + "(even if a class attribute is set).";
  return tip;
}

/**
 * Get the DoNotOperateOnPerClassBasis value.
 * @return the DoNotOperateOnPerClassBasis value.
*/
public boolean getDoNotOperateOnPerClassBasis() {
  return this.m_doNotOperateOnPerClassBasis;
}

/**
 * Set the DoNotOperateOnPerClassBasis value.
 *
 * @param newDoNotOperateOnPerClassBasis The new
 *        DoNotOperateOnPerClassBasis value.
 */
public void setDoNotOperateOnPerClassBasis(boolean newDoNotOperateOnPerClassBasis) {
  m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;
}

/**
 * Returns the tip text for the minTermFreq property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String minTermFreqTipText() {
  final String tip =
    "Sets the minimum term frequency. This is enforced "
    + "on a per-class basis.";
  return tip;
}

/**
 * Get the MinTermFreq value.
 *
 * @return the MinTermFreq value.
 */
public int getMinTermFreq() {
  return this.m_minTermFreq;
}

/**
 * Set the MinTermFreq value.
 *
 * @param newMinTermFreq The new MinTermFreq value.
 */
public void setMinTermFreq(int newMinTermFreq) {
  m_minTermFreq = newMinTermFreq;
}

/**
 * Returns the tip text for the lowerCaseTokens property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String lowerCaseTokensTipText() {
  final String tip =
    "If set then all the word tokens are converted to lower case "
    + "before being added to the dictionary.";
  return tip;
}

/**
 * Gets whether the words on the stoplist are to be ignored (The stoplist
 * is in weka.core.StopWords).
 *
 * @return true if the words on the stoplist are to be ignored.
 */
public boolean getUseStoplist() {
  return this.m_useStoplist;
}

/**
 * Sets whether the words that are on a stoplist are to be ignored (The
 * stop list is in weka.core.StopWords).
 *
 * @param useStoplist true if the tokens that are on a stoplist are to be
 *        ignored.
 */
public void setUseStoplist(boolean useStoplist) {
  this.m_useStoplist = useStoplist;
}

/**
 * Returns the tip text for this property.
* * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String useStoplistTipText() { return "Ignores all the words that are on the stoplist, if set to true."; } /** * the stemming algorithm to use, null means no stemming at all (i.e., the * NullStemmer is used) * * @param value the configured stemming algorithm, or null * @see NullStemmer */ public void setStemmer(Stemmer value) { if (value != null) m_Stemmer = value; else m_Stemmer = new NullStemmer(); } /** * Returns the current stemming algorithm, null if none is used. * * @return the current stemming algorithm, null if none set */ public Stemmer getStemmer() { return m_Stemmer; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String stemmerTipText() { return "The stemming algorithm to use on the words."; } /** * sorts an array * * @param array the array to sort */ private static void sortArray(int [] array) { int i, j, h, N = array.length - 1; for (h = 1; h <= N / 9; h = 3 * h + 1); for (; h > 0; h /= 3) { for (i = h + 1; i <= N; i++) { int v = array[i]; j = i; while (j > h && array[j - h] > v ) { array[j] = array[j - h]; j -= h; } array[j] = v; } } } /** * determines the selected range */ private void determineSelectedRange() { Instances inputFormat = getInputFormat(); // Calculate the default set of fields to convert if (m_SelectedRange == null) { StringBuffer fields = new StringBuffer(); for (int j = 0; j < inputFormat.numAttributes(); j++) { if (inputFormat.attribute(j).type() == Attribute.STRING) fields.append((j + 1) + ","); } m_SelectedRange = new Range(fields.toString()); } m_SelectedRange.setUpper(inputFormat.numAttributes() - 1); // Prevent the user from converting non-string fields StringBuffer fields = new StringBuffer(); for (int j = 0; j < inputFormat.numAttributes(); j++) { if (m_SelectedRange.isInRange(j) && 
inputFormat.attribute(j).type() == Attribute.STRING) fields.append((j + 1) + ","); } m_SelectedRange.setRanges(fields.toString()); m_SelectedRange.setUpper(inputFormat.numAttributes() - 1); // System.err.println("Selected Range: " + getSelectedRange().getRanges()); } /** * determines the dictionary */ private void determineDictionary() { // Operate on a per-class basis if class attribute is set int classInd = getInputFormat().classIndex(); int values = 1; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { values = getInputFormat().attribute(classInd).numValues(); } //TreeMap dictionaryArr [] = new TreeMap[values]; TreeMap [] dictionaryArr = new TreeMap[values]; for (int i = 0; i < values; i++) { dictionaryArr[i] = new TreeMap(); } // Make sure we know which fields to convert determineSelectedRange(); // Tokenize all training text into an orderedMap of "words". for (int i = 0; i < getInputFormat().numInstances(); i++) { Instance instance = getInputFormat().instance(i); int vInd = 0; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { vInd = (int)instance.classValue(); } // Iterate through all relevant string attributes of the current instance Hashtable h = new Hashtable(); for (int j = 0; j < instance.numAttributes(); j++) { if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { // Get tokenizer Enumeration st;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -