📄 stringtowordvector.java
字号:
setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options)); String nString = Utils.getOption('N', options); if (nString.length() != 0) { setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER)); } else { setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER)); } setLowerCaseTokens(Utils.getFlag('L', options)); setOnlyAlphabeticTokens(Utils.getFlag('A', options)); setUseStoplist(Utils.getFlag('S', options)); String stemmerString = Utils.getOption("stemmer", options); if (stemmerString.length() == 0) { setStemmer(null); } else { String[] stemmerSpec = Utils.splitOptions(stemmerString); if (stemmerSpec.length == 0) throw new Exception("Invalid stemmer specification string"); String stemmerName = stemmerSpec[0]; stemmerSpec[0] = ""; Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance(); if (stemmer instanceof OptionHandler) ((OptionHandler) stemmer).setOptions(stemmerSpec); setStemmer(stemmer); } } /** * Gets the current settings of the filter. 
* * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [22]; int current = 0; options[current++] = "-D"; options[current++] = getDelimiters(); if (getSelectedRange() != null) { options[current++] = "-R"; m_SelectedRange.setUpper(getInputFormat().numAttributes() - 1); options[current++] = getSelectedRange().getRanges(); } if (!"".equals(getAttributeNamePrefix())) { options[current++] = "-P"; options[current++] = getAttributeNamePrefix(); } options[current++] = "-W"; options[current++] = String.valueOf(getWordsToKeep()); if (getOutputWordCounts()) { options[current++] = "-C"; } if(getTFTransform()) options[current++] = "-T"; if(getIDFTransform()) options[current++] = "-I"; options[current++] = "-N"; options[current++] = "" + m_filterType; if(this.getLowerCaseTokens()) options[current++] = "-L"; if(this.getOnlyAlphabeticTokens()) options[current++] = "-A"; if(this.getUseStoplist()) options[current++] = "-S"; if (getStemmer() != null) { options[current++] = "-stemmer"; String spec = getStemmer().getClass().getName(); if (getStemmer() instanceof OptionHandler) spec += " " + Utils.joinOptions( ((OptionHandler) getStemmer()).getOptions()); options[current++] = spec.trim(); } options[current++] = "-M"; options[current++] = String.valueOf(getMinTermFreq()); if(this.getDoNotOperateOnPerClassBasis()) options[current++] = "-O"; while (current < options.length) { options[current++] = ""; } return options; } /** * Default constructor. Targets 1000 words in the output. */ public StringToWordVector() { } /** * Constructor that allows specification of the target number of words * in the output. * * @param wordsToKeep the number of words in the output vector (per class * if assigned). */ public StringToWordVector(int wordsToKeep) { m_WordsToKeep = wordsToKeep; } /** * Used to store word counts for dictionary selection based on * a threshold. 
*/ private class Count implements Serializable { /** for serialization */ static final long serialVersionUID = 2157223818584474321L; /** the counts */ public int count, docCount; /** * the constructor * * @param c the count */ public Count(int c) { count = c; } } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input * instance structure (any instances contained in the object are * ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the input format can't be set * successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); avgDocLength = -1; numInstances = -1; return false; } /** * Input an instance for filtering. Filter requires all * training instances be read before producing output. * * @param instance the input instance. * @return true if the filtered instance may now be * collected with output(). * @throws IllegalStateException if no input structure has been defined. 
*/ public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (isFirstBatchDone()) { FastVector fv = new FastVector(); int firstCopy = convertInstancewoDocNorm(instance, fv); Instance inst = (Instance)fv.elementAt(0); if (m_filterType != FILTER_NONE) { normalizeInstance(inst, firstCopy); } push(inst); return true; } else { bufferInput(instance); return false; } } /** * Signify that this batch of input to the filter is finished. * If the filter requires all instances prior to filtering, * output() may now be called to retrieve the filtered instances. * * @return true if there are instances pending output. * @throws IllegalStateException if no input structure has been defined. */ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } // We only need to do something in this method // if the first batch hasn't been processed. Otherwise // input() has already done all the work. if (!isFirstBatchDone()) { // Determine the dictionary from the first batch (training data) determineDictionary(); // Convert all instances w/o normalization FastVector fv = new FastVector(); int firstCopy=0; for(int i=0; i < numInstances; i++) { firstCopy = convertInstancewoDocNorm(getInputFormat().instance(i), fv); } // Need to compute average document length if necessary if (m_filterType != FILTER_NONE) { avgDocLength = 0; for(int i=0; i<fv.size(); i++) { Instance inst = (Instance) fv.elementAt(i); double docLength = 0; for(int j=0; j<inst.numValues(); j++) { if(inst.index(j)>=firstCopy) { docLength += inst.valueSparse(j) * inst.valueSparse(j); } } avgDocLength += Math.sqrt(docLength); } avgDocLength /= numInstances; } // Perform normalization if necessary. 
if (m_filterType == FILTER_NORMALIZE_ALL) { for(int i=0; i<fv.size(); i++) { normalizeInstance((Instance) fv.elementAt(i), firstCopy); } } // Push all instances into the output queue for(int i=0; i<fv.size(); i++) { push((Instance) fv.elementAt(i)); } } // Flush the input flushInput(); m_NewBatch = true; m_FirstBatchDone = true; return (numPendingOutput() != 0); } /** * Returns a string describing this filter * @return a description of the filter suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Converts String attributes into a set of attributes representing "+ "word occurrence information from the text contained in the "+ "strings. The set of words (attributes) is determined by the first "+ "batch filtered (typically training data)."; } /** * Gets whether output instances contain 0 or 1 indicating word * presence, or word counts. * * @return true if word counts should be output. */ public boolean getOutputWordCounts() { return m_OutputCounts; } /** * Sets whether output instances contain 0 or 1 indicating word * presence, or word counts. * * @param outputWordCounts true if word counts should be output. */ public void setOutputWordCounts(boolean outputWordCounts) { m_OutputCounts = outputWordCounts; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String outputWordCountsTipText() { return "Output word counts rather than boolean 0 or 1"+ "(indicating presence or absence of a word)."; } /** * Get the value of delimiters. * * @return Value of delimiters. */ public String getDelimiters() { return delimiters.replaceAll("\"", "\\\\\"").replaceAll("'", "\\\\'"); } /** * Set the value of delimiters. * * @param newDelimiters Value to assign to delimiters. 
*/
public void setDelimiters(String newDelimiters) {
  // Undo the backslash escaping of quotes: first \" then \'.
  String withoutEscapedDoubleQuotes = newDelimiters.replaceAll("\\\\\"", "\"");
  delimiters = withoutEscapedDoubleQuotes.replaceAll("\\\\'", "'");
}

/**
 * Returns the tip text for the delimiters property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String delimitersTipText() {
  String tip =
    "Set of delimiter characters to use in tokenizing "+
    "(default: \" \\n\\t.,:'\\\"()?!\"). "+
    "This option is ignored if onlyAlphabeticTokens option is set to"+
    " true.";
  return tip;
}

/**
 * Get the currently selected range.
 *
 * @return Value of m_SelectedRange.
 */
public Range getSelectedRange() {
  return this.m_SelectedRange;
}

/**
 * Replace the selected range with a new Range built from the given
 * string.
 *
 * @param newSelectedRange range string used to construct the new
 * value of m_SelectedRange.
 */
public void setSelectedRange(String newSelectedRange) {
  Range parsedRange = new Range(newSelectedRange);
  m_SelectedRange = parsedRange;
}

/**
 * Get the attribute name prefix.
 *
 * @return The current attribute name prefix.
 */
public String getAttributeNamePrefix() {
  return this.m_Prefix;
}

/**
 * Set the attribute name prefix.
 *
 * @param newPrefix String to use as the attribute name prefix.
 */
public void setAttributeNamePrefix(String newPrefix) {
  this.m_Prefix = newPrefix;
}

/**
 * Returns the tip text for the attribute name prefix property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String attributeNamePrefixTipText() {
  String tip =
    "Prefix for the created attribute names. "+
    "(default: \"\")";
  return tip;
}

/**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -