stringtowordvector.java
来自「Weka」· Java 代码 · 共 1,638 行 · 第 1/4 页
JAVA
1,638 行
value = Utils.getOption('W', options); if (value.length() != 0) setWordsToKeep(Integer.valueOf(value).intValue()); else setWordsToKeep(1000); value = Utils.getOption('M', options); if (value.length() != 0) setMinTermFreq(Integer.valueOf(value).intValue()); else setMinTermFreq(1); setOutputWordCounts(Utils.getFlag('C', options)); setTFTransform(Utils.getFlag('T', options)); setIDFTransform(Utils.getFlag('I', options)); setDoNotOperateOnPerClassBasis(Utils.getFlag('O', options)); String nString = Utils.getOption('N', options); if (nString.length() != 0) setNormalizeDocLength(new SelectedTag(Integer.parseInt(nString), TAGS_FILTER)); else setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER)); setLowerCaseTokens(Utils.getFlag('L', options)); setUseStoplist(Utils.getFlag('S', options)); String stemmerString = Utils.getOption("stemmer", options); if (stemmerString.length() == 0) { setStemmer(null); } else { String[] stemmerSpec = Utils.splitOptions(stemmerString); if (stemmerSpec.length == 0) throw new Exception("Invalid stemmer specification string"); String stemmerName = stemmerSpec[0]; stemmerSpec[0] = ""; Stemmer stemmer = (Stemmer) Class.forName(stemmerName).newInstance(); if (stemmer instanceof OptionHandler) ((OptionHandler) stemmer).setOptions(stemmerSpec); setStemmer(stemmer); } value = Utils.getOption("stopwords", options); if (value.length() != 0) setStopwords(new File(value)); else setStopwords(null); String tokenizerString = Utils.getOption("tokenizer", options); if (tokenizerString.length() == 0) { setTokenizer(new WordTokenizer()); } else { String[] tokenizerSpec = Utils.splitOptions(tokenizerString); if (tokenizerSpec.length == 0) throw new Exception("Invalid tokenizer specification string"); String tokenizerName = tokenizerSpec[0]; tokenizerSpec[0] = ""; Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerName).newInstance(); if (tokenizer instanceof OptionHandler) ((OptionHandler) tokenizer).setOptions(tokenizerSpec); 
setTokenizer(tokenizer); } } /** * Gets the current settings of the filter. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; result = new Vector(); result.add("-R"); result.add(getSelectedRange().getRanges()); if (getInvertSelection()) result.add("-V"); if (!"".equals(getAttributeNamePrefix())) { result.add("-P"); result.add(getAttributeNamePrefix()); } result.add("-W"); result.add(String.valueOf(getWordsToKeep())); if (getOutputWordCounts()) result.add("-C"); if (getTFTransform()) result.add("-T"); if (getIDFTransform()) result.add("-I"); result.add("-N"); result.add("" + m_filterType); if (getLowerCaseTokens()) result.add("-L"); if (getUseStoplist()) result.add("-S"); if (getStemmer() != null) { result.add("-stemmer"); String spec = getStemmer().getClass().getName(); if (getStemmer() instanceof OptionHandler) spec += " " + Utils.joinOptions( ((OptionHandler) getStemmer()).getOptions()); result.add(spec.trim()); } result.add("-M"); result.add(String.valueOf(getMinTermFreq())); if (getDoNotOperateOnPerClassBasis()) result.add("-O"); if (!getStopwords().isDirectory()) { result.add("-stopwords"); result.add(getStopwords().getAbsolutePath()); } result.add("-tokenizer"); String spec = getTokenizer().getClass().getName(); if (getTokenizer() instanceof OptionHandler) spec += " " + Utils.joinOptions( ((OptionHandler) getTokenizer()).getOptions()); result.add(spec.trim()); return (String[]) result.toArray(new String[result.size()]); } /** * Constructor that allows specification of the target number of words * in the output. * * @param wordsToKeep the number of words in the output vector (per class * if assigned). */ public StringToWordVector(int wordsToKeep) { m_WordsToKeep = wordsToKeep; } /** * Used to store word counts for dictionary selection based on * a threshold. 
*/ private class Count implements Serializable { /** for serialization */ static final long serialVersionUID = 2157223818584474321L; /** the counts */ public int count, docCount; /** * the constructor * * @param c the count */ public Count(int c) { count = c; } } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input * instance structure (any instances contained in the object are * ignored - only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the input format can't be set * successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_SelectedRange.setUpper(instanceInfo.numAttributes() - 1); m_AvgDocLength = -1; m_NumInstances = -1; return false; } /** * Input an instance for filtering. Filter requires all * training instances be read before producing output. * * @param instance the input instance. * @return true if the filtered instance may now be * collected with output(). * @throws IllegalStateException if no input structure has been defined. 
*/ public boolean input(Instance instance) throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (isFirstBatchDone()) { FastVector fv = new FastVector(); int firstCopy = convertInstancewoDocNorm(instance, fv); Instance inst = (Instance)fv.elementAt(0); if (m_filterType != FILTER_NONE) { normalizeInstance(inst, firstCopy); } push(inst); return true; } else { bufferInput(instance); return false; } } /** * Signify that this batch of input to the filter is finished. * If the filter requires all instances prior to filtering, * output() may now be called to retrieve the filtered instances. * * @return true if there are instances pending output. * @throws IllegalStateException if no input structure has been defined. */ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } // We only need to do something in this method // if the first batch hasn't been processed. Otherwise // input() has already done all the work. if (!isFirstBatchDone()) { // Determine the dictionary from the first batch (training data) determineDictionary(); // Convert all instances w/o normalization FastVector fv = new FastVector(); int firstCopy=0; for(int i=0; i < m_NumInstances; i++) { firstCopy = convertInstancewoDocNorm(getInputFormat().instance(i), fv); } // Need to compute average document length if necessary if (m_filterType != FILTER_NONE) { m_AvgDocLength = 0; for(int i=0; i<fv.size(); i++) { Instance inst = (Instance) fv.elementAt(i); double docLength = 0; for(int j=0; j<inst.numValues(); j++) { if(inst.index(j)>=firstCopy) { docLength += inst.valueSparse(j) * inst.valueSparse(j); } } m_AvgDocLength += Math.sqrt(docLength); } m_AvgDocLength /= m_NumInstances; } // Perform normalization if necessary. 
if (m_filterType == FILTER_NORMALIZE_ALL) { for(int i=0; i<fv.size(); i++) { normalizeInstance((Instance) fv.elementAt(i), firstCopy); } } // Push all instances into the output queue for(int i=0; i<fv.size(); i++) { push((Instance) fv.elementAt(i)); } } // Flush the input flushInput(); m_NewBatch = true; m_FirstBatchDone = true; return (numPendingOutput() != 0); } /** * Returns a string describing this filter * @return a description of the filter suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Converts String attributes into a set of attributes representing " + "word occurrence (depending on the tokenizer) information from the " + "text contained in the strings. The set of words (attributes) is " + "determined by the first batch filtered (typically training data)."; } /** * Gets whether output instances contain 0 or 1 indicating word * presence, or word counts. * * @return true if word counts should be output. */ public boolean getOutputWordCounts() { return m_OutputCounts; } /** * Sets whether output instances contain 0 or 1 indicating word * presence, or word counts. * * @param outputWordCounts true if word counts should be output. */ public void setOutputWordCounts(boolean outputWordCounts) { m_OutputCounts = outputWordCounts; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String outputWordCountsTipText() { return "Output word counts rather than boolean 0 or 1"+ "(indicating presence or absence of a word)."; } /** * Get the value of m_SelectedRange. * * @return Value of m_SelectedRange. */ public Range getSelectedRange() { return m_SelectedRange; } /** * Set the value of m_SelectedRange. * * @param newSelectedRange Value to assign to m_SelectedRange. 
*/
  public void setSelectedRange(String newSelectedRange) {
    Range replacement = new Range(newSelectedRange);
    this.m_SelectedRange = replacement;
  }

  /**
   * Provides the help text shown for the attribute indices property.
   *
   * @return the tip text, suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    String tip =
        "Specify range of attributes to act on."
      + " This is a comma separated list of attribute indices, with"
      + " \"first\" and \"last\" valid values. Specify an inclusive"
      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
    return tip;
  }

  /**
   * Reports the currently selected attribute range.
   *
   * @return the ranges of the current selection, as returned by the
   *         underlying Range object
   */
  public String getAttributeIndices() {
    String ranges = this.m_SelectedRange.getRanges();
    return ranges;
  }

  /**
   * Changes which attributes the filter works on by updating the ranges of
   * the existing selection.
   *
   * @param rangeList a string representing the list of attributes. Since
   * the string will typically come from a user, attributes are indexed from
   * 1. <br>
   * eg: first-3,5,6-last
   * @throws IllegalArgumentException if an invalid range list is supplied
   */
  public void setAttributeIndices(String rangeList) {
    this.m_SelectedRange.setRanges(rangeList);
  }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?