📄 stringtowordvectorfilter.java
字号:
/**
*
* AgentAcademy - an open source Data Mining framework for
* training intelligent agents
*
* Copyright (C) 2001-2003 AA Consortium.
*
* This library is open source software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General
* Public License as published by the Free Software Foundation;
* either version 2.0 of the License, or (at your option) any later
* version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*/
package org.agentacademy.modules.dataminer.filters;
/**
* <p>Title: The Data Miner prototype</p>
* <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
* <p>Copyright: Copyright (c) 2002</p>
* <p>Company: CERTH</p>
* @author asymeon
* @version 0.3
*/
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import org.agentacademy.modules.dataminer.core.Attribute;
import org.agentacademy.modules.dataminer.core.FastVector;
import org.agentacademy.modules.dataminer.core.Instance;
import org.agentacademy.modules.dataminer.core.Instances;
import org.agentacademy.modules.dataminer.core.Option;
import org.agentacademy.modules.dataminer.core.OptionHandler;
import org.agentacademy.modules.dataminer.core.Range;
import org.agentacademy.modules.dataminer.core.SparseInstance;
import org.agentacademy.modules.dataminer.core.Utils;
import org.apache.log4j.Logger;
/**
* Converts String attributes into a set of attributes representing word
* occurrence information from the text contained in the strings. The set of
* words (attributes) is determined by the first batch filtered (typically
* training data).
*
* @version 1.8-gwp-$Revision: 1.2 $
**/
public class StringToWordVectorFilter extends Filter implements OptionHandler {
/** log4j logger obtained for this filter class. */
public static Logger log = Logger.getLogger(StringToWordVectorFilter.class);
/**
 * Delimiters used in tokenization. Replaced through setDelimiters(),
 * which setOptions() invokes for the -D option.
 */
private String delimiters = " \n\t.,:'\"()?!";
/**
 * Range of columns to convert to word vectors. Stays null until
 * setSelectedRange() is called; getOptions() omits -R while it is null.
 */
protected Range m_SelectedRange = null;
/** Contains a mapping of valid words to attribute indexes */
private TreeMap m_Dictionary = new TreeMap();
/**
 * True if the first batch has been done. Set at the end of
 * batchFinished() and cleared again by setInputFormat().
 */
private boolean m_FirstBatchDone = false;
/**
 * The default number of words (per class if there is a class attribute
 * assigned) to attempt to keep.
 */
private int m_WordsToKeep = 1000;
/**
 * Lists the command-line options understood by this filter: -D
 * (delimiter set), -R (string attributes to convert) and -w (number of
 * words to keep).
 *
 * @return an enumeration over one Option object per supported switch
 */
public Enumeration listOptions() {
  Option delimiterOption = new Option(
      "\tString containing the set of delimiter characters\n"
      + "\t(default: \" \\n\\t.,:'\\\"()?!\")",
      "D", 1, "-D <delimiter set>");

  Option rangeOption = new Option(
      "\tSpecify list of string attributes to convert to words (as weka Range).\n"
      + "\t(default: select all string attributes)",
      "R", 1, "-R <index1,index2-index4,...>");

  Option wordCountOption = new Option(
      "\tSpecify approximate number of word fields to create.\n"
      + "\tSurplus words will be discarded..\n"
      + "\t(default: 1000)",
      "w", 1, "-w <number of words to keep>");

  // Keep the D, R, w order so the listing reads the same as before.
  Vector optionList = new Vector(3);
  optionList.addElement(delimiterOption);
  optionList.addElement(rangeOption);
  optionList.addElement(wordCountOption);
  return optionList.elements();
}
/**
 * Applies the settings contained in a list of options.
 * Recognised options are:<p>
 *
 * -D &lt;delimiter characters&gt;<br>
 * Specify set of delimiter characters
 * (default: " \n\t.,:'\"()?!")<p>
 *
 * -R index1,index2-index4,...<br>
 * Specify list of string attributes to convert to words.
 * (default: all string attributes)<p>
 *
 * -w &lt;number of words to keep&gt;<br>
 * Specify number of word fields to create.
 * Other, less useful words will be discarded.
 * (default: 1000)<p>
 *
 * An option for which Utils.getOption() yields an empty string leaves the
 * corresponding setting untouched.
 *
 * @param options the list of options as an array of strings
 * @exception Exception if an option cannot be processed; a -w value that
 * is not an integer raises a NumberFormatException
 */
public void setOptions(String[] options) throws Exception {
  String delimiterSpec = Utils.getOption('D', options);
  if (delimiterSpec.length() > 0) {
    setDelimiters(delimiterSpec);
  }

  String rangeSpec = Utils.getOption('R', options);
  if (rangeSpec.length() > 0) {
    setSelectedRange(rangeSpec);
  }

  String wordLimitSpec = Utils.getOption('w', options);
  if (wordLimitSpec.length() > 0) {
    setWordsToKeep(Integer.parseInt(wordLimitSpec));
  }
}
/**
 * Gets the current settings of the filter.
 *
 * The result always has ten entries: the -D and -w settings, the -R
 * setting when a range has been selected, and empty strings as padding.
 * When a range is selected and an input format is available, the range's
 * upper limit is updated to the last attribute index before it is
 * rendered. This method may be called before setInputFormat(); in that
 * case the range is reported without touching its upper limit.
 *
 * @return an array of strings suitable for passing to setOptions
 */
public String [] getOptions() {
  String [] options = new String [10];
  int current = 0;

  options[current++] = "-D";
  options[current++] = getDelimiters();

  Range selectedRange = getSelectedRange();
  if (selectedRange != null) {
    options[current++] = "-R";
    // getInputFormat() is null until an input format has been set (see
    // the checks in input() and batchFinished()); dereferencing it
    // unconditionally made getOptions() throw a NullPointerException.
    // NOTE(review): confirm Range.getRanges() does not depend on the
    // upper limit having been set.
    if (getInputFormat() != null) {
      selectedRange.setUpper(getInputFormat().numAttributes() - 1);
    }
    options[current++] = selectedRange.getRanges();
  }

  options[current++] = "-w";
  options[current++] = String.valueOf(getWordsToKeep());

  // Pad the unused slots so the array never contains null entries.
  while (current < options.length) {
    options[current++] = "";
  }
  return options;
}
/**
 * Creates a filter with the default settings. The words-to-keep target
 * is left at its field initializer value of 1000.
 */
public StringToWordVectorFilter() {
  super();
}
/**
 * Creates a filter with a caller-supplied target for the number of words
 * in the output.
 *
 * @param wordsToKeep the number of words in the output vector (per class
 * if assigned). The value is stored as given, without validation.
 */
public StringToWordVectorFilter(int wordsToKeep) {
  super();
  this.m_WordsToKeep = wordsToKeep;
}
/**
* Used to store word counts for dictionary selection based on
* a threshold.
*/
private class Count implements Serializable {
public int count;
public Count(int c) { count = c; }
}
/**
 * Sets the format of the input instances and restarts first-batch
 * processing.
 *
 * @param instanceInfo an Instances object containing the input
 * instance structure (any instances contained in the object are
 * ignored - only the structure is required).
 * @return always false: the result of the superclass call is not
 * propagated, so the output format is never reported as immediately
 * collectable
 * @exception Exception if the input format can't be set
 * successfully
 */
public boolean setInputFormat(Instances instanceInfo)
    throws Exception {
  super.setInputFormat(instanceInfo);

  // A new input structure means the next batch is a first batch again.
  this.m_FirstBatchDone = false;

  return false;
}
/**
 * Input an instance for filtering. Until the first batch has been
 * finished, instances are only buffered; afterwards each instance is
 * converted as soon as it arrives.
 *
 * @param instance the input instance.
 * @return true if the filtered instance may now be collected with
 * output(); false while the instance has merely been buffered.
 * @exception IllegalStateException if no input structure has been defined.
 */
public boolean input(Instance instance) {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  // The first instance of a new batch clears the output queue.
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }

  // First batch still in progress: keep the instance for batchFinished().
  if (!m_FirstBatchDone) {
    bufferInput(instance);
    return false;
  }

  convertInstance(instance);
  return true;
}
/**
 * Signify that this batch of input to the filter is finished. On the
 * first batch the dictionary is determined before the instances held in
 * the input format are converted; output() may then be called to
 * retrieve the filtered instances.
 *
 * @return true if there are instances pending output.
 * @exception IllegalStateException if no input structure has been defined.
 * @exception Exception declared by this method; NOTE(review): which of the
 * called helpers can raise it is not visible here - confirm.
 */
public boolean batchFinished() throws Exception{
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  // The dictionary is only built once, from the first batch.
  if (!m_FirstBatchDone) {
    determineDictionary();
  }

  // Convert every instance currently held in the input format. The bound
  // is re-read on each pass, exactly as a for-loop condition would be.
  int index = 0;
  while (index < getInputFormat().numInstances()) {
    convertInstance(getInputFormat().instance(index));
    index++;
  }

  flushInput();
  m_NewBatch = true;
  m_FirstBatchDone = true;

  boolean outputPending = numPendingOutput() != 0;
  return outputPending;
}
/**
 * Returns the delimiter characters currently used for tokenization.
 *
 * @return the delimiter set as a string.
 */
public String getDelimiters() {
  return this.delimiters;
}
/**
 * Replaces the delimiter characters used for tokenization.
 *
 * @param newDelimiters the delimiter set to use from now on; stored as
 * given, without validation.
 */
public void setDelimiters(String newDelimiters) {
  this.delimiters = newDelimiters;
}
/**
 * Returns the range of attributes selected for conversion.
 *
 * @return the selected range, or null if none has been set.
 */
public Range getSelectedRange() {
  return this.m_SelectedRange;
}
/**
 * Selects the attributes to convert by building a new Range from its
 * textual form.
 *
 * @param newSelectedRange the range specification handed to the Range
 * constructor, e.g. the value given to the -R option.
 */
public void setSelectedRange(String newSelectedRange) {
  Range parsedRange = new Range(newSelectedRange);
  this.m_SelectedRange = parsedRange;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -