📄 stringtowordvector.java
字号:
throws Exception {
super.setInputFormat(instanceInfo);
m_FirstBatchDone = false;
return false;
}
/**
 * Feeds one instance to the filter.
 * <p>
 * While the first batch is still open the instance is only buffered and no
 * output is produced. After the first batch has been finished every
 * incoming instance is converted immediately.
 *
 * @param instance the input instance.
 * @return true if the filtered instance may now be collected with output();
 *         false while the instance has only been buffered.
 * @exception IllegalStateException if no input structure has been defined.
 */
public boolean input(Instance instance) throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  // First instance of a new batch: reset the queue, then clear the flag.
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }

  // First batch still open: just buffer the instance.
  if (!m_FirstBatchDone) {
    bufferInput(instance);
    return false;
  }

  convertInstance(instance);
  return true;
}
/**
 * Signify that this batch of input to the filter is finished.
 * If the filter requires all instances prior to filtering,
 * output() may now be called to retrieve the filtered instances.
 * <p>
 * On the first batch the dictionary is determined before any conversion.
 * When document-length normalization is enabled, the first batch is
 * converted without normalization, each document's word values are divided
 * by that document's Euclidean length, and the values are then multiplied
 * by the average document length of the batch. In every other case the
 * buffered instances are simply passed through convertInstance().
 *
 * @return true if there are instances pending output.
 * @exception IllegalStateException if no input structure has been defined.
 */
public boolean batchFinished() throws Exception {
  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  // Determine the dictionary
  if (!m_FirstBatchDone) {
    determineDictionary();
  }

  // Convert pending input instances.
  if (!m_normalizeDocLength || m_FirstBatchDone) {
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      convertInstance(getInputFormat().instance(i));
    }
    flushInput();
  } else {
    FastVector fv = new FastVector();
    int firstCopy = 0;
    Instances inputFormat = getInputFormat();
    avgDocLength = 0;

    // Convert every buffered instance into fv without normalization.
    // firstCopy is the threshold used below to select the sparse entries
    // that take part in normalization (index >= firstCopy).
    for (int i = 0; i < inputFormat.numInstances(); i++) {
      firstCopy = convertInstancewoDocNorm(inputFormat.instance(i), fv);
    }

    // Pass 1: scale each document to unit Euclidean length and accumulate
    // the lengths so the average can be computed afterwards.
    for (int i = 0; i < fv.size(); i++) {
      Instance inst = (Instance) fv.elementAt(i);

      double docLength = 0;
      for (int j = 0; j < inst.numValues(); j++) {
        if (inst.index(j) >= firstCopy) {
          double val = inst.valueSparse(j);
          docLength += val * val;
        }
      }
      docLength = Math.sqrt(docLength);
      avgDocLength += docLength;

      for (int j = 0; j < inst.numValues(); j++) {
        // Read the attribute index BEFORE writing the value: the j-- below
        // implies that storing a zero removes entry j (NOTE(review): confirm
        // against the Instance implementation in use), after which
        // inst.index(j) would name the next entry or be out of range.
        int attIndex = inst.index(j);
        if (attIndex >= firstCopy) {
          double val = inst.valueSparse(j) / docLength;
          inst.setValueSparse(j, val);
          if (val == 0) {
            System.err.println("setting value " + attIndex + " to zero.");
            // Re-examine slot j on the next iteration.
            j--;
          }
        }
      }
    }

    avgDocLength /= inputFormat.numInstances();

    // Pass 2: rescale the unit-length documents by the average length and
    // hand each finished instance to the output queue.
    for (int i = 0; i < fv.size(); i++) {
      Instance inst = (Instance) fv.elementAt(i);
      for (int j = 0; j < inst.numValues(); j++) {
        // Same ordering concern as in pass 1: capture the index first.
        int attIndex = inst.index(j);
        if (attIndex >= firstCopy) {
          double val = inst.valueSparse(j) * avgDocLength;
          inst.setValueSparse(j, val);
          if (val == 0) {
            System.err.println("setting value " + attIndex + " to zero.");
            // Re-examine slot j on the next iteration.
            j--;
          }
        }
      }
      push(inst);
    }
    flushInput();
  }

  m_NewBatch = true;
  m_FirstBatchDone = true;
  return (numPendingOutput() != 0);
}
/**
 * Provides a short description of this filter.
 *
 * @return a description of the filter suitable for
 *         displaying in the explorer/experimenter gui
 */
public String globalInfo() {
  final String description =
      "Converts String attributes into a set of attributes representing word "
      + "occurrence information from the text contained in the strings. "
      + "The set of words (attributes) is determined by the first batch "
      + "filtered (typically training data).";
  return description;
}
/**
 * Reports whether output instances carry word counts instead of a
 * 0/1 word-presence indicator.
 *
 * @return true if word counts should be output.
 */
public boolean getOutputWordCounts() {
  return this.m_OutputCounts;
}
/**
 * Chooses between word counts and a 0/1 word-presence indicator in the
 * output instances.
 *
 * @param outputWordCounts true if word counts should be output.
 */
public void setOutputWordCounts(boolean outputWordCounts) {
  this.m_OutputCounts = outputWordCounts;
}
/**
 * Returns the tip text for the outputWordCounts property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String outputWordCountsTipText() {
  // The space before the parenthesis was missing, so the tip used to read
  // "0 or 1(indicating ...".
  return "Output word counts rather than boolean 0 or 1 "
      + "(indicating presence or absence of a word).";
}
/**
 * Returns the current delimiters string.
 *
 * @return Value of delimiters.
 */
public String getDelimiters() {
  return this.delimiters;
}
/**
 * Replaces the delimiters string.
 *
 * @param newDelimiters Value to assign to delimiters.
 */
public void setDelimiters(String newDelimiters) {
  this.delimiters = newDelimiters;
}
/**
 * Returns the tip text for the delimiters property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String delimitersTipText() {
  final String tip =
      "Set of delimiter characters to use in tokenizing (default: \" \\n\\t.,:'\\\"()?!\"). "
      + "This option is ignored if onlyAlphabeticTokens option is set to true.";
  return tip;
}
/**
 * Returns the currently stored selected range.
 *
 * @return Value of m_SelectedRange.
 */
public Range getSelectedRange() {
  return this.m_SelectedRange;
}
/**
 * Builds a new Range from the given string and stores it as the
 * selected range.
 *
 * @param newSelectedRange string from which the Range assigned to
 *        m_SelectedRange is constructed.
 */
public void setSelectedRange(String newSelectedRange) {
  final Range range = new Range(newSelectedRange);
  this.m_SelectedRange = range;
}
/**
 * Returns the attribute name prefix.
 *
 * @return The current attribute name prefix.
 */
public String getAttributeNamePrefix() {
  return this.m_Prefix;
}
/**
 * Replaces the attribute name prefix.
 *
 * @param newPrefix String to use as the attribute name prefix.
 */
public void setAttributeNamePrefix(String newPrefix) {
  this.m_Prefix = newPrefix;
}
/**
 * Returns the tip text for the attributeNamePrefix property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String attributeNamePrefixTipText() {
  final String tip = "Prefix for the created attribute names. (default: \"\")";
  return tip;
}
/**
 * Returns the number of words (per class if there is a class attribute
 * assigned) the filter attempts to keep.
 *
 * @return the target number of words in the output vector (per class if
 *         assigned).
 */
public int getWordsToKeep() {
  return this.m_WordsToKeep;
}
/**
 * Stores the number of words (per class if there is a class attribute
 * assigned) the filter should attempt to keep.
 *
 * @param newWordsToKeep the target number of words in the output
 *        vector (per class if assigned).
 */
public void setWordsToKeep(int newWordsToKeep) {
  this.m_WordsToKeep = newWordsToKeep;
}
/**
 * Returns the tip text for the wordsToKeep property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String wordsToKeepTipText() {
  final String tip =
      "The number of words (per class if there is a class attribute assigned) "
      + "to attempt to keep.";
  return tip;
}
/**
 * Reports whether word frequencies are to be transformed into
 * log(1+fij), where fij is the frequency of word i in document
 * (instance) j.
 *
 * @return true if word frequencies are to be transformed.
 */
public boolean getTFTransform() {
  return m_TFTransform;
}
/**
 * Chooses whether word frequencies are to be transformed into
 * log(1+fij), where fij is the frequency of word i in document
 * (instance) j.
 *
 * @param TFTransform true if word frequencies are to be transformed.
 */
public void setTFTransform(boolean TFTransform) {
  m_TFTransform = TFTransform;
}
/**
 * Returns the tip text for the TFTransform property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String TFTransformTipText() {
  final String tip =
      "Sets whether if the word frequencies should be transformed into:\n"
      + "  log(1+fij) \n"
      + " where fij is the frequency of word i in document (instance) j.";
  return tip;
}
/**
 * Reports whether the word frequencies in a document are to be
 * transformed into: <br>
 * fij*log(num of Docs/num of Docs with word i) <br>
 * where fij is the frequency of word i in document (instance) j.
 *
 * @return true if the word frequencies are to be transformed.
 */
public boolean getIDFTransform() {
  return m_IDFTransform;
}
/**
 * Chooses whether the word frequencies in a document are to be
 * transformed into: <br>
 * fij*log(num of Docs/num of Docs with word i) <br>
 * where fij is the frequency of word i in document (instance) j.
 *
 * @param IDFTransform true if the word frequencies are to be transformed.
 */
public void setIDFTransform(boolean IDFTransform) {
  m_IDFTransform = IDFTransform;
}
/**
 * Returns the tip text for the IDFTransform property.
 *
 * @return tip text for this property suitable for
 *         displaying in the explorer/experimenter gui
 */
public String IDFTransformTipText() {
  final String tip =
      "Sets whether if the word frequencies in a document should be transformed into: \n"
      + " fij*log(num of Docs/num of Docs with word i) \n"
      + " where fij is the frequency of word i in document (instance) j.";
  return tip;
}
/**
 * Reports whether the word frequencies for a document (instance) are to
 * be normalized.
 *
 * @return true if word frequencies are to be normalized.
 */
public boolean getNormalizeDocLength() {
  return m_normalizeDocLength;
}
/**
 * Chooses whether the word frequencies for a document (instance) are to
 * be normalized.
 *
 * @param normalizeDocLength true if word frequencies are to be normalized.
 */
public void setNormalizeDocLength(boolean normalizeDocLength) {
  m_normalizeDocLength = normalizeDocLength;
}
/**
* Returns the tip text for this property
*
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String normalizeDocLengthTipText() {
return "Sets whether if the word frequencies for a document (instance) "+
"should be normalized or not.";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -