📄 stringtowordvector.java
字号:
}
/**
 * Returns whether tokens are built solely from contiguous runs of
 * alphabetic characters. When this is enabled the delimiter string
 * is ignored during tokenization.
 *
 * @return true if only alphabetic token formation is enabled.
 */
public boolean getOnlyAlphabeticTokens() {
  return this.m_onlyAlphabeticTokens;
}
/** Sets whether if tokens are to be formed only from contiguous alphabetic
 * character sequences. The delimiter string is ignored if this option is
 * set to true.
 *
 * @param tokenizeOnlyAlphabeticSequences should be set to true if only
 * alphabetic tokens should be formed.
 */
public void setOnlyAlphabeticTokens(boolean tokenizeOnlyAlphabeticSequences) {
m_onlyAlphabeticTokens = tokenizeOnlyAlphabeticSequences;
}
/**
 * Returns the tip text for the onlyAlphabeticTokens property, for
 * display in the explorer/experimenter GUI.
 *
 * @return the tip text describing this property.
 */
public String onlyAlphabeticTokensTipText() {
  final String tip =
      "Sets whether if the word tokens are to be formed only from "
      + "contiguous alphabetic sequences (The delimiter string is "
      + "ignored if this option is set to true).";
  return tip;
}
/**
 * Returns whether word tokens are converted to lower case before being
 * added to the dictionary.
 *
 * @return true if tokens are downcased.
 */
public boolean getLowerCaseTokens() {
  return m_lowerCaseTokens;
}
/**
 * Sets whether word tokens are converted to lower case. Non-alphabetic
 * characters within a token are unaffected.
 *
 * @param downCaseTokens true if only lower case tokens should be formed.
 */
public void setLowerCaseTokens(boolean downCaseTokens) {
  m_lowerCaseTokens = downCaseTokens;
}
/**
 * Returns the tip text for the lowerCaseTokens property, for display in
 * the explorer/experimenter GUI.
 *
 * @return the tip text describing this property.
 */
public String lowerCaseTokensTipText() {
  final String tip =
      "If set then all the word tokens are converted to lower case "
      + "before being added to the dictionary.";
  return tip;
}
/**
 * Returns whether words appearing on the stoplist are ignored
 * (the stoplist lives in weka.core.Stopwords).
 *
 * @return true if stoplist words are ignored.
 */
public boolean getUseStoplist() {
  return this.m_useStoplist;
}
/**
 * Sets whether words appearing on the stoplist are ignored
 * (the stoplist lives in weka.core.Stopwords).
 *
 * @param useStoplist true if stoplist words should be ignored.
 */
public void setUseStoplist(boolean useStoplist) {
  this.m_useStoplist = useStoplist;
}
/**
 * Returns the tip text for the useStoplist property, for display in the
 * explorer/experimenter GUI.
 *
 * @return the tip text describing this property.
 */
public String useStoplistTipText() {
  final String tip =
      "Ignores all the words that are on the stoplist, if set to true.";
  return tip;
}
/**
 * Sorts the given array in ascending order, in place, using shellsort
 * with the 3h+1 increment sequence.
 *
 * BUGFIX: the previous version was a direct translation of a 1-indexed
 * shellsort (inner loop started at i = h + 1 with guard j > h), which
 * never read or wrote array[0] — the first element was excluded from the
 * sort entirely. That could yield a wrong pruning threshold in
 * determineDictionary() when array.length == m_WordsToKeep. The loop
 * bounds below (i starts at h, guard j >= h) cover index 0.
 *
 * @param array the array to sort; modified in place.
 */
private static void sortArray(int [] array) {
  int i, j, h, N = array.length - 1;
  // Find the largest increment in the sequence 1, 4, 13, 40, ... <= N/9.
  for (h = 1; h <= N / 9; h = 3 * h + 1);
  for (; h > 0; h /= 3) {
    // h-sorted insertion pass over indices 0..N.
    for (i = h; i <= N; i++) {
      int v = array[i];
      j = i;
      while (j >= h && array[j - h] > v) {
        array[j] = array[j - h];
        j -= h;
      }
      array[j] = v;
    }
  }
}
/**
 * Determines the set of attributes to convert. If no range has been
 * selected yet, defaults to all string attributes; in either case the
 * final selection is restricted to string attributes only, so the user
 * cannot convert non-string fields.
 */
private void determineSelectedRange() {
  Instances inputFormat = getInputFormat();
  // No explicit selection: default to every string attribute.
  if (m_SelectedRange == null) {
    StringBuffer defaultFields = new StringBuffer();
    for (int i = 0; i < inputFormat.numAttributes(); i++) {
      if (inputFormat.attribute(i).type() == Attribute.STRING) {
        defaultFields.append((i + 1) + ",");
      }
    }
    m_SelectedRange = new Range(defaultFields.toString());
  }
  m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
  // Intersect the selection with the string attributes.
  StringBuffer stringFields = new StringBuffer();
  for (int i = 0; i < inputFormat.numAttributes(); i++) {
    if (m_SelectedRange.isInRange(i)
        && inputFormat.attribute(i).type() == Attribute.STRING) {
      stringFields.append((i + 1) + ",");
    }
  }
  m_SelectedRange.setRanges(stringFields.toString());
  m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
}
/**
 * Builds one word dictionary per class value from the selected string
 * attributes of the training data, prunes each to (roughly) the
 * m_WordsToKeep most frequent words, creates one output attribute per
 * surviving word, records per-word document counts, and sets the
 * filter's output format.
 *
 * BUGFIX: the per-document word set previously used
 * Hashtable.contains(word), which searches *values* (here Integer
 * counters), not keys — the test was always false and every token
 * re-inserted its key. Replaced with containsKey(word), the intended
 * check. The resulting key set (and hence docCount) is unchanged.
 */
private void determineDictionary() {
  // One dictionary per class value; a single one if there is no class.
  int classInd = getInputFormat().classIndex();
  int values = 1;
  if (classInd != -1) {
    values = getInputFormat().attribute(classInd).numValues();
  }
  TreeMap [] dictionaryArr = new TreeMap[values];
  for (int i = 0; i < values; i++) {
    dictionaryArr[i] = new TreeMap();
  }
  // Make sure we know which fields to convert.
  determineSelectedRange();
  // Tokenize all training text into an ordered map of "words".
  for (int i = 0; i < getInputFormat().numInstances(); i++) {
    Instance instance = getInputFormat().instance(i);
    int vInd = 0;
    if (classInd != -1) {
      vInd = (int)instance.classValue();
    }
    // Words seen in this instance (document), for document counts.
    Hashtable h = new Hashtable();
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
        // Choose the tokenizer according to the alphabetic-only option.
        Enumeration st;
        if(this.m_onlyAlphabeticTokens==false)
          st = new StringTokenizer(instance.stringValue(j),
                                   delimiters);
        else
          st = new AlphabeticStringTokenizer(instance.stringValue(j));
        while (st.hasMoreElements()) {
          String word = ((String)st.nextElement()).intern();
          if(this.m_lowerCaseTokens==true)
            word = word.toLowerCase();
          if(this.m_useStoplist==true)
            if(weka.core.Stopwords.isStopword(word))
              continue;
          // Record that this document contains the word (key test, not
          // the value search that Hashtable.contains() performs).
          if(!(h.containsKey(word)))
            h.put(word, new Integer(0));
          // Bump the word's term frequency for this class's dictionary.
          Count count = (Count)dictionaryArr[vInd].get(word);
          if (count == null) {
            dictionaryArr[vInd].put(word, new Count(1));
          } else {
            count.count ++;
          }
        }
      }
    }
    // Update docCount for each word that occurred in this document.
    Enumeration e = h.keys();
    while(e.hasMoreElements()) {
      String word = (String) e.nextElement();
      Count c = (Count)dictionaryArr[vInd].get(word);
      if(c!=null) {
        c.docCount++;
      }
      else
        System.err.println("Warning: A word should definitely be in the "+
                           "dictionary.Please check the code");
    }
  }
  // Compute, per class, the minimum term frequency a word must reach to
  // be kept (so that about m_WordsToKeep words survive).
  int totalsize = 0;
  int prune[] = new int[values];
  for (int z = 0; z < values; z++) {
    totalsize += dictionaryArr[z].size();
    int array[] = new int[dictionaryArr[z].size()];
    int pos = 0;
    Iterator it = dictionaryArr[z].keySet().iterator();
    while (it.hasNext()) {
      String word = (String)it.next();
      Count count = (Count)dictionaryArr[z].get(word);
      array[pos] = count.count;
      pos++;
    }
    // Sort the frequencies to find the keep threshold.
    sortArray(array);
    if (array.length < m_WordsToKeep) {
      // If there aren't enough words, set the threshold to 1.
      prune[z] = 1;
    } else {
      // Otherwise set it to be at least 1.
      prune[z] = Math.max(1, array[array.length - m_WordsToKeep]);
    }
  }
  // Convert the dictionary into an attribute index and create one
  // attribute per word.
  FastVector attributes = new FastVector(totalsize +
                                         getInputFormat().numAttributes());
  // Add the non-converted attributes, remembering the new class index.
  int classIndex = -1;
  for (int i = 0; i < getInputFormat().numAttributes(); i++) {
    if (!m_SelectedRange.isInRange(i)) {
      if (getInputFormat().classIndex() == i) {
        classIndex = attributes.size();
      }
      attributes.addElement(getInputFormat().attribute(i).copy());
    }
  }
  // Add the word vector attributes: each word kept in any class gets one
  // attribute (shared across classes).
  TreeMap newDictionary = new TreeMap();
  int index = attributes.size();
  for(int z = 0; z < values; z++) {
    Iterator it = dictionaryArr[z].keySet().iterator();
    while (it.hasNext()) {
      String word = (String)it.next();
      Count count = (Count)dictionaryArr[z].get(word);
      if (count.count >= prune[z]) {
        if(newDictionary.get(word) == null) {
          newDictionary.put(word, new Integer(index++));
          attributes.addElement(new Attribute(m_Prefix + word));
        }
      }
    }
  }
  // Aggregate document counts over all classes, indexed by attribute.
  docsCounts = new int[attributes.size()];
  Iterator it = newDictionary.keySet().iterator();
  while(it.hasNext()) {
    String word = (String) it.next();
    int idx = ((Integer)newDictionary.get(word)).intValue();
    int docsCount=0;
    for(int j=0; j<values; j++) {
      Count c = (Count) dictionaryArr[j].get(word);
      if(c!=null)
        docsCount += c.docCount;
    }
    docsCounts[idx]=docsCount;
  }
  attributes.trimToSize();
  m_Dictionary = newDictionary;
  numInstances = getInputFormat().numInstances();
  // Set the filter's output format.
  Instances outputFormat = new Instances(getInputFormat().relationName(),
                                         attributes, 0);
  outputFormat.setClassIndex(classIndex);
  setOutputFormat(outputFormat);
}
private void convertInstance(Instance instance) throws Exception {
// Convert the instance into a sorted set of indexes
TreeMap contained = new TreeMap();
// Copy all non-converted attributes from input to output
int firstCopy = 0;
for (int i = 0; i < getInputFormat().numAttributes(); i++) {
if (!m_SelectedRange.isInRange(i)) {
if (getInputFormat().attribute(i).type() != Attribute.STRING) {
// Add simple nominal and numeric attributes directly
if (instance.value(i) != 0.0) {
contained.put(new Integer(firstCopy),
new Double(instance.value(i)));
}
} else {
if (instance.isMissing(i)) {
contained.put(new Integer(firstCopy),
new Double(Instance.missingValue()));
} else {
// If this is a string attribute, we have to first add
// this value to the range of possible values, then add
// its new internal index.
if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
// Note that the first string value in a
// SparseInstance doesn't get printed.
outputFormatPeek().attribute(firstCopy)
.addStringValue("Hack to defeat SparseInstance bug");
}
int newIndex = outputFormatPeek().attribute(firstCopy)
.addStringValue(instance.stringValue(i));
contained.put(new Integer(firstCopy),
new Double(newIndex));
}
}
firstCopy++;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -