📄 stringtowordvector.java
字号:
for (int j = 0; j < instance.numAttributes(); j++) {
//if ((getInputFormat().attribute(j).type() == Attribute.STRING)
if (m_SelectedRange.isInRange(j)
&& (instance.isMissing(j) == false)) {
Enumeration st;
if(this.m_onlyAlphabeticTokens==false)
st = new StringTokenizer(instance.stringValue(j),
delimiters);
else
st = new AlphabeticStringTokenizer(instance.stringValue(j));
while (st.hasMoreElements()) {
String word = (String)st.nextElement();
if(this.m_lowerCaseTokens==true)
word = word.toLowerCase();
Integer index = (Integer) m_Dictionary.get(word);
if (index != null) {
if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup
Double count = (Double)contained.get(index);
if (count != null) {
contained.put(index, new Double(count.doubleValue() + 1.0));
} else {
contained.put(index, new Double(1));
}
} else {
contained.put(index, new Double(1));
}
}
}
}
}
//Doing TFTransform
if(m_TFTransform==true) {
Iterator it = contained.keySet().iterator();
for(int i=0; it.hasNext(); i++) {
Integer index = (Integer)it.next();
if( index.intValue() >= firstCopy ) {
double val = ((Double)contained.get(index)).doubleValue();
val = Math.log(val+1);
// if(printInstance==true)
// System.out.println("Instance 0"+ //instance.toString()+
// ": setting value "+index.intValue()+
// " from "+((Double)contained.get(index)).doubleValue()+
// " to "+val);
contained.put(index, new Double(val));
}
}
}
//Doing IDFTransform
if(m_IDFTransform==true) {
Iterator it = contained.keySet().iterator();
for(int i=0; it.hasNext(); i++) {
Integer index = (Integer)it.next();
//int num = getInputFormat().numInstances();
if( index.intValue() >= firstCopy ) {
double val = ((Double)contained.get(index)).doubleValue();
val = val*Math.log( numInstances / //num /
(double)docsCounts[index.intValue()] );
// if(printInstance==true)
// System.out.println("Instance 0"+ //instance.toString()+
// ": "+
// "num: "+numInstances+" index.intValue(): "+index.intValue()+
// " docsCounts: "+this.docsCounts[index.intValue()]+ //"\n"+
// "setting value "+index.intValue()+
// " from "+((Double)contained.get(index)).doubleValue()+
// " to "+val);
contained.put(index, new Double(val));
}
}
}
//Doing length normalization
if(m_normalizeDocLength==true) {
if(avgDocLength<0)
throw new Exception("Error. Average Doc Length not defined yet.");
double sumSq = 0;
Iterator it = contained.keySet().iterator();
for(int i=0; it.hasNext(); i++) {
Integer index = (Integer)it.next();
if( index.intValue() >= firstCopy ) {
double val = ((Double)contained.get(index)).doubleValue();
sumSq += val*val;
}
}
it = contained.keySet().iterator();
for(int i=0; it.hasNext(); i++) {
Integer index = (Integer)it.next();
if( index.intValue() >= firstCopy ) {
double val = ((Double)contained.get(index)).doubleValue();
val = val/Math.sqrt(sumSq);
val = val*avgDocLength;
// System.out.println("Instance "+instance.toString()+
// ": setting value "+index.intValue()+
// " from "+((Double)contained.get(index)).doubleValue()+
// " to "+val);
contained.put(index, new Double(val));
}
}
}
// Convert the set to structures needed to create a sparse instance.
double [] values = new double [contained.size()];
int [] indices = new int [contained.size()];
Iterator it = contained.keySet().iterator();
for (int i = 0; it.hasNext(); i++) {
Integer index = (Integer)it.next();
Double value = (Double)contained.get(index);
values[i] = value.doubleValue();
indices[i] = index.intValue();
}
Instance inst = new SparseInstance(instance.weight(), values, indices,
outputFormatPeek().numAttributes());
inst.setDataset(outputFormatPeek());
push(inst);
//System.err.print("#"); System.err.flush();
}
/**
 * Converts one input instance into a sparse word-vector instance and appends
 * it to <code>v</code> instead of pushing it onto the output queue. The TF
 * and IDF transforms are applied when their flags are set; no document-length
 * normalisation is performed by this method.
 *
 * @param instance the input instance to convert
 * @param v the vector that receives the converted instance
 * @return the number of input attributes lying outside the selected range;
 *         only entries whose index is at or above this value are touched by
 *         the TF/IDF transforms
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
  // Sorted map: output attribute index (Integer) -> value (Double).
  TreeMap sparse = new TreeMap();

  // Step 1: carry over every attribute that is not in the selected range.
  int firstCopy = 0;
  for (int att = 0; att < getInputFormat().numAttributes(); att++) {
    if (m_SelectedRange.isInRange(att)) {
      continue;
    }
    boolean isString =
      (getInputFormat().attribute(att).type() == Attribute.STRING);
    if (!isString) {
      // Nominal and numeric values are copied as they are; zeros are
      // left out because the result is sparse.
      double raw = instance.value(att);
      if (raw != 0.0) {
        sparse.put(new Integer(firstCopy), new Double(raw));
      }
    } else if (instance.isMissing(att)) {
      sparse.put(new Integer(firstCopy),
                 new Double(Instance.missingValue()));
    } else {
      // A string value must first be registered with the output
      // attribute; the index handed back is what gets stored.
      Attribute target = outputFormatPeek().attribute(firstCopy);
      if (target.numValues() == 0) {
        // Note that the first string value in a
        // SparseInstance doesn't get printed.
        target.addStringValue("Hack to defeat SparseInstance bug");
      }
      int registered = target.addStringValue(instance.stringValue(att));
      sparse.put(new Integer(firstCopy), new Double(registered));
    }
    firstCopy++;
  }

  // Step 2: tokenize the selected attributes and record dictionary hits.
  for (int att = 0; att < instance.numAttributes(); att++) {
    if (!m_SelectedRange.isInRange(att) || instance.isMissing(att)) {
      continue;
    }
    Enumeration tokens;
    if (m_onlyAlphabeticTokens) {
      tokens = new AlphabeticStringTokenizer(instance.stringValue(att));
    } else {
      tokens = new StringTokenizer(instance.stringValue(att), delimiters);
    }
    while (tokens.hasMoreElements()) {
      String token = (String) tokens.nextElement();
      if (m_lowerCaseTokens) {
        token = token.toLowerCase();
      }
      Integer slot = (Integer) m_Dictionary.get(token);
      if (slot == null) {
        // Words outside the dictionary are ignored.
        continue;
      }
      // Presence is stored as 1; the map is only consulted for the
      // previous value when counts were requested.
      double updated = 1.0;
      if (m_OutputCounts) {
        Double previous = (Double) sparse.get(slot);
        if (previous != null) {
          updated = previous.doubleValue() + 1.0;
        }
      }
      sparse.put(slot, new Double(updated));
    }
  }

  // Step 3: TF transform, log(1 + value), for indices >= firstCopy.
  if (m_TFTransform) {
    Iterator keys = sparse.keySet().iterator();
    while (keys.hasNext()) {
      Integer key = (Integer) keys.next();
      if (key.intValue() < firstCopy) {
        continue;
      }
      double current = ((Double) sparse.get(key)).doubleValue();
      // Overwriting the value of an existing key is not a structural
      // change, so the iterator stays valid.
      sparse.put(key, new Double(Math.log(current + 1)));
    }
  }

  // Step 4: IDF transform, value * log(numInstances / docsCounts[index]).
  if (m_IDFTransform) {
    Iterator keys = sparse.keySet().iterator();
    while (keys.hasNext()) {
      Integer key = (Integer) keys.next();
      if (key.intValue() < firstCopy) {
        continue;
      }
      double current = ((Double) sparse.get(key)).doubleValue();
      double idf = Math.log(numInstances / (double) docsCounts[key.intValue()]);
      sparse.put(key, new Double(current * idf));
    }
  }

  // Document-length normalisation is not done in this variant of the
  // conversion (compare the method name).

  // Step 5: flatten the sorted map into the parallel arrays that
  // SparseInstance expects.
  int size = sparse.size();
  double[] values = new double[size];
  int[] indices = new int[size];
  int pos = 0;
  Iterator keys = sparse.keySet().iterator();
  while (keys.hasNext()) {
    Integer key = (Integer) keys.next();
    values[pos] = ((Double) sparse.get(key)).doubleValue();
    indices[pos] = key.intValue();
    pos++;
  }

  Instance converted = new SparseInstance(instance.weight(), values, indices,
                                          outputFormatPeek().numAttributes());
  converted.setDataset(outputFormatPeek());
  // Hand the result to the caller's vector.
  v.addElement(converted);
  return firstCopy;
}
/**
* Main method for testing this class.
*
* @param argv should contain arguments to the filter:
* use -h for help
*/
public static void main(String [] argv) {
  try {
    // The -b flag selects batch filtering; without it the plain
    // single-file filtering entry point is used.
    boolean batchRequested = Utils.getFlag('b', argv);
    if (!batchRequested) {
      Filter.filterFile(new StringToWordVector(), argv);
    } else {
      Filter.batchFilterFile(new StringToWordVector(), argv);
    }
  } catch (Exception failure) {
    // Report the full trace first, then the bare message on stdout.
    failure.printStackTrace();
    System.out.println(failure.getMessage());
  }
}
/**
 * Tokenizer that returns only maximal runs of ASCII letters (a-z, A-Z) from
 * a string. Every other character is treated as a delimiter and skipped.
 */
private class AlphabeticStringTokenizer implements Enumeration {

  /** The characters of the string being tokenized. */
  private final char[] str;

  /** Index of the next character that has not been consumed yet. */
  private int currentPos = 0;

  /**
   * Creates a tokenizer over the given string.
   *
   * @param toTokenize the string to split into alphabetic tokens
   */
  public AlphabeticStringTokenizer(String toTokenize) {
    str = toTokenize.toCharArray();
  }

  /**
   * Tests whether at least one more alphabetic token is available. As a
   * side effect the current position is moved past any leading delimiters.
   *
   * @return true if another token can be fetched with nextElement()
   */
  public boolean hasMoreElements() {
    skipDelimiters();
    // After skipping, the position is either at a letter or at the end.
    return currentPos < str.length;
  }

  /**
   * Returns the next alphabetic token.
   *
   * @return the next run of ASCII letters, as a String
   * @throws NoSuchElementException if only delimiters (or nothing) remain
   */
  public Object nextElement() {
    // The delimiters must really be skipped here as well: the previous
    // condition could never be true, so a call made while positioned on a
    // delimiter returned "" forever instead of advancing or throwing.
    skipDelimiters();
    if (currentPos >= str.length) {
      throw new NoSuchElementException("no more tokens present");
    }
    int begin = currentPos;
    int end = begin;
    while (end < str.length && isAsciiLetter(str[end])) {
      end++;
    }
    currentPos = end;
    return new String(str, begin, end - begin);
  }

  /** Advances currentPos to the next ASCII letter or to the end of input. */
  private void skipDelimiters() {
    while (currentPos < str.length && !isAsciiLetter(str[currentPos])) {
      currentPos++;
    }
  }

  /** Returns true if c lies in a-z or A-Z. */
  private boolean isAsciiLetter(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
  }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -