📄 stringtowordvector.java
字号:
if(this.m_onlyAlphabeticTokens==false) st = new StringTokenizer(instance.stringValue(j), delimiters); else st = new AlphabeticStringTokenizer(instance.stringValue(j)); // Iterate through tokens, perform stemming, and remove stopwords // (if required) while (st.hasMoreElements()) { String word = ((String)st.nextElement()).intern(); if(this.m_lowerCaseTokens==true) word = word.toLowerCase(); word = m_Stemmer.stem(word); if(this.m_useStoplist==true) if(weka.core.Stopwords.isStopword(word)) continue; if(!(h.contains(word))) h.put(word, new Integer(0)); Count count = (Count)dictionaryArr[vInd].get(word); if (count == null) { dictionaryArr[vInd].put(word, new Count(1)); } else { count.count ++; } } } } //updating the docCount for the words that have occurred in this //instance(document). Enumeration e = h.keys(); while(e.hasMoreElements()) { String word = (String) e.nextElement(); Count c = (Count)dictionaryArr[vInd].get(word); if(c!=null) { c.docCount++; } else System.err.println("Warning: A word should definitely be in the "+ "dictionary.Please check the code"); } } // Figure out the minimum required word frequency int totalsize = 0; int prune[] = new int[values]; for (int z = 0; z < values; z++) { totalsize += dictionaryArr[z].size(); int array[] = new int[dictionaryArr[z].size()]; int pos = 0; Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String)it.next(); Count count = (Count)dictionaryArr[z].get(word); array[pos] = count.count; pos++; } // sort the array sortArray(array); if (array.length < m_WordsToKeep) { // if there aren't enough words, set the threshold to // minFreq prune[z] = m_minTermFreq; } else { // otherwise set it to be at least minFreq prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]); } } // Convert the dictionary into an attribute index // and create one attribute per word FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes()); // Add the non-converted 
attributes int classIndex = -1; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().classIndex() == i) { classIndex = attributes.size(); } attributes.addElement(getInputFormat().attribute(i).copy()); } } // Add the word vector attributes (eliminating duplicates // that occur in multiple classes) TreeMap newDictionary = new TreeMap(); int index = attributes.size(); for(int z = 0; z < values; z++) { Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String)it.next(); Count count = (Count)dictionaryArr[z].get(word); if (count.count >= prune[z]) { if(newDictionary.get(word) == null) { newDictionary.put(word, new Integer(index++)); attributes.addElement(new Attribute(m_Prefix + word)); } } } } // Compute document frequencies docsCounts = new int[attributes.size()]; Iterator it = newDictionary.keySet().iterator(); while(it.hasNext()) { String word = (String) it.next(); int idx = ((Integer)newDictionary.get(word)).intValue(); int docsCount=0; for(int j=0; j<values; j++) { Count c = (Count) dictionaryArr[j].get(word); if(c!=null) docsCount += c.docCount; } docsCounts[idx]=docsCount; } // Trim vector and set instance variables attributes.trimToSize(); m_Dictionary = newDictionary; numInstances = getInputFormat().numInstances(); // Set the filter's output format Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); } /** * Converts the instance w/o normalization. 
* * @oaram instance the instance to convert * @param v * @return the conerted instance */ private int convertInstancewoDocNorm(Instance instance, FastVector v) { // Convert the instance into a sorted set of indexes TreeMap contained = new TreeMap(); // Copy all non-converted attributes from input to output int firstCopy = 0; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().attribute(i).type() != Attribute.STRING) { // Add simple nominal and numeric attributes directly if (instance.value(i) != 0.0) { contained.put(new Integer(firstCopy), new Double(instance.value(i))); } } else { if (instance.isMissing(i)) { contained.put(new Integer(firstCopy), new Double(Instance.missingValue())); } else { // If this is a string attribute, we have to first add // this value to the range of possible values, then add // its new internal index. if (outputFormatPeek().attribute(firstCopy).numValues() == 0) { // Note that the first string value in a // SparseInstance doesn't get printed. 
outputFormatPeek().attribute(firstCopy) .addStringValue("Hack to defeat SparseInstance bug"); } int newIndex = outputFormatPeek().attribute(firstCopy) .addStringValue(instance.stringValue(i)); contained.put(new Integer(firstCopy), new Double(newIndex)); } } firstCopy++; } } for (int j = 0; j < instance.numAttributes(); j++) { //if ((getInputFormat().attribute(j).type() == Attribute.STRING) if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { Enumeration st; if(this.m_onlyAlphabeticTokens==false) st = new StringTokenizer(instance.stringValue(j), delimiters); else st = new AlphabeticStringTokenizer(instance.stringValue(j)); while (st.hasMoreElements()) { String word = (String)st.nextElement(); if(this.m_lowerCaseTokens==true) word = word.toLowerCase(); word = m_Stemmer.stem(word); Integer index = (Integer) m_Dictionary.get(word); if (index != null) { if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup Double count = (Double)contained.get(index); if (count != null) { contained.put(index, new Double(count.doubleValue() + 1.0)); } else { contained.put(index, new Double(1)); } } else { contained.put(index, new Double(1)); } } } } } //Doing TFTransform if(m_TFTransform==true) { Iterator it = contained.keySet().iterator(); for(int i=0; it.hasNext(); i++) { Integer index = (Integer)it.next(); if( index.intValue() >= firstCopy ) { double val = ((Double)contained.get(index)).doubleValue(); val = Math.log(val+1); contained.put(index, new Double(val)); } } } //Doing IDFTransform if(m_IDFTransform==true) { Iterator it = contained.keySet().iterator(); for(int i=0; it.hasNext(); i++) { Integer index = (Integer)it.next(); if( index.intValue() >= firstCopy ) { double val = ((Double)contained.get(index)).doubleValue(); val = val*Math.log( numInstances / (double) docsCounts[index.intValue()] ); contained.put(index, new Double(val)); } } } // Convert the set to structures needed to create a sparse instance. 
double [] values = new double [contained.size()]; int [] indices = new int [contained.size()]; Iterator it = contained.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { Integer index = (Integer)it.next(); Double value = (Double)contained.get(index); values[i] = value.doubleValue(); indices[i] = index.intValue(); } Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes()); inst.setDataset(outputFormatPeek()); v.addElement(inst); return firstCopy; } /** * Normalizes given instance to average doc length (only the newly * constructed attributes). * * @param inst the instance to normalize * @param firstCopy * @throws Exception if avg. doc length not set */ private void normalizeInstance(Instance inst, int firstCopy) throws Exception { double docLength = 0; if (avgDocLength < 0) { throw new Exception("Average document length not set."); } // Compute length of document vector for(int j=0; j<inst.numValues(); j++) { if(inst.index(j)>=firstCopy) { docLength += inst.valueSparse(j) * inst.valueSparse(j); } } docLength = Math.sqrt(docLength); // Normalize document vector for(int j=0; j<inst.numValues(); j++) { if(inst.index(j)>=firstCopy) { double val = inst.valueSparse(j) * avgDocLength / docLength; inst.setValueSparse(j, val); if (val == 0){ System.err.println("setting value "+inst.index(j)+" to zero."); j--; } } } } /** * Main method for testing this class. 
*
   * @param argv should contain arguments to the filter:
   * use -h for help
   */
  public static void main(String [] argv) {
    runFilter(new StringToWordVector(), argv);
  }

  /**
   * Alphabetic string tokenizer: enumerates the maximal runs of the
   * letters a-z and A-Z found in a string. Every other character acts as
   * a separator and never appears in a token.
   */
  private class AlphabeticStringTokenizer implements Enumeration {

    /** the characters of the string */
    private char[] str;

    /** the current position */
    int currentPos=0;

    /**
     * Constructor
     *
     * @param toTokenize the string to tokenize
     */
    public AlphabeticStringTokenizer(String toTokenize) {
      str = toTokenize.toCharArray();
    }

    /**
     * Tells whether the character is one of a-z or A-Z.
     *
     * @param c the character to test
     * @return true if the character can be part of a token
     */
    private boolean isAsciiLetter(char c) {
      return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /**
     * Finds the first letter at or after the given position.
     *
     * @param from the position to start looking at
     * @return the position of the first letter at or after
     * <code>from</code>, or the length of the string if there is none
     */
    private int nextLetterPos(int from) {
      int pos = from;
      while (pos < str.length && !isAsciiLetter(str[pos])) {
        pos++;
      }
      return pos;
    }

    /**
     * returns whether there are more elements still. Moves the current
     * position past any separators in front of the next token.
     *
     * @return true if there are still more elements
     */
    public boolean hasMoreElements() {
      currentPos = nextLetterPos(currentPos);
      return currentPos < str.length;
    }

    /**
     * returns the next element
     *
     * @return the next element, a non-empty String of letters
     * @throws NoSuchElementException if only separators (or nothing) are
     * left in the string
     */
    public Object nextElement() {
      // Separators have to be skipped here as well, with the same letter
      // test hasMoreElements() uses. Otherwise a call that is not preceded
      // by hasMoreElements() would stop on a separator, return an empty
      // token and never advance.
      int beginpos = nextLetterPos(currentPos);
      currentPos = beginpos;

      if (beginpos >= str.length) {
        throw new NoSuchElementException("no more tokens present");
      }

      int endpos = beginpos;
      while (endpos < str.length && isAsciiLetter(str[endpos])) {
        endpos++;
      }

      String s = new String(str, beginpos, endpos - beginpos);
      currentPos = endpos;
      return s;
    }
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -