📄 stringtowordvector.java

📁 代码是一个分类器的实现,其中使用了部分weka的源代码。可以将项目导入eclipse运行
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
          if(this.m_onlyAlphabeticTokens==false)              st = new StringTokenizer(instance.stringValue(j),                                                   delimiters);          else              st = new AlphabeticStringTokenizer(instance.stringValue(j));          	  // Iterate through tokens, perform stemming, and remove stopwords	  // (if required)          while (st.hasMoreElements()) {            String word = ((String)st.nextElement()).intern();                        if(this.m_lowerCaseTokens==true)                word = word.toLowerCase();                        word = m_Stemmer.stem(word);                        if(this.m_useStoplist==true)                if(weka.core.Stopwords.isStopword(word))                    continue;                        if(!(h.contains(word)))                h.put(word, new Integer(0));            Count count = (Count)dictionaryArr[vInd].get(word);            if (count == null) {              dictionaryArr[vInd].put(word, new Count(1));            } else {	      count.count ++;                            }          }                  }      }      //updating the docCount for the words that have occurred in this      //instance(document).      Enumeration e = h.keys();      while(e.hasMoreElements()) {	String word = (String) e.nextElement();	Count c = (Count)dictionaryArr[vInd].get(word);	if(c!=null) {	  c.docCount++;	} else 	  System.err.println("Warning: A word should definitely be in the "+			     "dictionary.Please check the code");      }    }    // Figure out the minimum required word frequency    int totalsize = 0;    int prune[] = new int[values];    for (int z = 0; z < values; z++) {      totalsize += dictionaryArr[z].size();      int array[] = new int[dictionaryArr[z].size()];      int pos = 0;      Iterator it = dictionaryArr[z].keySet().iterator();      while (it.hasNext()) {        String word = (String)it.next();        Count count = (Count)dictionaryArr[z].get(word);        array[pos] = count.count;        pos++;      }      // sort the array      sortArray(array);      if (array.length < m_WordsToKeep) {        // if there aren't enough words, set the threshold to	// minFreq        prune[z] = m_minTermFreq;      } else {        // otherwise set it to be at least minFreq        prune[z] = Math.max(m_minTermFreq, 			    array[array.length - m_WordsToKeep]);      }    }    // Convert the dictionary into an attribute index    // and create one attribute per word    FastVector attributes = new FastVector(totalsize +					   getInputFormat().numAttributes());    // Add the non-converted attributes     int classIndex = -1;    for (int i = 0; i < getInputFormat().numAttributes(); i++) {      if (!m_SelectedRange.isInRange(i)) {         if (getInputFormat().classIndex() == i) {          classIndex = attributes.size();        }	attributes.addElement(getInputFormat().attribute(i).copy());      }         }        // Add the word vector attributes (eliminating duplicates    // that occur in multiple classes)    TreeMap newDictionary = new TreeMap();    int index = attributes.size();    for(int z = 0; z < values; z++) {      Iterator it = dictionaryArr[z].keySet().iterator();      while (it.hasNext()) {        String word = (String)it.next();        Count count = (Count)dictionaryArr[z].get(word);        if (count.count >= prune[z]) {          if(newDictionary.get(word) == null) {            newDictionary.put(word, new Integer(index++));            attributes.addElement(new Attribute(m_Prefix + word));          }        }      }    }        // Compute document frequencies    docsCounts = new int[attributes.size()];    Iterator it = newDictionary.keySet().iterator();    while(it.hasNext()) {      String word = (String) it.next();      int idx = ((Integer)newDictionary.get(word)).intValue();      int docsCount=0;      for(int j=0; j<values; j++) {	Count c = (Count) dictionaryArr[j].get(word);	if(c!=null)	  docsCount += c.docCount;      }      docsCounts[idx]=docsCount;    }    // Trim vector and set instance variables    attributes.trimToSize();    m_Dictionary = newDictionary;    numInstances = getInputFormat().numInstances();        // Set the filter's output format    Instances outputFormat = new Instances(getInputFormat().relationName(),                                            attributes, 0);    outputFormat.setClassIndex(classIndex);    setOutputFormat(outputFormat);  }  /**   * Converts the instance w/o normalization.   *    * @oaram instance the instance to convert   * @param v   * @return the conerted instance   */  private int convertInstancewoDocNorm(Instance instance, FastVector v) {    // Convert the instance into a sorted set of indexes    TreeMap contained = new TreeMap();        // Copy all non-converted attributes from input to output    int firstCopy = 0;    for (int i = 0; i < getInputFormat().numAttributes(); i++) {      if (!m_SelectedRange.isInRange(i)) { 	if (getInputFormat().attribute(i).type() != Attribute.STRING) {	  // Add simple nominal and numeric attributes directly	  if (instance.value(i) != 0.0) {	    contained.put(new Integer(firstCopy), 			  new Double(instance.value(i)));	  } 	} else {	  if (instance.isMissing(i)) {	    contained.put(new Integer(firstCopy),			  new Double(Instance.missingValue()));	  } else {	    	    // If this is a string attribute, we have to first add	    // this value to the range of possible values, then add	    // its new internal index.	    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {	      // Note that the first string value in a	      // SparseInstance doesn't get printed.	      outputFormatPeek().attribute(firstCopy)		.addStringValue("Hack to defeat SparseInstance bug");	    }	    int newIndex = outputFormatPeek().attribute(firstCopy)	      .addStringValue(instance.stringValue(i));	    contained.put(new Integer(firstCopy), 			  new Double(newIndex));	  }	}	firstCopy++;      }         }        for (int j = 0; j < instance.numAttributes(); j++) {       //if ((getInputFormat().attribute(j).type() == Attribute.STRING)       if (m_SelectedRange.isInRange(j)	  && (instance.isMissing(j) == false)) {                  Enumeration st;                if(this.m_onlyAlphabeticTokens==false)	  st = new StringTokenizer(instance.stringValue(j),				   delimiters);        else	  st = new AlphabeticStringTokenizer(instance.stringValue(j));                while (st.hasMoreElements()) {          String word = (String)st.nextElement();           if(this.m_lowerCaseTokens==true)	    word = word.toLowerCase();          word = m_Stemmer.stem(word);          Integer index = (Integer) m_Dictionary.get(word);          if (index != null) {            if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup              Double count = (Double)contained.get(index);              if (count != null) {                contained.put(index, new Double(count.doubleValue() + 1.0));              } else {                contained.put(index, new Double(1));              }            } else {              contained.put(index, new Double(1));            }                          }        }      }    }        //Doing TFTransform    if(m_TFTransform==true) {      Iterator it = contained.keySet().iterator();      for(int i=0; it.hasNext(); i++) {	Integer index = (Integer)it.next();	if( index.intValue() >= firstCopy ) { 	  double val = ((Double)contained.get(index)).doubleValue();	  val = Math.log(val+1);	  contained.put(index, new Double(val));	}      }    }        //Doing IDFTransform    if(m_IDFTransform==true) {      Iterator it = contained.keySet().iterator();      for(int i=0; it.hasNext(); i++) {	Integer index = (Integer)it.next();	if( index.intValue() >= firstCopy ) {	  double val = ((Double)contained.get(index)).doubleValue();	  val = val*Math.log( numInstances /			      (double) docsCounts[index.intValue()] );	  contained.put(index, new Double(val));	}      }            }        // Convert the set to structures needed to create a sparse instance.    double [] values = new double [contained.size()];    int [] indices = new int [contained.size()];    Iterator it = contained.keySet().iterator();    for (int i = 0; it.hasNext(); i++) {      Integer index = (Integer)it.next();      Double value = (Double)contained.get(index);      values[i] = value.doubleValue();      indices[i] = index.intValue();    }    Instance inst = new SparseInstance(instance.weight(), values, indices,                                        outputFormatPeek().numAttributes());    inst.setDataset(outputFormatPeek());    v.addElement(inst);        return firstCopy;      }    /**   * Normalizes given instance to average doc length (only the newly   * constructed attributes).   *    * @param inst	the instance to normalize   * @param firstCopy   * @throws Exception if avg. doc length not set   */  private void normalizeInstance(Instance inst, int firstCopy)     throws Exception {    double docLength = 0;    if (avgDocLength < 0) {      throw new Exception("Average document length not set.");    }    // Compute length of document vector    for(int j=0; j<inst.numValues(); j++) {      if(inst.index(j)>=firstCopy) {	docLength += inst.valueSparse(j) * inst.valueSparse(j);      }    }            docLength = Math.sqrt(docLength);    // Normalize document vector    for(int j=0; j<inst.numValues(); j++) {      if(inst.index(j)>=firstCopy) {	double val = inst.valueSparse(j) * avgDocLength / docLength;	inst.setValueSparse(j, val);	if (val == 0){	  System.err.println("setting value "+inst.index(j)+" to zero.");	  j--;	}      }    }          }    /**   * Main method for testing this class.   *   * @param argv should contain arguments to the filter:    * use -h for help   */  public static void main(String [] argv) {    runFilter(new StringToWordVector(), argv);  }      /**   * alphabetic string tokenizer   */  private class AlphabeticStringTokenizer       implements Enumeration {          /** the characters of the string */      private char[] str;            /** the current position */      int currentPos=0;            /**       * Constructor       *        * @param toTokenize the string to tokenize       */      public AlphabeticStringTokenizer(String toTokenize) {          str = new char[toTokenize.length()];          toTokenize.getChars(0, toTokenize.length(), str, 0);      }            /**       * returns whether there are more elements still       *        * @return true if there are still more elements       */      public boolean hasMoreElements() {          int beginpos = currentPos;                    while( beginpos < str.length &&                  (str[beginpos]<'a' || str[beginpos]>'z') &&                 (str[beginpos]<'A' || str[beginpos]>'Z') ) {                     beginpos++;              }          currentPos = beginpos;          //System.out.println("Currently looking at "+str[beginpos]);                    if( beginpos<str.length &&               ((str[beginpos]>='a' && str[beginpos]<='z') ||               (str[beginpos]>='A' && str[beginpos]<='Z')) ) {                   return true;          }          else              return false;      }            /**       * returns the next element       *        * @return the next element       */      public Object nextElement() {          int beginpos, endpos;          beginpos = currentPos;                    while( beginpos < str.length &&                  (str[beginpos]<'a' && str[beginpos]>'z') &&                 (str[beginpos]<'A' && str[beginpos]>'Z') ) {                     beginpos++;              }          currentPos = endpos = beginpos;                    if(beginpos>=str.length)              throw new NoSuchElementException("no more tokens present");                    while( endpos < str.length &&                  ((str[endpos]>='a' && str[endpos]<='z') ||                  (str[endpos]>='A' && str[endpos]<='Z')) ) {                                          endpos++;          }                    String s = new String(str, beginpos, endpos-currentPos);          currentPos = endpos;          //System.out.println("found token >"+s+          //                   "< beginpos: "+beginpos+          //                   " endpos: "+endpos+          //                   " str.length: "+str.length+          //                   " str[beginpos]: "+str[beginpos]);          return s;      }        }}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -