stringtowordvector.java

来自「Weka」· Java 代码 · 共 1,638 行 · 第 1/4 页

JAVA
1,638
字号
  }  /**   * sorts an array   *    * @param array the array to sort   */  private static void sortArray(int [] array) {          int i, j, h, N = array.length - 1;	    for (h = 1; h <= N / 9; h = 3 * h + 1); 	    for (; h > 0; h /= 3) {      for (i = h + 1; i <= N; i++) {         int v = array[i];         j = i;         while (j > h && array[j - h] > v ) {           array[j] = array[j - h];           j -= h;         }         array[j] = v;       }     }  }  /**   * determines the selected range   */  private void determineSelectedRange() {        Instances inputFormat = getInputFormat();        // Calculate the default set of fields to convert    if (m_SelectedRange == null) {      StringBuffer fields = new StringBuffer();      for (int j = 0; j < inputFormat.numAttributes(); j++) { 	if (inputFormat.attribute(j).type() == Attribute.STRING)	  fields.append((j + 1) + ",");      }      m_SelectedRange = new Range(fields.toString());    }    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);        // Prevent the user from converting non-string fields    StringBuffer fields = new StringBuffer();    for (int j = 0; j < inputFormat.numAttributes(); j++) {       if (m_SelectedRange.isInRange(j) 	  && inputFormat.attribute(j).type() == Attribute.STRING)	fields.append((j + 1) + ",");    }    m_SelectedRange.setRanges(fields.toString());    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);    // System.err.println("Selected Range: " + getSelectedRange().getRanges());   }    /**   * determines the dictionary   */  private void determineDictionary() {    // initialize stopwords    Stopwords stopwords = new Stopwords();    if (getUseStoplist()) {      try {	if (getStopwords().exists() && !getStopwords().isDirectory())	  stopwords.read(getStopwords());      }      catch (Exception e) {	e.printStackTrace();      }    }    // Operate on a per-class basis if class attribute is set    int classInd = getInputFormat().classIndex();    int values = 1;    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {      values = getInputFormat().attribute(classInd).numValues();    }    //TreeMap dictionaryArr [] = new TreeMap[values];    TreeMap [] dictionaryArr = new TreeMap[values];    for (int i = 0; i < values; i++) {      dictionaryArr[i] = new TreeMap();    }    // Make sure we know which fields to convert    determineSelectedRange();    // Tokenize all training text into an orderedMap of "words".    for (int i = 0; i < getInputFormat().numInstances(); i++) {      Instance instance = getInputFormat().instance(i);      int vInd = 0;      if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {	vInd = (int)instance.classValue();      }      // Iterate through all relevant string attributes of the current instance      Hashtable h = new Hashtable();      for (int j = 0; j < instance.numAttributes(); j++) {         if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {	  // Get tokenizer          m_Tokenizer.tokenize(instance.stringValue(j));          	  // Iterate through tokens, perform stemming, and remove stopwords	  // (if required)          while (m_Tokenizer.hasMoreElements()) {            String word = ((String)m_Tokenizer.nextElement()).intern();                        if(this.m_lowerCaseTokens==true)                word = word.toLowerCase();                        word = m_Stemmer.stem(word);                        if(this.m_useStoplist==true)                if(stopwords.is(word))                    continue;                        if(!(h.contains(word)))                h.put(word, new Integer(0));            Count count = (Count)dictionaryArr[vInd].get(word);            if (count == null) {              dictionaryArr[vInd].put(word, new Count(1));            } else {	      count.count ++;                            }          }                  }      }      //updating the docCount for the words that have occurred in this      //instance(document).      Enumeration e = h.keys();      while(e.hasMoreElements()) {	String word = (String) e.nextElement();	Count c = (Count)dictionaryArr[vInd].get(word);	if(c!=null) {	  c.docCount++;	} else 	  System.err.println("Warning: A word should definitely be in the "+			     "dictionary.Please check the code");      }    }    // Figure out the minimum required word frequency    int totalsize = 0;    int prune[] = new int[values];    for (int z = 0; z < values; z++) {      totalsize += dictionaryArr[z].size();      int array[] = new int[dictionaryArr[z].size()];      int pos = 0;      Iterator it = dictionaryArr[z].keySet().iterator();      while (it.hasNext()) {        String word = (String)it.next();        Count count = (Count)dictionaryArr[z].get(word);        array[pos] = count.count;        pos++;      }      // sort the array      sortArray(array);      if (array.length < m_WordsToKeep) {        // if there aren't enough words, set the threshold to	// minFreq        prune[z] = m_minTermFreq;      } else {        // otherwise set it to be at least minFreq        prune[z] = Math.max(m_minTermFreq, 			    array[array.length - m_WordsToKeep]);      }    }    // Convert the dictionary into an attribute index    // and create one attribute per word    FastVector attributes = new FastVector(totalsize +					   getInputFormat().numAttributes());    // Add the non-converted attributes     int classIndex = -1;    for (int i = 0; i < getInputFormat().numAttributes(); i++) {      if (!m_SelectedRange.isInRange(i)) {         if (getInputFormat().classIndex() == i) {          classIndex = attributes.size();        }	attributes.addElement(getInputFormat().attribute(i).copy());      }         }        // Add the word vector attributes (eliminating duplicates    // that occur in multiple classes)    TreeMap newDictionary = new TreeMap();    int index = attributes.size();    for(int z = 0; z < values; z++) {      Iterator it = dictionaryArr[z].keySet().iterator();      while (it.hasNext()) {        String word = (String)it.next();        Count count = (Count)dictionaryArr[z].get(word);        if (count.count >= prune[z]) {          if(newDictionary.get(word) == null) {            newDictionary.put(word, new Integer(index++));            attributes.addElement(new Attribute(m_Prefix + word));          }        }      }    }        // Compute document frequencies    m_DocsCounts = new int[attributes.size()];    Iterator it = newDictionary.keySet().iterator();    while(it.hasNext()) {      String word = (String) it.next();      int idx = ((Integer)newDictionary.get(word)).intValue();      int docsCount=0;      for(int j=0; j<values; j++) {	Count c = (Count) dictionaryArr[j].get(word);	if(c!=null)	  docsCount += c.docCount;      }      m_DocsCounts[idx]=docsCount;    }    // Trim vector and set instance variables    attributes.trimToSize();    m_Dictionary = newDictionary;    m_NumInstances = getInputFormat().numInstances();        // Set the filter's output format    Instances outputFormat = new Instances(getInputFormat().relationName(),                                            attributes, 0);    outputFormat.setClassIndex(classIndex);    setOutputFormat(outputFormat);  }  /**   * Converts the instance w/o normalization.   *    * @oaram instance the instance to convert   * @param v   * @return the conerted instance   */  private int convertInstancewoDocNorm(Instance instance, FastVector v) {    // Convert the instance into a sorted set of indexes    TreeMap contained = new TreeMap();        // Copy all non-converted attributes from input to output    int firstCopy = 0;    for (int i = 0; i < getInputFormat().numAttributes(); i++) {      if (!m_SelectedRange.isInRange(i)) { 	if (getInputFormat().attribute(i).type() != Attribute.STRING) {	  // Add simple nominal and numeric attributes directly	  if (instance.value(i) != 0.0) {	    contained.put(new Integer(firstCopy), 			  new Double(instance.value(i)));	  } 	} else {	  if (instance.isMissing(i)) {	    contained.put(new Integer(firstCopy),			  new Double(Instance.missingValue()));	  } else {	    	    // If this is a string attribute, we have to first add	    // this value to the range of possible values, then add	    // its new internal index.	    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {	      // Note that the first string value in a	      // SparseInstance doesn't get printed.	      outputFormatPeek().attribute(firstCopy)		.addStringValue("Hack to defeat SparseInstance bug");	    }	    int newIndex = outputFormatPeek().attribute(firstCopy)	      .addStringValue(instance.stringValue(i));	    contained.put(new Integer(firstCopy), 			  new Double(newIndex));	  }	}	firstCopy++;      }         }        for (int j = 0; j < instance.numAttributes(); j++) {       //if ((getInputFormat().attribute(j).type() == Attribute.STRING)       if (m_SelectedRange.isInRange(j)	  && (instance.isMissing(j) == false)) {                          m_Tokenizer.tokenize(instance.stringValue(j));                while (m_Tokenizer.hasMoreElements()) {          String word = (String)m_Tokenizer.nextElement();           if(this.m_lowerCaseTokens==true)	    word = word.toLowerCase();          word = m_Stemmer.stem(word);          Integer index = (Integer) m_Dictionary.get(word);          if (index != null) {            if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup              Double count = (Double)contained.get(index);              if (count != null) {                contained.put(index, new Double(count.doubleValue() + 1.0));              } else {                contained.put(index, new Double(1));              }            } else {              contained.put(index, new Double(1));            }                          }        }      }    }        //Doing TFTransform    if(m_TFTransform==true) {      Iterator it = contained.keySet().iterator();      for(int i=0; it.hasNext(); i++) {	Integer index = (Integer)it.next();	if( index.intValue() >= firstCopy ) { 	  double val = ((Double)contained.get(index)).doubleValue();	  val = Math.log(val+1);	  contained.put(index, new Double(val));	}      }    }        //Doing IDFTransform    if(m_IDFTransform==true) {      Iterator it = contained.keySet().iterator();      for(int i=0; it.hasNext(); i++) {	Integer index = (Integer)it.next();	if( index.intValue() >= firstCopy ) {	  double val = ((Double)contained.get(index)).doubleValue();	  val = val*Math.log( m_NumInstances /			      (double) m_DocsCounts[index.intValue()] );	  contained.put(index, new Double(val));	}      }            }        // Convert the set to structures needed to create a sparse instance.    double [] values = new double [contained.size()];    int [] indices = new int [contained.size()];    Iterator it = contained.keySet().iterator();    for (int i = 0; it.hasNext(); i++) {      Integer index = (Integer)it.next();      Double value = (Double)contained.get(index);      values[i] = value.doubleValue();      indices[i] = index.intValue();    }    Instance inst = new SparseInstance(instance.weight(), values, indices,                                        outputFormatPeek().numAttributes());    inst.setDataset(outputFormatPeek());    v.addElement(inst);        return firstCopy;      }    /**   * Normalizes given instance to average doc length (only the newly   * constructed attributes).   *    * @param inst	the instance to normalize   * @param firstCopy   * @throws Exception if avg. doc length not set   */  private void normalizeInstance(Instance inst, int firstCopy)     throws Exception {    double docLength = 0;    if (m_AvgDocLength < 0) {      throw new Exception("Average document length not set.");    }    // Compute length of document vector    for(int j=0; j<inst.numValues(); j++) {      if(inst.index(j)>=firstCopy) {	docLength += inst.valueSparse(j) * inst.valueSparse(j);      }    }            docLength = Math.sqrt(docLength);    // Normalize document vector    for(int j=0; j<inst.numValues(); j++) {      if(inst.index(j)>=firstCopy) {	double val = inst.valueSparse(j) * m_AvgDocLength / docLength;	inst.setValueSparse(j, val);	if (val == 0){	  System.err.println("setting value "+inst.index(j)+" to zero.");	  j--;	}      }    }          }    /**   * Main method for testing this class.   *   * @param argv should contain arguments to the filter:    * use -h for help   */  public static void main(String [] argv) {    runFilter(new StringToWordVector(), argv);  }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?