📄 stringtowordvector.java

📁 wekaUT 是 University of Texas at Austin 开发的基于 Weka 的半监督学习（semi-supervised learning）分类器
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
  }

  /**
   * Set the value of m_SelectedRange.
   *
   * @param newSelectedRange Value to assign to m_SelectedRange.
   */
  public void setSelectedRange(String newSelectedRange) {
    m_SelectedRange = new Range(newSelectedRange);
  }

  /**
   * Gets the number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   *
   * @return the target number of words in the output vector (per class if
   * assigned).
   */
  public int getWordsToKeep() {
    return m_WordsToKeep;
  }

  /**
   * Sets the number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   *
   * @param newWordsToKeep the target number of words in the output
   * vector (per class if assigned).
   */
  public void setWordsToKeep(int newWordsToKeep) {
    m_WordsToKeep = newWordsToKeep;
  }

  /**
   * Sorts the given array in place into ascending order.
   *
   * Delegates to the JDK sort so that every element, including the one at
   * index 0, takes part in the ordering. determineDictionary() reads
   * array[0] of the sorted result whenever the number of distinct words
   * equals the words-to-keep target, so the first slot must hold the
   * smallest count.
   *
   * @param array the values to sort; modified in place.
   */
  private static void sortArray(int [] array) {
    // Fully qualified: the file's import section is not guaranteed to
    // include java.util.Arrays.
    java.util.Arrays.sort(array);
  }

  /**
   * Works out which attributes of the input format are converted to words
   * and stores the result in m_SelectedRange.
   *
   * If no range has been set, all string attributes are selected. Whatever
   * range is in effect is then narrowed so that it only contains string
   * attributes.
   */
  private void determineSelectedRange() {

    Instances inputFormat = getInputFormat();

    // Calculate the default set of fields to convert
    if (m_SelectedRange == null) {
      StringBuffer defaults = new StringBuffer();
      for (int j = 0; j < inputFormat.numAttributes(); j++) {
        if (inputFormat.attribute(j).type() == Attribute.STRING) {
          // The range string is built from 1-based attribute numbers.
          defaults.append((j + 1) + ",");
        }
      }
      m_SelectedRange = new Range(defaults.toString());
    }
    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);

    // Prevent the user from converting non-string fields
    StringBuffer fields = new StringBuffer();
    for (int j = 0; j < inputFormat.numAttributes(); j++) {
      if (m_SelectedRange.isInRange(j)
          && inputFormat.attribute(j).type() == Attribute.STRING) {
        fields.append((j + 1) + ",");
      }
    }
    m_SelectedRange.setRanges(fields.toString());
  }

  /**
   * Builds the word dictionary from the instances stored in the input
   * format and sets the filter's output format.
   *
   * Words are counted separately for each class value when a class
   * attribute is set, otherwise in a single table. For each table a count
   * threshold is derived from m_WordsToKeep; every word whose count reaches
   * the threshold of at least one table becomes one attribute of the output
   * format, placed after the copies of the non-converted attributes. The
   * resulting word-to-attribute-index map is stored in m_Dictionary.
   */
  private void determineDictionary() {

    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (classInd != -1) {
      values = getInputFormat().attribute(classInd).numValues();
    }

    // One word -> Count table per class value (a single table if no class)
    TreeMap dictionaryArr [] = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an ordered map of "words".
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      Instance instance = getInputFormat().instance(i);
      for (int j = 0; j < instance.numAttributes(); j++) {
        // determineSelectedRange() has already restricted the range to
        // string attributes, so no type check is needed here.
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

          // The table to update is the same for every token of this value.
          int vInd = 0;
          if (classInd != -1) {
            vInd = (int)instance.classValue();
          }

          StringTokenizer st = new StringTokenizer(instance.stringValue(j),
                                                   delimiters);
          while (st.hasMoreTokens()) {
            String word = st.nextToken().intern();
            Count count = (Count)dictionaryArr[vInd].get(word);
            if (count == null) {
              dictionaryArr[vInd].put(word, new Count(1));
            } else {
              count.count ++;
            }
          }
        }
      }
    }

    // Work out, per table, the minimum count a word needs to be kept.
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
      totalsize += dictionaryArr[z].size();

      int array[] = new int[dictionaryArr[z].size()];
      int pos = 0;
      Iterator counts = dictionaryArr[z].values().iterator();
      while (counts.hasNext()) {
        array[pos] = ((Count)counts.next()).count;
        pos++;
      }

      // sort the array
      sortArray(array);
      if (array.length < m_WordsToKeep) {
        // if there aren't enough words, set the threshold to 1
        prune[z] = 1;
      } else {
        // otherwise set it to be at least 1
        // NOTE(review): a words-to-keep value of 0 or less makes the index
        // below fall outside the array; setWordsToKeep() does not validate
        // its argument. Confirm the intended behaviour before guarding.
        prune[z] = Math.max(1, array[array.length - m_WordsToKeep]);
      }
    }

    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize +
                                           getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) {
        if (getInputFormat().classIndex() == i) {
          // Remember where the class attribute lands in the output format.
          classIndex = attributes.size();
        }
        attributes.addElement(getInputFormat().attribute(i).copy());
      }
    }

    // Add the word vector attributes
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        if (count.count >= prune[z]) {
          // A word kept for an earlier class value already has an index.
          if (newDictionary.get(word) == null) {
            newDictionary.put(word, new Integer(index++));
            attributes.addElement(new Attribute(word));
          }
        }
      }
    }
    attributes.trimToSize();
    m_Dictionary = newDictionary;

    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(),
                                           attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
  }

  /**
   * Converts one input instance into a sparse word-vector instance and
   * pushes it onto the output.
   *
   * Non-converted attributes are copied first, in their original order.
   * Each token of the converted attributes that is present in m_Dictionary
   * then sets the corresponding output attribute, either to 1 or, when
   * m_OutputCounts is set, to the number of occurrences.
   *
   * @param instance the instance to convert.
   */
  private void convertInstance(Instance instance) {

    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
          // Add simple nominal and numeric attributes directly
          if (instance.value(i) != 0.0) {
            contained.put(new Integer(firstCopy),
                          new Double(instance.value(i)));
          }
        } else {
          if (instance.isMissing(i)) {
            contained.put(new Integer(firstCopy),
                          new Double(Instance.missingValue()));
          } else {
            // If this is a string attribute, we have to first add
            // this value to the range of possible values, then add
            // its new internal index.
            if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
              // Note that the first string value in a
              // SparseInstance doesn't get printed.
              outputFormatPeek().attribute(firstCopy)
                .addStringValue("Hack to defeat SparseInstance bug");
            }
            int newIndex = outputFormatPeek().attribute(firstCopy)
              .addStringValue(instance.stringValue(i));
            contained.put(new Integer(firstCopy),
                          new Double(newIndex));
          }
        }
        firstCopy++;
      }
    }

    // Record the dictionary words found in the converted attributes
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (m_SelectedRange.isInRange(j)
          && (instance.isMissing(j) == false)) {
        StringTokenizer st = new StringTokenizer(instance.stringValue(j),
                                                 delimiters);
        while (st.hasMoreTokens()) {
          String word = st.nextToken();
          Integer index = (Integer) m_Dictionary.get(word);
          if (index != null) {
            if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup
              Double count = (Double)contained.get(index);
              if (count != null) {
                contained.put(index, new Double(count.doubleValue() + 1.0));
              } else {
                contained.put(index, new Double(1));
              }
            } else {
              contained.put(index, new Double(1));
            }
          }
        }
      }
    }

    // Convert the set to structures needed to create a sparse instance.
    double [] values = new double [contained.size()];
    int [] indices = new int [contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
      Integer index = (Integer)it.next();
      Double value = (Double)contained.get(index);
      values[i] = value.doubleValue();
      indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices,
                                       outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    push(inst);
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter:
   * use -h for help
   */
  public static void main(String [] argv) {

    try {
      if (Utils.getFlag('b', argv)) {
        Filter.batchFilterFile(new StringToWordVector(), argv);
      } else {
        Filter.filterFile(new StringToWordVector(), argv);
      }
    } catch (Exception ex) {
      ex.printStackTrace();
      System.out.println(ex.getMessage());
    }
  }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -