⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stringtowordvectorfilter.java

📁 一个数据挖掘系统的源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:

  /**
   * Gets the number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   *
   * @return the target number of words in the output vector (per class if
   * assigned).
   */
  public int getWordsToKeep() {

    return m_WordsToKeep;
  }

  /**
   * Sets the number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   *
   * @param newWordsToKeep the target number of words in the output
   * vector (per class if assigned).
   */
  public void setWordsToKeep(int newWordsToKeep) {

    m_WordsToKeep = newWordsToKeep;
  }


  private static void sortArray(int [] array) {

    int i, j, h, N = array.length - 1;

    for (h = 1; h <= N / 9; h = 3 * h + 1);

    for (; h > 0; h /= 3) {
      for (i = h + 1; i <= N; i++) {
        int v = array[i];
        j = i;
        while (j > h && array[j - h] > v ) {
          array[j] = array[j - h];
          j -= h;
        }
        array[j] = v;
      }
    }
  }

  private void determineSelectedRange() {

    Instances inputFormat = getInputFormat();

    // Calculate the default set of fields to convert
    if (m_SelectedRange == null) {
      StringBuffer fields = new StringBuffer();
      for (int j = 0; j < inputFormat.numAttributes(); j++) {
	if (inputFormat.attribute(j).type() == Attribute.STRING)
	  fields.append((j + 1) + ",");
      }
      m_SelectedRange = new Range(fields.toString());
    }

    m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);

    // Prevent the user from converting non-string fields
    StringBuffer fields = new StringBuffer();
    for (int j = 0; j < inputFormat.numAttributes(); j++) {
      if (m_SelectedRange.isInRange(j)
	  && inputFormat.attribute(j).type() == Attribute.STRING)
	fields.append((j + 1) + ",");
    }
    m_SelectedRange.setRanges(fields.toString());

    // System.err.println("Selected Range: " + getSelectedRange().getRanges());
  }

  private void determineDictionary() throws Exception {

    // System.err.println("Creating dictionary");

    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (classInd != -1) {
      values = getInputFormat().attribute(classInd).numValues();
    }
    TreeMap dictionaryArr [] = new TreeMap[values];
    for (int i = 0; i < values; i++) {
      dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
      /*
	if (i % 10 == 0) {
        System.err.print( i + " " + getInputFormat().numInstances() + "\r");
        System.err.flush();
	}
      */
      Instance instance = getInputFormat().instance(i);
      for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
	  //getInputFormat().attribute(j).type() == Attribute.STRING

          StringTokenizer st = new StringTokenizer(instance.stringValue(j),
                                                   delimiters);
          while (st.hasMoreTokens()) {
            String word = st.nextToken().intern();
            int vInd = 0;
            if (classInd != -1) {
              vInd = (int)instance.classValue();
            }
            Count count = (Count)dictionaryArr[vInd].get(word);
            if (count == null) {
              dictionaryArr[vInd].put(word, new Count(1));
            } else {
              count.count ++;
            }
          }
        }
      }
    }


    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
      totalsize += dictionaryArr[z].size();

      int array[] = new int[dictionaryArr[z].size()];
      int pos = 0;
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        array[pos] = count.count;
        pos++;
      }

      // sort the array
      sortArray(array);
      if (array.length < m_WordsToKeep) {
        // if there aren't enough words, set the threshold to 1
        prune[z] = 1;
      } else {
        // otherwise set it to be at least 1
        prune[z] = Math.max(1, array[array.length - m_WordsToKeep]);
      }

    }

    /*
      for (int z=0;z<values;z++) {
      System.err.println(dictionaryArr[z].size()+" "+totalsize);
      }
    */

    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize +
					   getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) {
        if (getInputFormat().classIndex() == i) {
          classIndex = attributes.size();
        }
	attributes.addElement(getInputFormat().attribute(i).copy());
      }
    }

    // Add the word vector attributes
    TreeMap newDictionary = new TreeMap();

    int index = attributes.size();
    for(int z = 0; z < values; z++) {
      /*
	System.err.print("\nCreating word index...");
	if (values > 1) {
        System.err.print(" for class id=" + z);
	}
	System.err.flush();
      */
      Iterator it = dictionaryArr[z].keySet().iterator();
      while (it.hasNext()) {
        String word = (String)it.next();
        Count count = (Count)dictionaryArr[z].get(word);
        if (count.count >= prune[z]) {
          //          System.err.println(word+" "+newDictionary.get(word));
          if(newDictionary.get(word) == null) {
            /*
	      if (values > 1) {
              System.err.print(getInputFormat().classAttribute().value(z) + " ");
	      }
	      System.err.println(word);
            */
            newDictionary.put(word, new Integer(index++));
            attributes.addElement(new Attribute(word));
          }
        }
      }
    }

    attributes.trimToSize();
    m_Dictionary = newDictionary;

    //System.err.println("done: " + index + " words in total.");


    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(),
                                           attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
  }


  private void convertInstance(Instance instance) {

    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (!m_SelectedRange.isInRange(i)) {

	if (getInputFormat().attribute(i).type() != Attribute.STRING) {
	  // Add simple nominal and numeric attributes directly
	  if (instance.value(i) != 0.0) {
	    contained.put(new Integer(firstCopy),
			  new Double(instance.value(i)));
	  }
	} else {
	  if (instance.isMissing(i)) {
	    contained.put(new Integer(firstCopy),
			  new Double(Instance.missingValue()));
	  } else {

	    // If this is a string attribute, we have to first add
	    // this value to the range of possible values, then add
	    // its new internal index.
	    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
	      // Note that the first string value in a
	      // SparseInstance doesn't get printed.
	      outputFormatPeek().attribute(firstCopy)
		.addStringValue("Hack to defeat SparseInstance bug");
	    }
	    int newIndex = outputFormatPeek().attribute(firstCopy)
	      .addStringValue(instance.stringValue(i));
	    contained.put(new Integer(firstCopy),
			  new Double(newIndex));
	  }
	}
	firstCopy++;
      }
    }
    for (int j = 0; j < instance.numAttributes(); j++) {
      //if ((getInputFormat().attribute(j).type() == Attribute.STRING)
      if (m_SelectedRange.isInRange(j)
	  && (instance.isMissing(j) == false)) {
        StringTokenizer st = new StringTokenizer(instance.stringValue(j),
                                                 delimiters);
        while (st.hasMoreTokens()) {
          String word = st.nextToken();
          Integer index = (Integer) m_Dictionary.get(word);
          if (index != null) {
            contained.put(index, new Double(1));
          }
        }
      }
    }

    // Convert the set to structures needed to create a sparse instance.
    double [] values = new double [contained.size()];
    int [] indices = new int [contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
      Integer index = (Integer)it.next();
      Double value = (Double)contained.get(index);
      values[i] = value.doubleValue();
      indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices,
                                       outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    push(inst);
    //System.err.print("#"); System.err.flush();
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain arguments to the filter:
   * use -h for help
   */
  public static void main(String [] argv) {

    try {
      if (Utils.getFlag('b', argv)) {
 	Filter.batchFilterFile(new StringToWordVectorFilter(), argv);
      } else {
	Filter.filterFile(new StringToWordVectorFilter(), argv);
      }
    } catch (Exception ex) {
      ex.printStackTrace();
      log.error(ex.getMessage());
    }
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -