📄 stringtowordvector.java
字号:
}
/**
 * Returns whether tokens are built solely from contiguous runs of
 * alphabetic characters. When this is enabled the delimiter string
 * is ignored during tokenization.
 *
 * @return true if only alphabetic token formation is enabled.
 */
public boolean getOnlyAlphabeticTokens() {
  return this.m_onlyAlphabeticTokens;
}
/** Sets whether if tokens are to be formed only from contiguous alphabetic
 * character sequences. The delimiter string is ignored if this option is
 * set to true.
 *
 * @param tokenizeOnlyAlphabeticSequences should be set to true if only
 * alphabetic tokens should be formed.
 */
public void setOnlyAlphabeticTokens(boolean tokenizeOnlyAlphabeticSequences) {
m_onlyAlphabeticTokens = tokenizeOnlyAlphabeticSequences;
}
/**
 * Returns the tip text for the onlyAlphabeticTokens property, for
 * display in the explorer/experimenter GUI.
 *
 * @return the tip text describing this property.
 */
public String onlyAlphabeticTokensTipText() {
  final String tip =
      "Sets whether if the word tokens are to be formed only from "
      + "contiguous alphabetic sequences (The delimiter string is "
      + "ignored if this option is set to true).";
  return tip;
}
/**
 * Returns whether word tokens are converted to lower case before being
 * added to the dictionary.
 *
 * @return true if tokens are downcased.
 */
public boolean getLowerCaseTokens() {
  return m_lowerCaseTokens;
}
/**
 * Sets whether word tokens are converted to lower case. Non-alphabetic
 * characters within a token are unaffected.
 *
 * @param downCaseTokens true if only lower case tokens should be formed.
 */
public void setLowerCaseTokens(boolean downCaseTokens) {
  m_lowerCaseTokens = downCaseTokens;
}
/**
 * Returns the tip text for the lowerCaseTokens property, for display in
 * the explorer/experimenter GUI.
 *
 * @return the tip text describing this property.
 */
public String lowerCaseTokensTipText() {
  final String tip =
      "If set then all the word tokens are converted to lower case "
      + "before being added to the dictionary.";
  return tip;
}
/**
 * Returns whether words appearing on the stoplist are ignored
 * (the stoplist lives in weka.core.Stopwords).
 *
 * @return true if stoplist words are ignored.
 */
public boolean getUseStoplist() {
  return this.m_useStoplist;
}
/**
 * Sets whether words appearing on the stoplist are ignored
 * (the stoplist lives in weka.core.Stopwords).
 *
 * @param useStoplist true if stoplist words should be ignored.
 */
public void setUseStoplist(boolean useStoplist) {
  this.m_useStoplist = useStoplist;
}
/**
 * Returns the tip text for the useStoplist property, for display in the
 * explorer/experimenter GUI.
 *
 * @return the tip text describing this property.
 */
public String useStoplistTipText() {
  final String tip =
      "Ignores all the words that are on the stoplist, if set to true.";
  return tip;
}
/**
 * Sorts the given array in ascending order, in place, using shellsort
 * with the 3h+1 increment sequence.
 *
 * BUGFIX: the previous version was a direct translation of a 1-indexed
 * shellsort (inner loop started at i = h + 1 with guard j > h), which
 * never read or wrote array[0] — the first element was excluded from the
 * sort entirely. That could yield a wrong pruning threshold in
 * determineDictionary() when array.length == m_WordsToKeep. The loop
 * bounds below (i starts at h, guard j >= h) cover index 0.
 *
 * @param array the array to sort; modified in place.
 */
private static void sortArray(int [] array) {
  int i, j, h, N = array.length - 1;
  // Find the largest increment in the sequence 1, 4, 13, 40, ... <= N/9.
  for (h = 1; h <= N / 9; h = 3 * h + 1);
  for (; h > 0; h /= 3) {
    // h-sorted insertion pass over indices 0..N.
    for (i = h; i <= N; i++) {
      int v = array[i];
      j = i;
      while (j >= h && array[j - h] > v) {
        array[j] = array[j - h];
        j -= h;
      }
      array[j] = v;
    }
  }
}
/**
 * Determines the set of attributes to convert. If no range has been
 * selected yet, defaults to all string attributes; in either case the
 * final selection is restricted to string attributes only, so the user
 * cannot convert non-string fields.
 */
private void determineSelectedRange() {
  Instances inputFormat = getInputFormat();
  // No explicit selection: default to every string attribute.
  if (m_SelectedRange == null) {
    StringBuffer defaultFields = new StringBuffer();
    for (int i = 0; i < inputFormat.numAttributes(); i++) {
      if (inputFormat.attribute(i).type() == Attribute.STRING) {
        defaultFields.append((i + 1) + ",");
      }
    }
    m_SelectedRange = new Range(defaultFields.toString());
  }
  m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
  // Intersect the selection with the string attributes.
  StringBuffer stringFields = new StringBuffer();
  for (int i = 0; i < inputFormat.numAttributes(); i++) {
    if (m_SelectedRange.isInRange(i)
        && inputFormat.attribute(i).type() == Attribute.STRING) {
      stringFields.append((i + 1) + ",");
    }
  }
  m_SelectedRange.setRanges(stringFields.toString());
  m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
}
/**
 * Builds one word dictionary per class value from the selected string
 * attributes of the training data, prunes each to (roughly) the
 * m_WordsToKeep most frequent words, creates one output attribute per
 * surviving word, records per-word document counts, and sets the
 * filter's output format.
 *
 * BUGFIX: the per-document word set previously used
 * Hashtable.contains(word), which searches *values* (here Integer
 * counters), not keys — the test was always false and every token
 * re-inserted its key. Replaced with containsKey(word), the intended
 * check. The resulting key set (and hence docCount) is unchanged.
 */
private void determineDictionary() {
  // One dictionary per class value; a single one if there is no class.
  int classInd = getInputFormat().classIndex();
  int values = 1;
  if (classInd != -1) {
    values = getInputFormat().attribute(classInd).numValues();
  }
  TreeMap [] dictionaryArr = new TreeMap[values];
  for (int i = 0; i < values; i++) {
    dictionaryArr[i] = new TreeMap();
  }
  // Make sure we know which fields to convert.
  determineSelectedRange();
  // Tokenize all training text into an ordered map of "words".
  for (int i = 0; i < getInputFormat().numInstances(); i++) {
    Instance instance = getInputFormat().instance(i);
    int vInd = 0;
    if (classInd != -1) {
      vInd = (int)instance.classValue();
    }
    // Words seen in this instance (document), for document counts.
    Hashtable h = new Hashtable();
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
        // Choose the tokenizer according to the alphabetic-only option.
        Enumeration st;
        if(this.m_onlyAlphabeticTokens==false)
          st = new StringTokenizer(instance.stringValue(j),
                                   delimiters);
        else
          st = new AlphabeticStringTokenizer(instance.stringValue(j));
        while (st.hasMoreElements()) {
          String word = ((String)st.nextElement()).intern();
          if(this.m_lowerCaseTokens==true)
            word = word.toLowerCase();
          if(this.m_useStoplist==true)
            if(weka.core.Stopwords.isStopword(word))
              continue;
          // Record that this document contains the word (key test, not
          // the value search that Hashtable.contains() performs).
          if(!(h.containsKey(word)))
            h.put(word, new Integer(0));
          // Bump the word's term frequency for this class's dictionary.
          Count count = (Count)dictionaryArr[vInd].get(word);
          if (count == null) {
            dictionaryArr[vInd].put(word, new Count(1));
          } else {
            count.count ++;
          }
        }
      }
    }
    // Update docCount for each word that occurred in this document.
    Enumeration e = h.keys();
    while(e.hasMoreElements()) {
      String word = (String) e.nextElement();
      Count c = (Count)dictionaryArr[vInd].get(word);
      if(c!=null) {
        c.docCount++;
      }
      else
        System.err.println("Warning: A word should definitely be in the "+
                           "dictionary.Please check the code");
    }
  }
  // Compute, per class, the minimum term frequency a word must reach to
  // be kept (so that about m_WordsToKeep words survive).
  int totalsize = 0;
  int prune[] = new int[values];
  for (int z = 0; z < values; z++) {
    totalsize += dictionaryArr[z].size();
    int array[] = new int[dictionaryArr[z].size()];
    int pos = 0;
    Iterator it = dictionaryArr[z].keySet().iterator();
    while (it.hasNext()) {
      String word = (String)it.next();
      Count count = (Count)dictionaryArr[z].get(word);
      array[pos] = count.count;
      pos++;
    }
    // Sort the frequencies to find the keep threshold.
    sortArray(array);
    if (array.length < m_WordsToKeep) {
      // If there aren't enough words, set the threshold to 1.
      prune[z] = 1;
    } else {
      // Otherwise set it to be at least 1.
      prune[z] = Math.max(1, array[array.length - m_WordsToKeep]);
    }
  }
  // Convert the dictionary into an attribute index and create one
  // attribute per word.
  FastVector attributes = new FastVector(totalsize +
                                         getInputFormat().numAttributes());
  // Add the non-converted attributes, remembering the new class index.
  int classIndex = -1;
  for (int i = 0; i < getInputFormat().numAttributes(); i++) {
    if (!m_SelectedRange.isInRange(i)) {
      if (getInputFormat().classIndex() == i) {
        classIndex = attributes.size();
      }
      attributes.addElement(getInputFormat().attribute(i).copy());
    }
  }
  // Add the word vector attributes: each word kept in any class gets one
  // attribute (shared across classes).
  TreeMap newDictionary = new TreeMap();
  int index = attributes.size();
  for(int z = 0; z < values; z++) {
    Iterator it = dictionaryArr[z].keySet().iterator();
    while (it.hasNext()) {
      String word = (String)it.next();
      Count count = (Count)dictionaryArr[z].get(word);
      if (count.count >= prune[z]) {
        if(newDictionary.get(word) == null) {
          newDictionary.put(word, new Integer(index++));
          attributes.addElement(new Attribute(m_Prefix + word));
        }
      }
    }
  }
  // Aggregate document counts over all classes, indexed by attribute.
  docsCounts = new int[attributes.size()];
  Iterator it = newDictionary.keySet().iterator();
  while(it.hasNext()) {
    String word = (String) it.next();
    int idx = ((Integer)newDictionary.get(word)).intValue();
    int docsCount=0;
    for(int j=0; j<values; j++) {
      Count c = (Count) dictionaryArr[j].get(word);
      if(c!=null)
        docsCount += c.docCount;
    }
    docsCounts[idx]=docsCount;
  }
  attributes.trimToSize();
  m_Dictionary = newDictionary;
  numInstances = getInputFormat().numInstances();
  // Set the filter's output format.
  Instances outputFormat = new Instances(getInputFormat().relationName(),
                                         attributes, 0);
  outputFormat.setClassIndex(classIndex);
  setOutputFormat(outputFormat);
}
private void convertInstance(Instance instance) throws Exception {
// Convert the instance into a sorted set of indexes
TreeMap contained = new TreeMap();
// Copy all non-converted attributes from input to output
int firstCopy = 0;
for (int i = 0; i < getInputFormat().numAttributes(); i++) {
if (!m_SelectedRange.isInRange(i)) {
if (getInputFormat().attribute(i).type() != Attribute.STRING) {
// Add simple nominal and numeric attributes directly
if (instance.value(i) != 0.0) {
contained.put(new Integer(firstCopy),
new Double(instance.value(i)));
}
} else {
if (instance.isMissing(i)) {
contained.put(new Integer(firstCopy),
new Double(Instance.missingValue()));
} else {
// If this is a string attribute, we have to first add
// this value to the range of possible values, then add
// its new internal index.
if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
// Note that the first string value in a
// SparseInstance doesn't get printed.
outputFormatPeek().attribute(firstCopy)
.addStringValue("Hack to defeat SparseInstance bug");
}
int newIndex = outputFormatPeek().attribute(firstCopy)
.addStringValue(instance.stringValue(i));
contained.put(new Integer(firstCopy),
new Double(newIndex));
}
}
firstCopy++;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -