📄 stringtowordvector.java

📁 MacroWeka扩展了著名数据挖掘工具weka
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
    for (int j = 0; j < instance.numAttributes(); j++) { 
      //if ((getInputFormat().attribute(j).type() == Attribute.STRING) 
      if (m_SelectedRange.isInRange(j)
	  && (instance.isMissing(j) == false)) {          
        Enumeration st;
        
        if(this.m_onlyAlphabeticTokens==false)
            st = new StringTokenizer(instance.stringValue(j),
                                                 delimiters);
        else
            st = new AlphabeticStringTokenizer(instance.stringValue(j));
        
        while (st.hasMoreElements()) {
          String word = (String)st.nextElement(); 
          if(this.m_lowerCaseTokens==true)
              word = word.toLowerCase();
          Integer index = (Integer) m_Dictionary.get(word);
          if (index != null) {
            if (m_OutputCounts) { // Separate if here rather than two lines down to avoid hashtable lookup
              Double count = (Double)contained.get(index);
              if (count != null) {
                contained.put(index, new Double(count.doubleValue() + 1.0));
              } else {
                contained.put(index, new Double(1));
              }
            } else {
              contained.put(index, new Double(1));
            }                
          }
        }
      }
    }
    
    //Doing TFTransform
    if(m_TFTransform==true) {
        Iterator it = contained.keySet().iterator();
        for(int i=0; it.hasNext(); i++) {
            Integer index = (Integer)it.next();
            if( index.intValue() >= firstCopy ) { 
                double val = ((Double)contained.get(index)).doubleValue();
                val = Math.log(val+1);
//                if(printInstance==true)
//                  System.out.println("Instance 0"+ //instance.toString()+
//                                   ": setting value "+index.intValue()+
//                                   " from "+((Double)contained.get(index)).doubleValue()+
//                                   " to "+val);                
                contained.put(index, new Double(val));
            }
        }
    }
    
    //Doing IDFTransform
    if(m_IDFTransform==true) {
        Iterator it = contained.keySet().iterator();
        for(int i=0; it.hasNext(); i++) {
            Integer index = (Integer)it.next();
            //int num = getInputFormat().numInstances();
            if( index.intValue() >= firstCopy ) {
                double val = ((Double)contained.get(index)).doubleValue();
                val = val*Math.log( numInstances /    //num /
                                    (double)docsCounts[index.intValue()] );
//                if(printInstance==true)
//                  System.out.println("Instance 0"+ //instance.toString()+
//                                   ": "+
//                                   "num: "+numInstances+" index.intValue(): "+index.intValue()+
//                                   " docsCounts: "+this.docsCounts[index.intValue()]+ //"\n"+
//                                   "setting value "+index.intValue()+
//                                   " from "+((Double)contained.get(index)).doubleValue()+
//                                   " to "+val);                
                contained.put(index, new Double(val));
            }
        }        
    }
    
    //Doing length normalization
    if(m_normalizeDocLength==true) {
      if(avgDocLength<0)
        throw new Exception("Error. Average Doc Length not defined yet.");
      
      double sumSq = 0;
      Iterator it = contained.keySet().iterator();
      for(int i=0; it.hasNext(); i++) {
        Integer index = (Integer)it.next();
        if( index.intValue() >= firstCopy ) {
          double val = ((Double)contained.get(index)).doubleValue();
          sumSq += val*val;
        }
      }
      it = contained.keySet().iterator();
      for(int i=0; it.hasNext(); i++) {
        Integer index = (Integer)it.next();
        if( index.intValue() >= firstCopy ) {
          double val = ((Double)contained.get(index)).doubleValue();
          val = val/Math.sqrt(sumSq);
          val = val*avgDocLength;
//                System.out.println("Instance "+instance.toString()+
//                                   ": setting value "+index.intValue()+
//                                   " from "+((Double)contained.get(index)).doubleValue()+
//                                   " to "+val);
          contained.put(index, new Double(val));
        }
      }
    }
    
    
    // Convert the set to structures needed to create a sparse instance.
    double [] values = new double [contained.size()];
    int [] indices = new int [contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
      Integer index = (Integer)it.next();
      Double value = (Double)contained.get(index);
      values[i] = value.doubleValue();
      indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices, 
                                       outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    push(inst);
    
    //System.err.print("#"); System.err.flush();
  }


  /**
   * Converts one input instance into a sparse word-vector instance and
   * appends it to the supplied vector rather than pushing it onto the
   * output queue. The TF and IDF transforms are applied when enabled;
   * document-length normalisation is not performed by this method.
   *
   * @param instance the input instance to convert
   * @param v the vector that receives the converted sparse instance
   * @return the number of input attributes lying outside the selected
   * range; map entries whose index is at or above this value are the ones
   * touched by the TF/IDF transforms
   */
  private int convertInstancewoDocNorm(Instance instance, FastVector v) {

    // Sorted map: output attribute index (Integer) -> value (Double)
    TreeMap contained = new TreeMap();

    // Carry over every attribute that lies outside the selected range
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
      if (m_SelectedRange.isInRange(i)) {
        continue;
      }
      if (getInputFormat().attribute(i).type() == Attribute.STRING) {
        if (instance.isMissing(i)) {
          contained.put(new Integer(firstCopy),
                        new Double(Instance.missingValue()));
        } else {
          // A string value is first registered with the output attribute;
          // the index handed back is what gets stored in the sparse vector.
          if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
            // The first string value of a SparseInstance is not printed,
            // so a dummy value is registered ahead of the real one.
            outputFormatPeek().attribute(firstCopy)
              .addStringValue("Hack to defeat SparseInstance bug");
          }
          int newIndex = outputFormatPeek().attribute(firstCopy)
            .addStringValue(instance.stringValue(i));
          contained.put(new Integer(firstCopy), new Double(newIndex));
        }
      } else if (instance.value(i) != 0.0) {
        // Nominal and numeric attributes are copied directly; zeros are
        // left out of the sparse representation.
        contained.put(new Integer(firstCopy),
                      new Double(instance.value(i)));
      }
      firstCopy++;
    }

    // Tokenize every selected, non-missing attribute and look each token
    // up in the dictionary
    for (int j = 0; j < instance.numAttributes(); j++) {
      if (!m_SelectedRange.isInRange(j) || instance.isMissing(j)) {
        continue;
      }

      Enumeration tokens;
      if (this.m_onlyAlphabeticTokens) {
        tokens = new AlphabeticStringTokenizer(instance.stringValue(j));
      } else {
        tokens = new StringTokenizer(instance.stringValue(j), delimiters);
      }

      while (tokens.hasMoreElements()) {
        String token = (String) tokens.nextElement();
        if (this.m_lowerCaseTokens) {
          token = token.toLowerCase();
        }
        Integer slot = (Integer) m_Dictionary.get(token);
        if (slot == null) {
          // Token is not in the dictionary
          continue;
        }
        // Presence is recorded as 1; the running count is only looked up
        // when counts are requested.
        double occurrences = 1;
        if (m_OutputCounts) {
          Double previous = (Double) contained.get(slot);
          if (previous != null) {
            occurrences = previous.doubleValue() + 1.0;
          }
        }
        contained.put(slot, new Double(occurrences));
      }
    }

    // TF transform: value -> log(value + 1)
    if (m_TFTransform) {
      Iterator tfKeys = contained.keySet().iterator();
      while (tfKeys.hasNext()) {
        Integer key = (Integer) tfKeys.next();
        if (key.intValue() < firstCopy) {
          continue;
        }
        double count = ((Double) contained.get(key)).doubleValue();
        contained.put(key, new Double(Math.log(count + 1)));
      }
    }

    // IDF transform: value -> value * log(numInstances / docsCounts[index])
    if (m_IDFTransform) {
      Iterator idfKeys = contained.keySet().iterator();
      while (idfKeys.hasNext()) {
        Integer key = (Integer) idfKeys.next();
        if (key.intValue() < firstCopy) {
          continue;
        }
        double weight = ((Double) contained.get(key)).doubleValue();
        weight = weight * Math.log( numInstances /
                                    (double) docsCounts[key.intValue()] );
        contained.put(key, new Double(weight));
      }
    }

    // Unpack the sorted map into the parallel arrays a sparse instance needs
    int size = contained.size();
    double [] values = new double [size];
    int [] indices = new int [size];
    int pos = 0;
    for (Iterator keys = contained.keySet().iterator(); keys.hasNext(); pos++) {
      Integer key = (Integer) keys.next();
      indices[pos] = key.intValue();
      values[pos] = ((Double) contained.get(key)).doubleValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices,
                                       outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);

    return firstCopy;
  }
  
  
  /**
   * Command-line entry point for testing this class. Runs the filter in
   * batch mode when the -b flag is present and in plain mode otherwise.
   * Any exception is reported by printing its stack trace followed by its
   * message.
   *
   * @param argv should contain arguments to the filter: 
   * use -h for help
   */
  public static void main(String [] argv) {

    try {
      // The flag is read before the filter is created
      boolean batchMode = Utils.getFlag('b', argv);
      StringToWordVector filter = new StringToWordVector();
      if (batchMode) {
        Filter.batchFilterFile(filter, argv);
      } else {
        Filter.filterFile(filter, argv);
      }
    } catch (Exception ex) {
      ex.printStackTrace();
      System.out.println(ex.getMessage());
    }
  }
  
  
  
  /**
   * Tokenizer that splits a string into maximal runs of ASCII letters
   * (a-z, A-Z). Every other character acts as a delimiter and is never
   * part of a returned token.
   */
  private class AlphabeticStringTokenizer implements Enumeration {
      /** The characters of the string being tokenized. */
      private char[] str;
      /** Position in str from which the search for the next token starts. */
      int currentPos=0;
      
      /**
       * Creates a tokenizer over the given string.
       *
       * @param toTokenize the string to split into alphabetic tokens
       */
      public AlphabeticStringTokenizer(String toTokenize) {
          str = toTokenize.toCharArray();
      }
      
      /**
       * Tells whether c is an ASCII letter.
       *
       * @param c the character to test
       * @return true if c lies in a-z or A-Z
       */
      private boolean isAsciiLetter(char c) {
          return (c>='a' && c<='z') || (c>='A' && c<='Z');
      }
      
      /**
       * Finds the first letter at or after the given position.
       *
       * @param pos the position to start searching from
       * @return the index of the first ASCII letter at or after pos, or
       * str.length if there is none
       */
      private int skipNonLetters(int pos) {
          while( pos < str.length && !isAsciiLetter(str[pos]) ) {
              pos++;
          }
          return pos;
      }
      
      /**
       * Advances past any delimiters and reports whether a token follows.
       *
       * @return true if at least one more token is available
       */
      public boolean hasMoreElements() {
          currentPos = skipNonLetters(currentPos);
          // After skipping we are either on a letter or at the end
          return currentPos < str.length;
      }
      
      /**
       * Returns the next run of letters, skipping any delimiters in front
       * of it. The skip used to be a no-op (its condition could never be
       * true), so calling this without hasMoreElements() first returned
       * empty strings instead of tokens.
       *
       * @return the next token as a String
       * @throws NoSuchElementException if no letters remain
       */
      public Object nextElement() {
          int beginpos = skipNonLetters(currentPos);
          int endpos = beginpos;
          currentPos = beginpos;
          
          if(beginpos>=str.length)
              throw new NoSuchElementException("no more tokens present");
          
          while( endpos < str.length && isAsciiLetter(str[endpos]) ) {
              endpos++;
          }
          
          String s = new String(str, beginpos, endpos-beginpos);
          currentPos = endpos;
          return s;
      }      
  }
  
}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -