📄 stringkernel.java

📁 Java 编写的多种数据挖掘算法包括聚类、分类、预处理等
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
  public int getSubsequenceLength() {    return m_subsequenceLength;  }  /**   * Returns the tip text for this property   *    * @return 		tip text for this property suitable for   * 			displaying in the explorer/experimenter gui   */  public String subsequenceLengthTipText() {    return "The subsequence length.";  }  /**   * Sets the maximum length of the subsequence.   *    * @param value	the maximum length   */  public void setMaxSubsequenceLength(int value) {    m_maxSubsequenceLength = value;  }    /**   * Returns the maximum length of the subsequence   *    * @return		the maximum length   */  public int getMaxSubsequenceLength() {    return m_maxSubsequenceLength;  }  /**   * Returns the tip text for this property   *    * @return 		tip text for this property suitable for   * 			displaying in the explorer/experimenter gui   */  public String maxSubsequenceLengthTipText() {    return "The maximum subsequence length (theta in the paper)";  }    /**   * Sets the lambda constant used in the string kernel   *    * @param value	the lambda value to use   */  public void setLambda(double value) {    m_lambda = value;  }    /**   * Gets the lambda constant used in the string kernel   *    * @return		the current lambda constant   */    public double getLambda() {    return m_lambda;  }  /**   * Returns the tip text for this property   *    * @return 		tip text for this property suitable for   * 			displaying in the explorer/experimenter gui   */  public String lambdaTipText(){    return "Penalizes non-continuous subsequence matches, from (0,1)";  }  /**   * Sets whether to use normalization.   * Each time this value is changed, the kernel cache is cleared.   *   * @param value	whether to use normalization   */  public void setUseNormalization(boolean value) {    if  (value != m_normalize)       clean();        m_normalize = value;  }    /**   * Returns whether normalization is used.   *    * @return		true if normalization is used   */  public boolean getUseNormalization() {    return m_normalize;  }  /**   * Returns the tip text for this property   *    * @return 		tip text for this property suitable for   * 			displaying in the explorer/experimenter gui   */  public String useNormalizationTipText(){    return "Whether to use normalization.";  }  /**   * Computes the result of the kernel function for two instances.   * If id1 == -1, eval use inst1 instead of an instance in the dataset.   *   * @param id1 the index of the first instance in the dataset   * @param id2 the index of the second instance in the dataset   * @param inst1 the instance corresponding to id1 (used if id1 == -1)   * @return the result of the kernel function   * @throws Exception if something goes wrong   */  public double eval(int id1, int id2, Instance inst1) throws Exception {    if (m_Debug && id1>-1 && id2>-1) {      System.err.println("\nEvaluation of string kernel for");      System.err.println(m_data.instance(id1).stringValue(m_strAttr));      System.err.println("and");      System.err.println(m_data.instance(id2).stringValue(m_strAttr));    }    //the normalized kernel returns 1 for comparison of     //two identical strings    if (id1 == id2 && m_normalize)       return 1.0;    double result = 0;    long key = -1;    int location = -1;    // we can only cache if we know the indexes    if ((id1 >= 0) && (m_keys != null)) {      if (id1 > id2) {        key = (long)id1 * m_numInsts + id2;      } else {        key = (long)id2 * m_numInsts + id1;      }      if (key < 0) {        throw new Exception("Cache overflow detected!");      }      location = (int)(key % m_keys.length);      if (m_keys[location] == (key + 1)) {        if (m_Debug)           System.err.println("result (cached): " + m_storage[location]);        return m_storage[location];      }    }    m_kernelEvals++;    long start = System.currentTimeMillis();    Instance inst2 = m_data.instance(id2);    char[] s1 = inst1.stringValue(m_strAttr).toCharArray();    char[] s2 = inst2.stringValue(m_strAttr).toCharArray();    // prevent the kernel from returning NaN    if (s1.length == 0 || s2.length == 0) return 0;    if (m_normalize) {      result = normalizedKernel(s1,s2);    } else {      result = unnormalizedKernel(s1, s2);    }    if (m_Debug) {      long duration = System.currentTimeMillis() - start;      System.err.println("result: " + result);      System.err.println("evaluation time:" + duration +"\n");    }    // store result in cache    if (key != -1){      m_storage[location] = result;      m_keys[location] = (key + 1);    }    return result;  }  /**   * Frees the memory used by the kernel.   * (Useful with kernels which use cache.)   * This function is called when the training is done.   * i.e. after that, eval will be called with id1 == -1.   */  public void clean() {    m_storage = null;    m_keys = null;  }  /**   * Returns the number of kernel evaluation performed.   *   * @return the number of kernel evaluation performed.   */  public int numEvals() {    return m_kernelEvals;  }  /**   * Returns the number of dot product cache hits.   *   * @return the number of dot product cache hits, or -1 if not supported by   * this kernel.   */  public int numCacheHits() {    // TODO: implement!    return -1;  }    /**   * evaluates the normalized kernel between s and t. See [1] for details about   * the normalized SSK.    *   * @param s first input string   * @param t second input string   * @return a double indicating their distance, or similarity   */  public double normalizedKernel(char[] s, char[] t){    double k1 = unnormalizedKernel(s, s);    double k2 = unnormalizedKernel(t, t);    double normTerm = Math.sqrt( k1*k2 );    return unnormalizedKernel(s, t) / normTerm;  }  /**   * evaluates the unnormalized kernel between s and t. See [1] for details   * about the unnormalized SSK.   *   * @param s first input string   * @param t second input string   * @return a double indicating their distance, or similarity   */  public double unnormalizedKernel(char[] s, char[] t){    if (t.length > s.length) {      //swap because the algorithm is faster if s is      //the longer string      char[] buf = s;      s = t;      t = buf;    }    if (m_PruningMethod == PRUNING_NONE) {      m_multX=(s.length+1)*(t.length+1);       m_multY=(t.length+1);       m_multZ=1;      maxCache = m_internalCacheSize;      if (maxCache==0) {	maxCache=(m_subsequenceLength+1)*m_multX;      }      else if ((m_subsequenceLength+1)*m_multX<maxCache) { 	maxCache=(m_subsequenceLength+1)*m_multX;       }      m_useRecursionCache=true;      cachekhK = new int[maxCache];      cachekh2K = new int[maxCache];      cachekh = new double[maxCache];      cachekh2 = new double[maxCache];    } else if (m_PruningMethod == PRUNING_LAMBDA) {      maxCache=0;       m_useRecursionCache=false;    }    double res;    if (m_PruningMethod == PRUNING_LAMBDA) {      res = kernelLP(              m_subsequenceLength,s,s.length-1,t,t.length-1,              m_maxSubsequenceLength);    } else {      res = kernel(              m_subsequenceLength,s,s.length-1, t, t.length-1);    }    cachekh = null;    cachekhK = null;    cachekh2 = null;    cachekh2K = null;        return res;  }  /**   * Recursion-ending function that is called at the end of each    * recursion branch.   *    * @param n   * @return   */  protected double getReturnValue(int n){    if (n == 0) return 1; else return 0;  }  /**   * the kernel function (Kn). This function performs the outer loop   * character-wise over the first input string s. For each character   * encountered, a recursion branch is started that identifies all   * subsequences in t starting with that character. <br> See [1] for details   * but note that this code is optimized and may be hard to recognize.   *    * @param n the current length of the matching subsequence   * @param s first string, as a char array   * @param t second string, as a char array   * @param endIndexS the portion of s currently regarded is s[1:endIndexS]   * @param endIndexT the portion of t currently regarded is t[1:endIndexT]   * @return a double indicating the distance or similarity between s and t,    * according to and depending on the initial value for n.   */  protected double kernel(int n, char[] s,int endIndexS, char[] t,     int endIndexT) {    //normal recursion ending case    if (Math.min(endIndexS+1,endIndexT+1) < n) return getReturnValue(n);    //accumulate all recursion results in one:    double result = 0;    //the tail-recursive function defined in [1] is turned into a    //loop here, preventing stack overflows.    //skim s from back to front    for (int iS=endIndexS; iS > n-2; iS--) {      double buf = 0;      //let the current character in s be x       char x = s[iS];      // iterate over all occurrences of x in t      for (int j=0; j <= endIndexT; j++) {        if (t[j] == x){          //this is a match for the current character, hence          //1. use previous chars in both strings (iS-1, j-1)          //2. decrement the remainingMatchLength (n-1)          //and start a recursion branch for these parameters          buf += kernelHelper(n-1,s,iS-1, t, j-1);        }      }      //ok, all occurrences of x in t have been found      //multiply the result with lambda^2      //  (one lambda for x, and the other for all matches of x in t)      result += buf * m_powersOflambda[2];    }    return result;  }  /**   * The kernel helper function, called K' in [1] and [2].   *   * @param n the current length of the matching subsequence   * @param s first string, as a char array   * @param t second string, as a char array   * @param endIndexS the portion of s currently regarded is s[1:endIndexS]   * @param endIndexT the portion of t currently regarded is t[1:endIndexT]   * @return a partial result for K   */  protected double kernelHelper (int n, char[] s,int endIndexS, char[] t,     int endIndexT) {    //recursion ends if the current subsequence has maximal length,     //which is the case here    if (n <= 0 ) {      return getReturnValue(n);    }    //recursion ends, too, if the current subsequence is shorter than    //maximal length, but there is no chance that it will reach maximal length.    //in this case, normally 0 is returned, but the EXPERIMENTAL     //minSubsequenceLength feature allows shorter subsequence matches    //also to contribute    if (Math.min(endIndexS+1,endIndexT+1) < n) {      return getReturnValue(n);    }    int adr = 0;    if (m_useRecursionCache) {      adr=m_multX*n+m_multY*endIndexS+m_multZ*endIndexT;
💿 文件大小 3872 K
👤 上传用户 fengkuangyidao
📂 所属分类 Applet
🏷️ 相关标签

#Java #编写 #数据挖掘算法 #分类
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -