📄 stringkernel.java
字号:
public int getSubsequenceLength() { return m_subsequenceLength; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String subsequenceLengthTipText() { return "The subsequence length."; } /** * Sets the maximum length of the subsequence. * * @param value the maximum length */ public void setMaxSubsequenceLength(int value) { m_maxSubsequenceLength = value; } /** * Returns the maximum length of the subsequence * * @return the maximum length */ public int getMaxSubsequenceLength() { return m_maxSubsequenceLength; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String maxSubsequenceLengthTipText() { return "The maximum subsequence length (theta in the paper)"; } /** * Sets the lambda constant used in the string kernel * * @param value the lambda value to use */ public void setLambda(double value) { m_lambda = value; } /** * Gets the lambda constant used in the string kernel * * @return the current lambda constant */ public double getLambda() { return m_lambda; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String lambdaTipText(){ return "Penalizes non-continuous subsequence matches, from (0,1)"; } /** * Sets whether to use normalization. * Each time this value is changed, the kernel cache is cleared. * * @param value whether to use normalization */ public void setUseNormalization(boolean value) { if (value != m_normalize) clean(); m_normalize = value; } /** * Returns whether normalization is used. * * @return true if normalization is used */ public boolean getUseNormalization() { return m_normalize; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String useNormalizationTipText(){ return "Whether to use normalization."; } /** * Computes the result of the kernel function for two instances. * If id1 == -1, eval use inst1 instead of an instance in the dataset. * * @param id1 the index of the first instance in the dataset * @param id2 the index of the second instance in the dataset * @param inst1 the instance corresponding to id1 (used if id1 == -1) * @return the result of the kernel function * @throws Exception if something goes wrong */ public double eval(int id1, int id2, Instance inst1) throws Exception { if (m_Debug && id1>-1 && id2>-1) { System.err.println("\nEvaluation of string kernel for"); System.err.println(m_data.instance(id1).stringValue(m_strAttr)); System.err.println("and"); System.err.println(m_data.instance(id2).stringValue(m_strAttr)); } //the normalized kernel returns 1 for comparison of //two identical strings if (id1 == id2 && m_normalize) return 1.0; double result = 0; long key = -1; int location = -1; // we can only cache if we know the indexes if ((id1 >= 0) && (m_keys != null)) { if (id1 > id2) { key = (long)id1 * m_numInsts + id2; } else { key = (long)id2 * m_numInsts + id1; } if (key < 0) { throw new Exception("Cache overflow detected!"); } location = (int)(key % m_keys.length); if (m_keys[location] == (key + 1)) { if (m_Debug) System.err.println("result (cached): " + m_storage[location]); return m_storage[location]; } } m_kernelEvals++; long start = System.currentTimeMillis(); Instance inst2 = m_data.instance(id2); char[] s1 = inst1.stringValue(m_strAttr).toCharArray(); char[] s2 = inst2.stringValue(m_strAttr).toCharArray(); // prevent the kernel from returning NaN if (s1.length == 0 || s2.length == 0) return 0; if (m_normalize) { result = normalizedKernel(s1,s2); } else { result = unnormalizedKernel(s1, s2); } if (m_Debug) { long duration = System.currentTimeMillis() - start; System.err.println("result: " + result); System.err.println("evaluation time:" + duration +"\n"); } // store result in cache if (key != -1){ m_storage[location] = result; m_keys[location] = (key + 1); } return result; } /** * Frees the memory used by the kernel. * (Useful with kernels which use cache.) * This function is called when the training is done. * i.e. after that, eval will be called with id1 == -1. */ public void clean() { m_storage = null; m_keys = null; } /** * Returns the number of kernel evaluation performed. * * @return the number of kernel evaluation performed. */ public int numEvals() { return m_kernelEvals; } /** * Returns the number of dot product cache hits. * * @return the number of dot product cache hits, or -1 if not supported by * this kernel. */ public int numCacheHits() { // TODO: implement! return -1; } /** * evaluates the normalized kernel between s and t. See [1] for details about * the normalized SSK. * * @param s first input string * @param t second input string * @return a double indicating their distance, or similarity */ public double normalizedKernel(char[] s, char[] t){ double k1 = unnormalizedKernel(s, s); double k2 = unnormalizedKernel(t, t); double normTerm = Math.sqrt( k1*k2 ); return unnormalizedKernel(s, t) / normTerm; } /** * evaluates the unnormalized kernel between s and t. See [1] for details * about the unnormalized SSK. * * @param s first input string * @param t second input string * @return a double indicating their distance, or similarity */ public double unnormalizedKernel(char[] s, char[] t){ if (t.length > s.length) { //swap because the algorithm is faster if s is //the longer string char[] buf = s; s = t; t = buf; } if (m_PruningMethod == PRUNING_NONE) { m_multX=(s.length+1)*(t.length+1); m_multY=(t.length+1); m_multZ=1; maxCache = m_internalCacheSize; if (maxCache==0) { maxCache=(m_subsequenceLength+1)*m_multX; } else if ((m_subsequenceLength+1)*m_multX<maxCache) { maxCache=(m_subsequenceLength+1)*m_multX; } m_useRecursionCache=true; cachekhK = new int[maxCache]; cachekh2K = new int[maxCache]; cachekh = new double[maxCache]; cachekh2 = new double[maxCache]; } else if (m_PruningMethod == PRUNING_LAMBDA) { maxCache=0; m_useRecursionCache=false; } double res; if (m_PruningMethod == PRUNING_LAMBDA) { res = kernelLP( m_subsequenceLength,s,s.length-1,t,t.length-1, m_maxSubsequenceLength); } else { res = kernel( m_subsequenceLength,s,s.length-1, t, t.length-1); } cachekh = null; cachekhK = null; cachekh2 = null; cachekh2K = null; return res; } /** * Recursion-ending function that is called at the end of each * recursion branch. * * @param n * @return */ protected double getReturnValue(int n){ if (n == 0) return 1; else return 0; } /** * the kernel function (Kn). This function performs the outer loop * character-wise over the first input string s. For each character * encountered, a recursion branch is started that identifies all * subsequences in t starting with that character. <br> See [1] for details * but note that this code is optimized and may be hard to recognize. * * @param n the current length of the matching subsequence * @param s first string, as a char array * @param t second string, as a char array * @param endIndexS the portion of s currently regarded is s[1:endIndexS] * @param endIndexT the portion of t currently regarded is t[1:endIndexT] * @return a double indicating the distance or similarity between s and t, * according to and depending on the initial value for n. */ protected double kernel(int n, char[] s,int endIndexS, char[] t, int endIndexT) { //normal recursion ending case if (Math.min(endIndexS+1,endIndexT+1) < n) return getReturnValue(n); //accumulate all recursion results in one: double result = 0; //the tail-recursive function defined in [1] is turned into a //loop here, preventing stack overflows. //skim s from back to front for (int iS=endIndexS; iS > n-2; iS--) { double buf = 0; //let the current character in s be x char x = s[iS]; // iterate over all occurrences of x in t for (int j=0; j <= endIndexT; j++) { if (t[j] == x){ //this is a match for the current character, hence //1. use previous chars in both strings (iS-1, j-1) //2. decrement the remainingMatchLength (n-1) //and start a recursion branch for these parameters buf += kernelHelper(n-1,s,iS-1, t, j-1); } } //ok, all occurrences of x in t have been found //multiply the result with lambda^2 // (one lambda for x, and the other for all matches of x in t) result += buf * m_powersOflambda[2]; } return result; } /** * The kernel helper function, called K' in [1] and [2]. * * @param n the current length of the matching subsequence * @param s first string, as a char array * @param t second string, as a char array * @param endIndexS the portion of s currently regarded is s[1:endIndexS] * @param endIndexT the portion of t currently regarded is t[1:endIndexT] * @return a partial result for K */ protected double kernelHelper (int n, char[] s,int endIndexS, char[] t, int endIndexT) { //recursion ends if the current subsequence has maximal length, //which is the case here if (n <= 0 ) { return getReturnValue(n); } //recursion ends, too, if the current subsequence is shorter than //maximal length, but there is no chance that it will reach maximal length. //in this case, normally 0 is returned, but the EXPERIMENTAL //minSubsequenceLength feature allows shorter subsequence matches //also to contribute if (Math.min(endIndexS+1,endIndexT+1) < n) { return getReturnValue(n); } int adr = 0; if (m_useRecursionCache) { adr=m_multX*n+m_multY*endIndexS+m_multZ*endIndexT;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -