learnabletokenedaffine.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 1,105 行 · 第 1/3 页
JAVA
1,105 行
   * @param _b number log(b)   * @returns log(a+b)   */  protected double logSum(double _logA, double _logB) {    double logSum = 0;    // make logA the smaller of the two    double logA = (_logA < _logB) ? _logA : _logB;    double logB = (_logA < _logB) ? _logB : _logA;        if (logA - logB < -324 || logA == Double.NEGATIVE_INFINITY) {      logSum = logB;    } else {      logSum = logA + Math.log(1 + Math.exp(logB - logA));    }    return logSum;  }  /**   * Calculate affine gapped distance using learned costs   * @param s1 first string   * @param s2 second string   * @return minimum number of deletions/insertions/substitutions to be performed   * to transform s1 into s2 (or vice versa)   */  public double costDistance(String string1, String string2) {    TokenString ts1;    if (m_stringTokenStringMap.containsKey(string1)) {      ts1 = ((TokenString)m_stringTokenStringMap.get(string1));    } else {      ts1 = m_tokenizer.getTokenString(string1);      m_stringTokenStringMap.put(string1, ts1);    }    TokenString ts2;    if (m_stringTokenStringMap.containsKey(string2)) {      ts2 = ((TokenString)m_stringTokenStringMap.get(string2));    } else {      ts2 = m_tokenizer.getTokenString(string2);      m_stringTokenStringMap.put(string2, ts2);    }        int [] s1 = ts1.tokenIDs;     int [] s2 = ts2.tokenIDs;    int l1 = s1.length, l2 = s2.length;    double T[][] = new double[l1+1][l2+1];    double I[][] = new double[l1+1][l2+1];    double D[][] = new double[l1+1][l2+1];    double subCost = 0, subTokenCost = 0, ret;    int i, j;    if (l1==0 || l2==0) {      return m_gapStartCost + (l1+l2-1) * m_gapExtendCost;    }    for (j = 0; j < l2+1; j++) {      I[0][j] = Double.MAX_VALUE;      D[0][j] = Double.MAX_VALUE;    }    for (j = 0; j < l1+1; j++) {      I[j][0] = Double.MAX_VALUE;      D[j][0] = Double.MAX_VALUE;    }    T[0][0] = 0;    T[0][1] = m_gapStartCost;    T[1][0] = m_gapStartCost;    for (j = 2; j < l2+1; j++) {      T[0][j] = T[0][j-1] + m_gapExtendCost;    } 
   for (j = 2; j < l1+1; j++) {      T[j][0] = T[j-1][0] + m_gapExtendCost;    }    for (i = 1; i < l1+1; i++) {      for (j = 1; j < l2+1; j++) {	int t1 = s1[i-1];	int t2 = s2[j-1];	subTokenCost = (t1 == t2) ? m_matchCost : m_nonMatchCost;  // TODO:  experiment with 0 matchCost	if (D[i-1][j]+m_gapExtendCost > T[i-1][j]+m_gapStartCost) {	  D[i][j] = T[i-1][j]+m_gapStartCost;	} else {	  D[i][j] = D[i-1][j]+m_gapExtendCost;	}			if (I[i][j-1]+m_gapExtendCost > T[i][j-1]+m_gapStartCost) {	  I[i][j] = T[i][j-1] + m_gapStartCost;	} else {	  I[i][j] = I[i][j-1] + m_gapExtendCost;	}			//	subCost = m_subCost + sub_charCost;//  	subCost =((c1 == c2) ? 0 : (m_subCost + m_gapEndCost));//  	subCost = subCost + sub_charCost;//    	subCost = (c1 == c2) ? 0 : m_subCost;	//	subCost = m_subCost;			if  ((T[i-1][j-1] + m_subCost < D[i-1][j-1] + m_gapEndCost) &&    /// d[i][j] or d[i-1][j-1]??	     (T[i-1][j-1] + m_subCost < I[i-1][j-1] + m_gapEndCost )) {	  T[i][j] = T[i-1][j-1] + m_subCost + subTokenCost;   // ?? do we add subCharCost?	
} else {	  if (D[i-1][j-1] < I[i-1][j-1]) {	    T[i][j] = D[i-1][j-1] + m_gapEndCost + subTokenCost;	  } else {	    T[i][j] = I[i-1][j-1] + m_gapEndCost + subTokenCost;	  }	}      }    }	    if (T[l1][l2] < D[l1][l2] && T[l1][l2] < I[l1][l2]) {      ret = T[l1][l2];    } else if (D[l1][l2] < I[l1][l2]) {      ret = D[l1][l2];    } else {      ret = I[l1][l2];    }    if (m_normalized) {//        // get the normalization factor as P(x,y)=P(x)P(y)//        double Pxy = 2 * m_gapStartCost; //        for (int k = 0; k < l1; k++) {//  	Pxy += s1[k] + m_gapExtendCost;//        }//        for (int k = 0; k < l2; k++) {//  	Pxy += s2[k] + m_gapExtendCost;//        }//        ret /= Pxy;      ret /= 4*(l1 + l2);    }    return ret;  }  public static void print3dMatrix(double [][][] matrix) {    DecimalFormat fmt = new DecimalFormat ("0.0000E00");    for (int i = 0; i < matrix[0][0].length; i++) {      System.out.println ("\nMatrix[][][" + i + "]");      for (int j = 0; j < matrix[0].length; j++) {	for (int k = 0; k < matrix.length; k++) {	  System.out.print(fmt.format(matrix[k][j][i]) + "\t");	}	System.out.println();      }    }  }  /** Set the distance to be normalized by the sum of the string's lengths   * @param normalized if true, distance is normalized by the sum of string's lengths   */  public void setNormalized(boolean normalized) {    m_normalized = normalized;  }   /** Get whether the distance is normalized by the sum of the string's lengths   * @return if true, distance is normalized by the sum of string's lengths   */  public boolean getNormalized() {    return m_normalized;  }  /** Set the distance to use the generative model or convert back to the additive model   * @param useGenerativeModel if true, the generative model is used   */  public void setUseGenerativeModel(boolean useGenerativeModel) {    m_useGenerativeModel = useGenerativeModel;  }   /** Do we use the generative model or convert back to the additive model?   
* @param useGenerativeModel if true, the generative model is used   */  public boolean getUseGenerativeModel() {    return m_useGenerativeModel;  }   /** Set the clamping probability value   * @param clampProb a lower bound for all probability values to prevent underflow   */  public void setClampProb(double clampProb) {    m_clampProb = clampProb;  }    /** Get the clamping probability value   * @return a lower bound for all probability values to prevent underflow   */  public double getClampProb() {    return m_clampProb;  }  /** Set the number of training iterations   * @param numIterations the number of iterations   */  public void setNumIterations(int numIterations) {    m_numIterations = numIterations;  }   /** Get the number of training iterations   * @return the number of training iterations   */  public int setNumIterations() {    return m_numIterations;  }   /** Create a copy of this metric   * @return another AffineMetric with the same exact parameters as this  metric   */  public Object clone() {    LearnableTokenEDAffine metric = new LearnableTokenEDAffine();    metric.setNormalized(m_normalized);    metric.setTokenizer(m_tokenizer);    System.out.println("Tokenizer: + "+ ((WordTokenizer)m_tokenizer).getStopwordRemoval());    metric.setUseGenerativeModel(m_useGenerativeModel);    metric.setClampProb(m_clampProb);    metric.setNumIterations(m_numIterations);    return metric;  }    /**   * Gets the current settings of WeightedDotP.   *   * @return an array of strings suitable for passing to setOptions()   * TODO!!!!   
*/  public String [] getOptions() {    String [] options = new String [40];    int current = 0;    if (m_normalized) {      options[current++] = "-N";    }    if (m_useGenerativeModel) {      options[current++] = "-G";    } else {      options[current++] = "-A";    }    options[current++] = "-c";    options[current++] = "" + m_clampProb;    options[current++] = "-T";    options[current++] = Utils.removeSubstring(m_tokenizer.getClass().getName(), "weka.deduping.metrics.");    if (m_tokenizer instanceof OptionHandler) {      String[] tokenizerOptions = ((OptionHandler)m_tokenizer).getOptions();      for (int i = 0; i < tokenizerOptions.length; i++) {	options[current++] = tokenizerOptions[i];      }    }    while (current < options.length) {      options[current++] = "";    }    return options;  }  /**   * Parses a given list of options. Valid options are:<p>   *   * -N normalize by length   * -m matchCost   * -s subCost   * -g gapStartCost   * -e gapExtendCost      */  public void setOptions(String[] options) throws Exception {    setNormalized(Utils.getFlag('N', options));    System.out.println("Setting options - BZZZZ!");  }  /**   * Returns an enumeration describing the available options.   *   * @return an enumeration of all the available options.   * TODO!!!   */  public Enumeration listOptions() {    Vector newVector = new Vector(5);    newVector.addElement(new Option("\tNormalize by lengths\n",				    "N", 0, "-N"));        return newVector.elements();  }      /** The computation of a metric can be either based on distance, or on similarity   * @returns true   */  public boolean isDistanceBased() {    return true;  }  /**   * Returns a similarity estimate between two strings. Similarity is obtained by   * inverting the distance value using one of three methods:   * CONVERSION_LAPLACIAN, CONVERSION_EXPONENTIAL, CONVERSION_UNIT.   * @param string1 First string.   * @param string2 Second string.   * @exception Exception if similarity could not be estimated.   
*/
  public double similarity(String string1, String string2) throws Exception {
    // The distance is evaluated only inside the branch that uses it, so an
    // unknown conversion type fails without computing any distance.
    if (m_conversionType == CONVERSION_LAPLACIAN) {
      return 1 / (1 + distance(string1, string2));
    }
    if (m_conversionType == CONVERSION_UNIT) {
      return 2 * (1 - distance(string1, string2));
    }
    if (m_conversionType == CONVERSION_EXPONENTIAL) {
      return Math.exp(-distance(string1, string2));
    }
    throw new Exception ("Unknown distance to similarity conversion method");
  }

  /**
   * Choose how a distance is converted into a similarity. A tag that does
   * not belong to TAGS_CONVERSION leaves the current setting untouched.
   *
   * @param conversionType tag selecting CONVERSION_LAPLACIAN, CONVERSION_UNIT,
   * or CONVERSION_EXPONENTIAL
   */
  public void setConversionType(SelectedTag conversionType) {
    if (conversionType.getTags() != TAGS_CONVERSION) {
      return;
    }
    m_conversionType = conversionType.getSelectedTag().getID();
  }

  /**
   * Report how a distance is converted into a similarity.
   * @return a tag wrapping the current conversion type together with TAGS_CONVERSION
   */
  public SelectedTag getConversionType() {
    SelectedTag current = new SelectedTag(m_conversionType, TAGS_CONVERSION);
    return current;
  }

  /** Replace the tokenizer used to split strings into tokens
   * @param tokenizer the tokenizer to use from now on
   */
  public void setTokenizer(Tokenizer tokenizer) {
    m_tokenizer = tokenizer;
  }

  /** Return the tokenizer used to split strings into tokens
   * @return the tokenizer currently in use
   */
  public Tokenizer getTokenizer() {
    return m_tokenizer;
  }

  /**
   * Small demo: tokenizes the same sentence twice with a WordTokenizer and
   * prints the matrices for the resulting pair of token strings. Any
   * exception is reported as a stack trace.
   * @param args ignored
   */
  public static void main(String[] args) {
    try {
      LearnableTokenEDAffine demoMetric = new LearnableTokenEDAffine();
      // Training is skipped here: demoMetric.trainMetric(new ArrayList());
      Tokenizer wordTokenizer = new WordTokenizer();
      TokenString first = wordTokenizer.getTokenString("Matthew Turk and Alex");
      TokenString second = wordTokenizer.getTokenString("Matthew Turk and Alex");
      demoMetric.printMatrices(first, second);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
learnabletokenedaffine.java - 源码说明

本页面展示了「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」中的 learnabletokenedaffine.java 源码文件，采用 Java 编程语言编写，共 1,105 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与university相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?