
📄 affineprobmetric.java

📁 wekaUT is a collection of semi-supervised learning classifiers built on Weka, developed at the University of Texas at Austin
💻 JAVA
📖 Page 1 of 3
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    AffineProbMetric.java
 *    Copyright (C) 2002-3 Mikhail Bilenko
 *
 */

package weka.deduping.metrics;

import java.text.*;
import java.io.*;
import java.util.*;

import weka.core.*;
import weka.deduping.*;

/** AffineProbMetric class implements a probabilistic model string edit distance with affine-cost gaps
 *
 *   @author Mikhail Bilenko <mbilenko@cs.utexas.edu>
 *   @version 1.1
 **/
public class AffineProbMetric extends StringMetric implements LearnableStringMetric, Serializable, OptionHandler {

  /* Current probabilities, log-probabilities and accumulated expectations
     for each edit operation */
  protected double [][] m_editopProbs;
  protected double [][] m_editopLogProbs;
  protected double [][] m_editopOccs;

  /** Parameters for the generative model */
  protected double m_noopProb, m_noopLogProb, m_noopOccs;                // matching
  protected double m_endAtSubProb, m_endAtSubLogProb, m_endAtSubOccs;    // ending the alignment at M state
  protected double m_endAtGapProb, m_endAtGapLogProb, m_endAtGapOccs;    // ending the alignment at D/I states
  protected double m_gapStartProb, m_gapStartLogProb, m_gapStartOccs;    // starting a gap in the alignment
  protected double m_gapExtendProb, m_gapExtendLogProb, m_gapExtendOccs; // extending a gap in the alignment
  protected double m_gapEndProb, m_gapEndLogProb, m_gapEndOccs;          // ending a gap in the alignment
  protected double m_subProb, m_subLogProb, m_subOccs;                   // continuing to match/substitute in state M

  /** Parameters for the additive model, obtained from log-probs to speed up
      computations in the "testing" phase after weights have been learned */
  protected double [][] m_editopCosts;
  protected double m_noopCost;
  protected double m_endAtSubCost;
  protected double m_endAtGapCost;
  protected double m_gapStartCost;
  protected double m_gapExtendCost;
  protected double m_gapEndCost;
  protected double m_subCost;

  /** True if we are using a generative model for distance in the "testing" phase after learning the parameters.
      By default we want to use the additive model that uses probabilities converted to costs */
  protected boolean m_useGenerativeModel = false;

  /** Maximum number of iterations for training the model; usually converges in <10 iterations */
  protected int m_numIterations = 20;

  /** Normalization of edit distance by string length; equivalent to
      using the posterior probability in the generative model */
  protected boolean m_normalized = true;

  /** Minimal value of a probability parameter.  Particularly important when
      training sets are small to prevent zero probabilities. */
  protected double m_clampProb = 1e-5;

  /** A handy constant for insertions/deletions; we treat them as substitution with a null character */
  protected final char blank = 0;

  /** TODO:  given a corpus, populate this array with the characters that are actually encountered */
  protected char [] m_usedChars = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
                                   'q','r','s','t','u','v','w','x','y','z',' ','!','\"','#','$','%',
                                   '&','\'','(',')','*','+',',','-','.','/','0','1','2','3','4','5','6',
                                   '7','8','9',':',';','<','=','>','?','@','[','\\',']','^','_','`','{',
                                   '|','}','~'};

  /** We can have different ways of converting from distance to similarity */
  public static final int CONVERSION_LAPLACIAN = 1;
  public static final int CONVERSION_UNIT = 2;
  public static final int CONVERSION_EXPONENTIAL = 4;
  public static final Tag[] TAGS_CONVERSION = {
    new Tag(CONVERSION_UNIT, "similarity = 1-distance"),
    new Tag(CONVERSION_LAPLACIAN, "similarity=1/(1+distance)"),
    new Tag(CONVERSION_EXPONENTIAL, "similarity=exp(-distance)")
  };

  /** The method of converting, by default exponential */
  protected int m_conversionType = CONVERSION_EXPONENTIAL;

  protected boolean m_verbose = false;

  /**
   * Set up an instance of AffineProbMetric
   */
  public AffineProbMetric () {
    m_editopProbs = new double[128][128];
    m_editopLogProbs = new double[128][128];
    m_editopOccs = new double[128][128];
    m_editopCosts = new double[128][128];
    initProbs();
    normalizeEmissionProbs();
    normalizeTransitionProbs();
    updateLogProbs();
    initCosts();
  }

  /**
   * Calculate the forward matrices
   * @param _s1 first string
   * @param _s2 second string
   * @return matrix such that m_endAtSubProb*matrix[l1][l2][0] + m_endAtGapProb*(matrix[l1][l2][1] +
   * matrix[l1][l2][2]) contains the distance value
   */
  protected double[][][] forward (String _s1, String _s2) {
    char [] s1 = _s1.toCharArray();
    char [] s2 = _s2.toCharArray();
    int l1 = s1.length, l2 = s2.length;
    double matrix[][][] = new double[l1 + 1][l2 + 1][3];
    double tmpLog, subProb, tmpLog1;

    // initialization
    for (int i = 0; i <= l1; i++)
      matrix[i][0][0] = matrix[i][0][1] = matrix[i][0][2] = Double.NEGATIVE_INFINITY;
    for (int j = 1; j <= l2; j++)
      matrix[0][j][0] = matrix[0][j][1] = matrix[0][j][2] = Double.NEGATIVE_INFINITY;
    matrix[0][0][0] = 0;

    // border rows
    for (int j = 1; j <= l2; j++) {
      tmpLog = logSum(m_gapExtendLogProb + matrix[0][j-1][2], m_gapStartLogProb + matrix[0][j-1][0]);
      matrix[0][j][2] = m_editopLogProbs[blank][s2[j-1]] + tmpLog;
    }
    for (int i = 1; i <= l1; i++) {
      tmpLog = logSum(m_gapStartLogProb + matrix[i-1][0][0], m_gapExtendLogProb + matrix[i-1][0][1]);
      matrix[i][0][1] = m_editopLogProbs[blank][s1[i-1]] + tmpLog;
    }

    // the rest
    for (int i = 1; i <= l1; i++) {
      for (int j = 1; j <= l2; j++) {
        tmpLog = logSum(m_gapStartLogProb + matrix[i-1][j][0], m_gapExtendLogProb + matrix[i-1][j][1]);
        matrix[i][j][1] = m_editopLogProbs[blank][s1[i-1]] + tmpLog;

        tmpLog = logSum(m_gapExtendLogProb + matrix[i][j-1][2], m_gapStartLogProb + matrix[i][j-1][0]);
        matrix[i][j][2] = m_editopLogProbs[blank][s2[j-1]] + tmpLog;

        subProb = ((s1[i-1] == s2[j-1]) ? m_noopLogProb : m_editopLogProbs[s1[i-1]][s2[j-1]]);
        tmpLog1 = logSum(m_subLogProb + matrix[i-1][j-1][0], m_gapEndLogProb + matrix[i-1][j-1][2]);
        tmpLog = logSum(m_gapEndLogProb + matrix[i-1][j-1][1], tmpLog1);
        matrix[i][j][0] = subProb + tmpLog;
      }
    }
    return matrix;
  }

  /**
   * Calculate the backward matrices
   * @param _s1 first string
   * @param _s2 second string
   * @return matrix such that matrix[0][0][0] contains the distance value
   */
  protected double[][][] backward (String _s1, String _s2) {
    char [] s1 = _s1.toCharArray();
    char [] s2 = _s2.toCharArray();
    int l1 = s1.length, l2 = s2.length;
    double matrix[][][] = new double[l1 + 1][l2 + 1][3];
    double sub_pairProb, del_charProb, ins_charProb, tmpLog;

    // initialize
    for (int i = 0; i <= l1; i++)
      matrix[i][l2][0] = matrix[i][l2][1] = matrix[i][l2][2] = Double.NEGATIVE_INFINITY;
    for (int j = 0; j <= l2; j++)
      matrix[l1][j][0] = matrix[l1][j][1] = matrix[l1][j][2] = Double.NEGATIVE_INFINITY;
    matrix[l1][l2][0] = m_endAtSubLogProb;
    matrix[l1][l2][1] = matrix[l1][l2][2] = m_endAtGapLogProb;

    // border rows
    for (int i = l1-1; i >= 0; i--) {
      matrix[i][l2][0] = m_editopLogProbs[blank][s1[i]] + m_gapStartLogProb + matrix[i+1][l2][1];
      matrix[i][l2][1] = m_editopLogProbs[blank][s1[i]] + m_gapExtendLogProb + matrix[i+1][l2][1];
    }
    for (int j = l2-1; j >= 0; j--) {
      matrix[l1][j][0] = m_editopLogProbs[blank][s2[j]] + m_gapStartLogProb + matrix[l1][j+1][2];
      matrix[l1][j][2] = m_editopLogProbs[blank][s2[j]] + m_gapExtendLogProb + matrix[l1][j+1][2];
    }

    // fill the rest of the matrix
    for (int i = l1-1; i >= 0; i--) {
      for (int j = l2-1; j >= 0; j--) {
        ins_charProb = m_editopLogProbs[blank][s1[i]];
        del_charProb = m_editopLogProbs[blank][s2[j]];
        sub_pairProb = ((s1[i] == s2[j]) ? m_noopLogProb : m_editopLogProbs[s1[i]][s2[j]]);

        matrix[i][j][1] = logSum(ins_charProb + m_gapExtendLogProb + matrix[i+1][j][1],
                                 sub_pairProb + m_gapEndLogProb + matrix[i+1][j+1][0]);
        matrix[i][j][2] = logSum(del_charProb + m_gapExtendLogProb + matrix[i][j+1][2],
                                 sub_pairProb + m_gapEndLogProb + matrix[i+1][j+1][0]);
        tmpLog = logSum(ins_charProb + matrix[i+1][j][1], del_charProb + matrix[i][j+1][2]);
        matrix[i][j][0] = logSum(sub_pairProb + m_subLogProb + matrix[i+1][j+1][0],
                                 m_gapStartLogProb + tmpLog);
      }
    }
    return matrix;
  }

  /**
   * Print out the three matrices
   */
  public void printMatrices(String s1, String s2) {
    double[][][] forward = forward(s1, s2);
    double[][][] backward = backward(s1, s2);
    int l1 = s1.length(), l2 = s2.length();
    double totalForward = logSum(m_endAtSubLogProb + forward[l1][l2][0], m_endAtGapLogProb + forward[l1][l2][1]);
    totalForward = logSum(totalForward, m_endAtGapLogProb + forward[l1][l2][2]);
    System.out.println("\nB:" + backward[0][0][0] + "\tF:" + totalForward);

    System.out.println("\n***FORWARD***\nSUBSTITUTION:");
    printAlignmentMatrix(s1, s2, 0, forward);
    System.out.println("\n\nDELETION:");
    printAlignmentMatrix(s1, s2, 1, forward);
    System.out.println("\n\nINSERTION:");
    printAlignmentMatrix(s1, s2, 2, forward);

    System.out.println("\n***BACKWARD***\nSUBSTITUTION:");
    printAlignmentMatrix(s1, s2, 0, backward);
    System.out.println("\n\nDELETION:");
    printAlignmentMatrix(s1, s2, 1, backward);
    System.out.println("\n\nINSERTION:");
    printAlignmentMatrix(s1, s2, 2, backward);
  }

  public void printAlignmentMatrix(String _s1, String _s2, int idx, double[][][] matrix) {
    DecimalFormat fmt = new DecimalFormat ("0.000");
    char[] s1 = _s1.toCharArray();
    char[] s2 = _s2.toCharArray();

    // header row: characters of the second string label the columns
    System.out.print('\t');
    for (int i = 0; i < s2.length; i++) {
      System.out.print("\t" + s2[i]);
    }
    System.out.println();
    for (int i = 0; i < matrix.length; i++) {
      if (i > 0) System.out.print(s1[i-1] + "\t");  else System.out.print("\t");
      for (int j = 0; j < matrix[i].length; j++) {
        System.out.print(fmt.format(matrix[i][j][idx]) + "\t");
      }
      System.out.println();
    }
  }

  /**
   * Print out some data in case things go wrong
   */
  protected void printOpProbs() {
    System.out.println("extend_gap_op.prob=" + m_gapExtendProb +
                       "  end_gap_op.prob=" + m_gapEndProb +
                       "  subst_op.prob=" + m_subProb);
  }

  /**
   * Train the distance parameters from provided examples using EM
   * @param pairList a list of StringPair's; each member contains two fields that are
   * either matching (positive) or non-matching (negative)
   */
  public void trainMetric (ArrayList pairList) throws Exception {
    initProbs();
    recordCosts(0);

    // convert the training data to lower case
    for (int j = 0; j < pairList.size(); j++) {
      StringPair pair = (StringPair)pairList.get(j);
      pair.str1 = pair.str1.toLowerCase();
      pair.str2 = pair.str2.toLowerCase();
    }

    try {
      // dump out the current probabilities
      PrintWriter out = new PrintWriter(new FileWriter("/tmp/probs1"));
      double totalProb = 0;
      double prevTotalProb = -Double.MAX_VALUE;

      for (int i = 1; i <= m_numIterations && Math.abs(totalProb - prevTotalProb) > 1; i++) {
        resetOccurrences();
        out.println(i + "\t" + m_endAtSubProb + "\t" + m_subProb + "\t" + m_gapStartProb +
                    "\t" + m_endAtGapProb + "\t" + m_gapEndProb + "\t" + m_gapExtendProb + "\t" + m_noopProb);

        // go through positives
        prevTotalProb = totalProb;
        totalProb = 0;
        for (int j = 0; j < pairList.size(); j++) {
          StringPair pair = (StringPair)pairList.get(j);
          if (pair.positive) {
            totalProb += expectationStep (pair.str1, pair.str2, 1, true);
          }
        }
        // go through negatives  - TODO - discriminative training
        //	    for (int j = 0; j < negExamples.length; j++)
        //		expectationStep (negExamples[j][1], negExamples[j][0], 1, false);

        System.out.println(i + ". Total likelihood=" + totalProb + ";  prev=" + prevTotalProb);
        System.out.println("************ Accumulated expectations ******************** ");
        System.out.println("End_s=" + m_endAtSubOccs + "\tSub=" + m_subOccs + "\tStGap=" + m_gapStartOccs +
                           "\nEnd_g=" + m_endAtGapOccs + "\tEndGap=" + m_gapEndOccs + " ContGap=" + m_gapExtendOccs +
                           "\nNoop=" + m_noopOccs);
        System.out.println("********************************");

        maximizationStep ();
      }
      out.close();
    } catch (Exception e) { e.printStackTrace(); }

    initCosts();
    recordCosts(1);
  }

  /**
   * Expectation part of the EM algorithm:
   * accumulates expectations of editop probabilities over example pairs.
   * Expectation is calculated based on two examples which are either duplicates (pos=true)
   * or non-duplicates (pos=false).  Lambda is a weighting parameter, 1 by default.
   * @param _s1 first string
   * @param _s2 second string
   * @param lambda learning rate parameter, 1 by default
   * @param pos_training true if strings are matched, false if mismatched
   */
  protected double expectationStep (String _s1, String _s2, int lambda, boolean pos_training) {
    int l1 = _s1.length(), l2 = _s2.length();
    if (l1 == 0 || l2 == 0) {
      return 0;
    }
    char [] s1 = _s1.toCharArray();
    char [] s2 = _s2.toCharArray();
    double fMatrix[][][] = forward (_s1, _s2);
    double bMatrix[][][] = backward (_s1, _s2);
    double stringProb = bMatrix[0][0][0];
    //  NB: b[0][0][0] must be equal to endAtSub*f[l1][l2][0] + endAtGap*(f[l1][l2][1]+f[l1][l2][2]); uncomment below for sanity check
    //  double totalForward = logSum(m_endAtSubLogProb + fMatrix[l1][l2][0], m_endAtGapLogProb + fMatrix[l1][l2][1]);
    //  totalForward = logSum(totalForward, m_endAtGapLogProb + fMatrix[l1][l2][2]);
    //  System.out.println("b:" + bMatrix[0][0][0] + "\tf:" + totalForward);

    double occsSubst, occsStartGap_1, occsStartGap_2, occsExtendGap_1, occsExtendGap_2;
    double occsEndGap_1, occsEndGap_2;
    double sub_pairProb, ins_charProb, del_charProb;
    char s1_i, s2_j;

    if (stringProb == 0.0) {
      System.out.println("TROUBLE!!!!  s1=" + _s1 + " s2=" + _s2);
      printMatrices(_s1, _s2);
      return 0;
    }

    m_endAtSubOccs += lambda;
    m_endAtGapOccs += 2*lambda;

    for (int i = 1; i < l1; i++) {
      for (int j = 1; j < l2; j++) {
        s1_i = s1[i-1];
        s2_j = s2[j-1];
        if (s1_i == s2_j) {
          sub_pairProb = m_noopLogProb;
        } else {
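The listing breaks off inside expectationStep (this is only page 1 of 3). Throughout forward() and backward() the recurrences are computed in log space via a logSum helper whose definition falls outside this excerpt. Below is a minimal sketch, under the assumption that logSum(a, b) returns log(exp(a) + exp(b)) with Double.NEGATIVE_INFINITY standing in for probability zero; it is an illustration of the log-sum-exp trick, not the actual wekaUT implementation.

// Hypothetical sketch of the logSum helper assumed by forward()/backward() above:
// computes log(exp(a) + exp(b)) without underflow/overflow, treating
// Double.NEGATIVE_INFINITY as log(0) for unreachable alignment states.
public class LogSumSketch {

  static double logSum(double a, double b) {
    if (a == Double.NEGATIVE_INFINITY) return b;   // exp(a) == 0
    if (b == Double.NEGATIVE_INFINITY) return a;   // exp(b) == 0
    double max = Math.max(a, b);
    double min = Math.min(a, b);
    // factor out the larger term so Math.exp never overflows
    return max + Math.log(1.0 + Math.exp(min - max));
  }

  public static void main(String[] args) {
    System.out.println(logSum(Math.log(0.25), Math.log(0.25)));           // ~ log(0.5) = -0.693
    System.out.println(logSum(Double.NEGATIVE_INFINITY, Math.log(0.5)));  // ~ -0.693
    System.out.println(logSum(-1000.0, -1000.0));                         // ~ -999.307, no underflow
  }
}

With this reading, a line such as matrix[i][j][1] in forward() sums, in probability space, the two ways of being in the gap state: starting a new gap from the match state and extending an existing gap.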
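For orientation, here is a hedged sketch of how the metric might be trained through trainMetric(). The StringPair constructor used below is an assumption made for illustration (only its str1, str2 and positive members appear in this excerpt), so the actual wekaUT class may differ.

// Hypothetical usage sketch; the StringPair construction is assumed, not taken from the listing.
import java.util.ArrayList;

import weka.deduping.*;
import weka.deduping.metrics.AffineProbMetric;

public class AffineProbMetricDemo {
  public static void main(String[] args) throws Exception {
    // training pairs: the positive (duplicate) pairs drive the EM updates in trainMetric()
    ArrayList pairList = new ArrayList();
    pairList.add(new StringPair("ACM SIGMOD Conference", "ACM-SIGMOD Conf.", true));   // duplicate
    pairList.add(new StringPair("ACM SIGMOD Conference", "Machine Learning", false));  // non-duplicate

    AffineProbMetric metric = new AffineProbMetric();
    metric.trainMetric(pairList);   // EM over the positive pairs, as shown in the listing
  }
}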
