affineprobmetric.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 1,128 行 · 第 1/3 页
JAVA
1,128 行
	  sub_pairProb = m_editopLogProbs[s1_i][s2_j];	}	ins_charProb = m_editopLogProbs[blank][s1_i]; 	del_charProb = m_editopLogProbs[blank][s2_j];	// substituting or matching	occsSubst = Math.exp(fMatrix[i-1][j-1][0] + sub_pairProb + m_subLogProb + bMatrix[i][j][0] - stringProb);	if (s1_i == s2_j) {	  m_noopOccs += occsSubst;	} else { 	  m_editopOccs[s1_i][s2_j] +=occsSubst;	}	m_subOccs += occsSubst;			// starting a gap	occsStartGap_1 = Math.exp(fMatrix[i-1][j][0] + ins_charProb + m_gapStartLogProb + bMatrix[i][j][1]-stringProb);	m_editopOccs[blank][s1_i] += occsStartGap_1;	occsStartGap_2 = Math.exp(fMatrix[i][j-1][0] + del_charProb + m_gapStartLogProb + bMatrix[i][j][2]-stringProb);	m_editopOccs[blank][s2_j] += occsStartGap_2;	m_gapStartOccs += occsStartGap_1 + occsStartGap_2;			// extendinuing a gap     	occsExtendGap_1 = Math.exp(fMatrix[i-1][j][1] + ins_charProb + m_gapExtendLogProb + bMatrix[i][j][1]-stringProb);	m_editopOccs[blank][s1_i] += occsExtendGap_1;	occsExtendGap_2 = Math.exp(fMatrix[i][j-1][2] + del_charProb + m_gapExtendLogProb + bMatrix[i][j][2]-stringProb);	m_editopOccs[blank][s2_j] += occsExtendGap_2;	m_gapExtendOccs += occsExtendGap_1 + occsExtendGap_2;			// ending a gap	occsEndGap_1 = Math.exp(fMatrix[i-1][j-1][1] + sub_pairProb + m_gapEndLogProb + bMatrix[i][j][0] - stringProb);	if (s1_i == s2_j) {	  m_noopOccs += occsEndGap_1;	} else { 	  m_editopOccs[s1_i][s2_j] += occsEndGap_1;	}	occsEndGap_2 = Math.exp(fMatrix[i-1][j-1][2] + sub_pairProb + m_gapEndLogProb + bMatrix[i][j][0] - stringProb);	if (s1_i == s2_j) {	  m_noopOccs += occsEndGap_2;	} else { 	  m_editopOccs[s1_i][s2_j] += occsEndGap_2;	}	m_gapEndOccs += occsEndGap_1 + occsEndGap_2;      }    }    // border rows.  We can't end gap, and can start/extend gap only one way    for (int i = 1; i < l1; i++) {      s1_i = s1[i-1];      s2_j = s2[l2-1];      ins_charProb = m_editopLogProbs[blank][s1_i];      if (s1_i == s2_j) {	sub_pairProb = m_noopLogProb;      } else {	sub_pairProb = m_editopLogProbs[s1_i][s2_j];      }      occsSubst = Math.exp(fMatrix[i-1][l2-1][0] + sub_pairProb + m_subLogProb + bMatrix[i][l2][0] - stringProb);      if (s1_i == s2_j) {	m_noopOccs += occsSubst;      } else { 	m_editopOccs[s1_i][s2_j] += occsSubst;      }      m_subOccs += occsSubst;		      occsStartGap_1 = Math.exp(fMatrix[i-1][l2][0] + ins_charProb + m_gapStartLogProb + bMatrix[i][l2][1] - stringProb);      m_editopOccs[blank][s1_i] += occsStartGap_1;      m_gapStartOccs += occsStartGap_1;      occsExtendGap_1 = Math.exp(fMatrix[i-1][l2][1] + ins_charProb + m_gapExtendLogProb + bMatrix[i][l2][1] - stringProb);      m_editopOccs[blank][s1_i] += occsExtendGap_1;      m_gapExtendOccs += occsExtendGap_1;   //  DO WE NEED THIS??? WE HAD NO CHOICE!    }    for (int j = 1; j < l2; j++) {      s1_i = s1[l1-1];      s2_j = s2[j-1];      del_charProb = m_editopLogProbs[blank][s2_j];      if (s1_i == s2_j) {	sub_pairProb = m_noopLogProb;      } else {	sub_pairProb = m_editopLogProbs[s1_i][s2_j];      }      occsSubst = Math.exp(fMatrix[l1-1][j-1][0] + sub_pairProb + m_subLogProb + bMatrix[l1][j][0] - stringProb);      if (s1_i == s2_j) {	m_noopOccs += occsSubst;      } else { 	m_editopOccs[s1_i][s2_j] += occsSubst;      }      m_subOccs += occsSubst;	          occsStartGap_2 = Math.exp(fMatrix[l1][j-1][0] + del_charProb + m_gapStartLogProb + bMatrix[l1][j][2] - stringProb);      m_editopOccs[blank][s2_j] += occsStartGap_2;      m_gapStartOccs += occsStartGap_2;	          occsExtendGap_2 = Math.exp(fMatrix[l1][j-1][2] + del_charProb + m_gapExtendLogProb + bMatrix[l1][j][2] - stringProb);      m_editopOccs[blank][s2_j] += occsExtendGap_2;      m_gapExtendOccs += occsExtendGap_2;  //   DO WE NEED THIS??? WE HAD NO CHOICE!    }    return stringProb;  }  /**    *  Maximization step of the EM algorithm   */  protected void maximizationStep () {    double N, N_s, N_id;    // TODO:  when trying to incorporate discriminative training, see EditDistance.java    // in old codebase for an earlier attempt to deal with negative expectations    // Sum up expectations for transitions in substitution state    N = m_subOccs + 2*m_gapStartOccs + m_endAtSubOccs;    m_subProb = m_subOccs / N;    m_gapStartProb = m_gapStartOccs / N;    m_endAtSubProb = m_endAtSubOccs / N;    if (m_subProb < m_clampProb) m_subProb = m_clampProb;    if (m_gapStartProb < m_clampProb) m_gapStartProb = m_clampProb;    if (m_endAtSubProb < m_clampProb) m_endAtSubProb = m_clampProb;    // Sum up expectations for occurrences in deletion/insertion states    N = m_gapExtendOccs + m_gapEndOccs + m_endAtGapOccs;    m_gapExtendProb = m_gapExtendOccs / N;    m_gapEndProb = m_gapEndOccs / N;    m_endAtGapProb = m_endAtGapOccs / N;    if (m_gapExtendProb < m_clampProb) m_gapExtendProb = m_clampProb;    if (m_gapEndProb < m_gapEndProb) m_gapEndProb = m_clampProb;    if (m_endAtGapProb < m_endAtGapProb) m_endAtGapProb = m_clampProb;	    // Now let's add up expectations for actual edit operators    // we add up separately for substitution and deletion/insertion    N_s = m_noopOccs;    N_id = 0;    for (int i = 0; i < m_usedChars.length; i++) {      N_id += m_editopOccs[blank][m_usedChars[i]];      for (int j = 0; j < m_usedChars.length; j++) {	if (i != j) {	  N_s += m_editopOccs[m_usedChars[i]][m_usedChars[j]];	}       }    }    // Recalculate the probabilities    m_noopProb = m_noopOccs / N_s;    for (int i = 0; i < m_usedChars.length; i++) {      m_editopProbs[blank][m_usedChars[i]] = Math.max(m_clampProb, m_editopOccs[blank][m_usedChars[i]] / N_id);      for (int j = i+1; j < m_usedChars.length; j++) {	m_editopProbs[m_usedChars[i]][m_usedChars[j]] = Math.max(m_clampProb, 	  (m_editopOccs[m_usedChars[i]][m_usedChars[j]] + m_editopOccs[m_usedChars[j]][m_usedChars[i]])/ N_s);	m_editopProbs[m_usedChars[j]][m_usedChars[i]] = m_editopProbs[m_usedChars[i]][m_usedChars[j]];      }    }    normalizeTransitionProbs();    normalizeEmissionProbs();    updateLogProbs();  }      /**    * Normalize the probabilities of emission editops so that they sum to 1   * for each state   */  protected void normalizeEmissionProbs() {    double N_s = m_noopProb, N_id = 0;    for (int i = 0; i < m_usedChars.length; i++) {      N_id += m_editopProbs[blank][m_usedChars[i]];      for (int j = i+1; j < m_usedChars.length; j++) {	N_s += m_editopProbs[m_usedChars[i]][m_usedChars[j]];      }    }    // Recalculate the probabilities    m_noopProb = m_noopProb / N_s;    for (int i = 0; i < m_usedChars.length; i++) {      m_editopProbs[blank][m_usedChars[i]] /= N_id;      for (int j = i+1; j < m_usedChars.length; j++) {	m_editopProbs[m_usedChars[i]][m_usedChars[j]] /= N_s;	m_editopProbs[m_usedChars[j]][m_usedChars[i]] = m_editopProbs[m_usedChars[i]][m_usedChars[j]];      }    }  }  /**    * Normalize the probabilities of transitions so that they sum to 1   * for each state   */  protected void normalizeTransitionProbs() {    double P;    // M-state    P = m_subProb + 2 * m_gapStartProb + m_endAtSubProb;    m_subProb /= P;    m_gapStartProb /= P;    m_endAtSubProb /= P;    // I/D states    P = m_gapExtendProb + m_gapEndProb + m_endAtGapProb;    m_gapExtendProb /= P;    m_gapEndProb /= P;    m_endAtGapProb /= P;  }      /**    *  reset the number of occurrences of all ops in the set   */  protected void resetOccurrences () {    m_noopOccs = 0;    m_endAtSubOccs = 0;    m_endAtGapOccs = 0;    m_gapStartOccs = 0;    m_gapExtendOccs = 0;    m_gapEndOccs = 0;    m_subOccs = 0;    for (int i = 0; i < m_usedChars.length; i++) {      m_editopOccs[blank][m_usedChars[i]] = 0;      for (int j = 0; j < m_usedChars.length; j++) {	m_editopOccs[m_usedChars[i]][m_usedChars[j]] = 0;      }    }  }  /**    *  initialize the probabilities to some startup values   */  protected void initProbs () {    double probDeletionUniform = 1.0 / m_usedChars.length;    double probSubUniform = probDeletionUniform * probDeletionUniform;    m_endAtSubProb = 0.05;    m_endAtGapProb = 0.1;    m_gapStartProb = 0.05;    m_gapExtendProb = 0.5;    m_gapEndProb = 0.4;    m_subProb = 0.9;    m_noopProb = 0.9;//      m_endAtSubProb = 0.1;//      m_endAtGapProb = 0.1;//      m_gapStartProb = 0.4;//      m_gapExtendProb = 0.6;//      m_gapEndProb = 0.3;//      m_subProb = 0.5;//      m_noopProb = 0.3;    for (int i = 0; i < m_usedChars.length; i++) {      m_editopProbs[blank][m_usedChars[i]] = probDeletionUniform;      for (int j = 0; j < m_usedChars.length; j++) {	m_editopProbs[m_usedChars[i]][m_usedChars[j]] = probSubUniform;      }    }  }  /**    *  initialize the costs using current values of the probabilities   */  protected void initCosts () {    m_gapStartCost =  -m_gapStartLogProb;    m_gapExtendCost =   -m_gapExtendLogProb;    m_endAtSubCost = -m_endAtSubLogProb;    m_endAtGapCost = -m_endAtGapLogProb;    m_gapEndCost =    -m_gapEndLogProb;    m_subCost =        -m_subLogProb;    m_noopCost =     -m_noopLogProb;    if (m_verbose) {      System.out.println("\nScaled by extend cost:\nGapStrt=" + (m_gapStartCost/m_gapExtendCost) +			 "\tGapExt=" + (m_gapExtendCost/m_gapExtendCost) +			 "\tGapEnd=" + (m_gapEndCost/m_gapExtendCost) + 			 "\tSub=" + (m_subCost/m_gapExtendCost) +			 "\tNoop=" + (m_noopCost/m_gapExtendCost));      System.out.println("\nActual costs:\nGapStrt=" + (m_gapStartCost) +			 "\tGapExt=" + (m_gapExtendCost) +			 "\tGapEnd=" + (m_gapEndCost) + 			 "\tSub=" + (m_subCost) +			 "\tNoop=" + (m_noopCost));    }    if (!m_useGenerativeModel) {      for (int i = 0; i < m_usedChars.length; i++) {	m_editopCosts[blank][m_usedChars[i]] =   -m_editopLogProbs[blank][m_usedChars[i]];	for (int j = 0; j < m_usedChars.length; j++) {	  m_editopCosts[m_usedChars[i]][m_usedChars[j]] =	    -m_editopLogProbs[m_usedChars[i]][m_usedChars[j]];	}      }    } else {      m_editopCosts = new double[128][128];    }  }  /**   * store logs of all probabilities in m_editopLogProbs   */  protected void updateLogProbs() {    for (int i = 0; i < 128; i++) {      for (int j = 0; j < 128; j++) {	m_editopLogProbs[i][j] = (m_editopProbs[i][j] == 0) ? -Double.MAX_VALUE : Math.log(m_editopProbs[i][j]);      }    }    m_noopLogProb = Math.log(m_noopProb);    m_endAtSubLogProb = Math.log(m_endAtSubProb);    m_endAtGapLogProb = Math.log(m_endAtGapProb);    m_gapStartLogProb = Math.log(m_gapStartProb);    m_gapExtendLogProb = Math.log(m_gapExtendProb);    m_gapEndLogProb = Math.log(m_gapEndProb);    m_subLogProb = Math.log(m_subProb);    DecimalFormat fmt = new DecimalFormat ("0.0000");    System.out.println("After update:\tNOOP=" + fmt.format(m_noopProb) + "=" + fmt.format(m_noopLogProb) +		       "\tSUB=" + fmt.format(m_subProb) + "=" + fmt.format(m_subLogProb) +		       "\n\t\tGAPst=" + fmt.format(m_gapStartProb) + "=" + fmt.format(m_gapStartLogProb) +		       "\tGAPcont=" + fmt.format(m_gapExtendProb) + "=" + fmt.format(m_gapExtendLogProb) +		       "\tGAPend=" + fmt.format(m_gapEndProb) + "=" + fmt.format(m_gapEndLogProb) +		       "\n\t\tendAtGap=" + fmt.format(m_endAtGapProb) + "=" + fmt.format(m_endAtGapLogProb) +		       "\tendAtSub=" + fmt.format(m_endAtSubProb) + "=" + fmt.format(m_endAtSubLogProb));  }      /**    * Get the distance between two strings   * @param s1 first string   * @param s2 second string   * @return a value of this distance between these two strings   */  public double distance (String s1, String s2) {    if (m_useGenerativeModel) {      double d = backward(s1,s2)[0][0][0];      if (m_normalized) {	//	for (int i = 0; i < (s1.length() + s2.length()); i++) 	  // TODO:  fix the posteriors; don't care for now - we always use the additive model	//	  d -= m_noopLogProb + m_subLogProb;	//  		for (int i = 0; i < s1.length(); i++) {	//  		    d -= m_editopLogProbs[blank][s1.charAt(i)];	//  		}	//  		for (int i = 0; i < s2.length(); i++) {	//  		    d -= m_editopLogProbs[blank][s2.charAt(i)];	//  		}      }      return -d;    } else {      return costDistance(s1, s2);    }  }  /** Method:  recordCosts      Record probability matrix for further MatLab use  */  void recordCosts(int id) {    try {      FileOutputStream fstr = new FileOutputStream ("/tmp/probs/ProbAffineCosts.txt", true);      DataOutputStream out = new DataOutputStream (fstr);      char s, t;      DecimalFormat fmt = new DecimalFormat ("0.00");	          out.writeBytes(id + " " + m_noopCost + "\n" + "char\tblank   ");      for (int i = 0; i < m_usedChars.length; i++) {	out.writeBytes("\'" + m_usedChars[i] + "\'" + "\t");      }      out.writeBytes("\n");      for (int i = 0; i < m_usedChars.length; i++) {	out.writeBytes("\'" + m_usedChars[i] + "\'" + "\t" +		       fmt.format(m_editopCosts[blank][m_usedChars[i]]) + "\t"); 	for (int j = 0; j < i; j++) {	  out.writeBytes(fmt.format(m_editopCosts[m_usedChars[i]][m_usedChars[j]]) + "\t");	}	out.writeBytes("\n");      }      out.close();    } catch (Exception x) {}  }	      static String MatrixToString (double matrix[][]) {    DecimalFormat fmt = new DecimalFormat ("0.00");    String s = "";    for (int i = 0; i < matrix.length; i++) {      for (int j = 0; j < matrix[0].length; j++)	s = s + fmt.format(matrix[i][j]) + "  ";      s = s + "\n";    }    return s;
affineprobmetric.java - 源码说明

本页面展示了「wekaUT是 university texas austin 开发的基于weka的半指导学习(semi supervised learning)的分类器」中的 affineprobmetric.java 源码文件，采用 Java 编程语言编写，共 1,128 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与university相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?