📄 affineprobmetric.java
字号:
sub_pairProb = m_editopLogProbs[s1_i][s2_j]; } ins_charProb = m_editopLogProbs[blank][s1_i]; del_charProb = m_editopLogProbs[blank][s2_j]; // substituting or matching occsSubst = Math.exp(fMatrix[i-1][j-1][0] + sub_pairProb + m_subLogProb + bMatrix[i][j][0] - stringProb); if (s1_i == s2_j) { m_noopOccs += occsSubst; } else { m_editopOccs[s1_i][s2_j] +=occsSubst; } m_subOccs += occsSubst; // starting a gap occsStartGap_1 = Math.exp(fMatrix[i-1][j][0] + ins_charProb + m_gapStartLogProb + bMatrix[i][j][1]-stringProb); m_editopOccs[blank][s1_i] += occsStartGap_1; occsStartGap_2 = Math.exp(fMatrix[i][j-1][0] + del_charProb + m_gapStartLogProb + bMatrix[i][j][2]-stringProb); m_editopOccs[blank][s2_j] += occsStartGap_2; m_gapStartOccs += occsStartGap_1 + occsStartGap_2; // extendinuing a gap occsExtendGap_1 = Math.exp(fMatrix[i-1][j][1] + ins_charProb + m_gapExtendLogProb + bMatrix[i][j][1]-stringProb); m_editopOccs[blank][s1_i] += occsExtendGap_1; occsExtendGap_2 = Math.exp(fMatrix[i][j-1][2] + del_charProb + m_gapExtendLogProb + bMatrix[i][j][2]-stringProb); m_editopOccs[blank][s2_j] += occsExtendGap_2; m_gapExtendOccs += occsExtendGap_1 + occsExtendGap_2; // ending a gap occsEndGap_1 = Math.exp(fMatrix[i-1][j-1][1] + sub_pairProb + m_gapEndLogProb + bMatrix[i][j][0] - stringProb); if (s1_i == s2_j) { m_noopOccs += occsEndGap_1; } else { m_editopOccs[s1_i][s2_j] += occsEndGap_1; } occsEndGap_2 = Math.exp(fMatrix[i-1][j-1][2] + sub_pairProb + m_gapEndLogProb + bMatrix[i][j][0] - stringProb); if (s1_i == s2_j) { m_noopOccs += occsEndGap_2; } else { m_editopOccs[s1_i][s2_j] += occsEndGap_2; } m_gapEndOccs += occsEndGap_1 + occsEndGap_2; } } // border rows. We can't end gap, and can start/extend gap only one way for (int i = 1; i < l1; i++) { s1_i = s1[i-1]; s2_j = s2[l2-1]; ins_charProb = m_editopLogProbs[blank][s1_i]; if (s1_i == s2_j) { sub_pairProb = m_noopLogProb; } else { sub_pairProb = m_editopLogProbs[s1_i][s2_j]; } occsSubst = Math.exp(fMatrix[i-1][l2-1][0] + sub_pairProb + m_subLogProb + bMatrix[i][l2][0] - stringProb); if (s1_i == s2_j) { m_noopOccs += occsSubst; } else { m_editopOccs[s1_i][s2_j] += occsSubst; } m_subOccs += occsSubst; occsStartGap_1 = Math.exp(fMatrix[i-1][l2][0] + ins_charProb + m_gapStartLogProb + bMatrix[i][l2][1] - stringProb); m_editopOccs[blank][s1_i] += occsStartGap_1; m_gapStartOccs += occsStartGap_1; occsExtendGap_1 = Math.exp(fMatrix[i-1][l2][1] + ins_charProb + m_gapExtendLogProb + bMatrix[i][l2][1] - stringProb); m_editopOccs[blank][s1_i] += occsExtendGap_1; m_gapExtendOccs += occsExtendGap_1; // DO WE NEED THIS??? WE HAD NO CHOICE! } for (int j = 1; j < l2; j++) { s1_i = s1[l1-1]; s2_j = s2[j-1]; del_charProb = m_editopLogProbs[blank][s2_j]; if (s1_i == s2_j) { sub_pairProb = m_noopLogProb; } else { sub_pairProb = m_editopLogProbs[s1_i][s2_j]; } occsSubst = Math.exp(fMatrix[l1-1][j-1][0] + sub_pairProb + m_subLogProb + bMatrix[l1][j][0] - stringProb); if (s1_i == s2_j) { m_noopOccs += occsSubst; } else { m_editopOccs[s1_i][s2_j] += occsSubst; } m_subOccs += occsSubst; occsStartGap_2 = Math.exp(fMatrix[l1][j-1][0] + del_charProb + m_gapStartLogProb + bMatrix[l1][j][2] - stringProb); m_editopOccs[blank][s2_j] += occsStartGap_2; m_gapStartOccs += occsStartGap_2; occsExtendGap_2 = Math.exp(fMatrix[l1][j-1][2] + del_charProb + m_gapExtendLogProb + bMatrix[l1][j][2] - stringProb); m_editopOccs[blank][s2_j] += occsExtendGap_2; m_gapExtendOccs += occsExtendGap_2; // DO WE NEED THIS??? WE HAD NO CHOICE! } return stringProb; } /** * Maximization step of the EM algorithm */ protected void maximizationStep () { double N, N_s, N_id; // TODO: when trying to incorporate discriminative training, see EditDistance.java // in old codebase for an earlier attempt to deal with negative expectations // Sum up expectations for transitions in substitution state N = m_subOccs + 2*m_gapStartOccs + m_endAtSubOccs; m_subProb = m_subOccs / N; m_gapStartProb = m_gapStartOccs / N; m_endAtSubProb = m_endAtSubOccs / N; if (m_subProb < m_clampProb) m_subProb = m_clampProb; if (m_gapStartProb < m_clampProb) m_gapStartProb = m_clampProb; if (m_endAtSubProb < m_clampProb) m_endAtSubProb = m_clampProb; // Sum up expectations for occurrences in deletion/insertion states N = m_gapExtendOccs + m_gapEndOccs + m_endAtGapOccs; m_gapExtendProb = m_gapExtendOccs / N; m_gapEndProb = m_gapEndOccs / N; m_endAtGapProb = m_endAtGapOccs / N; if (m_gapExtendProb < m_clampProb) m_gapExtendProb = m_clampProb; if (m_gapEndProb < m_gapEndProb) m_gapEndProb = m_clampProb; if (m_endAtGapProb < m_endAtGapProb) m_endAtGapProb = m_clampProb; // Now let's add up expectations for actual edit operators // we add up separately for substitution and deletion/insertion N_s = m_noopOccs; N_id = 0; for (int i = 0; i < m_usedChars.length; i++) { N_id += m_editopOccs[blank][m_usedChars[i]]; for (int j = 0; j < m_usedChars.length; j++) { if (i != j) { N_s += m_editopOccs[m_usedChars[i]][m_usedChars[j]]; } } } // Recalculate the probabilities m_noopProb = m_noopOccs / N_s; for (int i = 0; i < m_usedChars.length; i++) { m_editopProbs[blank][m_usedChars[i]] = Math.max(m_clampProb, m_editopOccs[blank][m_usedChars[i]] / N_id); for (int j = i+1; j < m_usedChars.length; j++) { m_editopProbs[m_usedChars[i]][m_usedChars[j]] = Math.max(m_clampProb, (m_editopOccs[m_usedChars[i]][m_usedChars[j]] + m_editopOccs[m_usedChars[j]][m_usedChars[i]])/ N_s); m_editopProbs[m_usedChars[j]][m_usedChars[i]] = m_editopProbs[m_usedChars[i]][m_usedChars[j]]; } } normalizeTransitionProbs(); normalizeEmissionProbs(); updateLogProbs(); } /** * Normalize the probabilities of emission editops so that they sum to 1 * for each state */ protected void normalizeEmissionProbs() { double N_s = m_noopProb, N_id = 0; for (int i = 0; i < m_usedChars.length; i++) { N_id += m_editopProbs[blank][m_usedChars[i]]; for (int j = i+1; j < m_usedChars.length; j++) { N_s += m_editopProbs[m_usedChars[i]][m_usedChars[j]]; } } // Recalculate the probabilities m_noopProb = m_noopProb / N_s; for (int i = 0; i < m_usedChars.length; i++) { m_editopProbs[blank][m_usedChars[i]] /= N_id; for (int j = i+1; j < m_usedChars.length; j++) { m_editopProbs[m_usedChars[i]][m_usedChars[j]] /= N_s; m_editopProbs[m_usedChars[j]][m_usedChars[i]] = m_editopProbs[m_usedChars[i]][m_usedChars[j]]; } } } /** * Normalize the probabilities of transitions so that they sum to 1 * for each state */ protected void normalizeTransitionProbs() { double P; // M-state P = m_subProb + 2 * m_gapStartProb + m_endAtSubProb; m_subProb /= P; m_gapStartProb /= P; m_endAtSubProb /= P; // I/D states P = m_gapExtendProb + m_gapEndProb + m_endAtGapProb; m_gapExtendProb /= P; m_gapEndProb /= P; m_endAtGapProb /= P; } /** * reset the number of occurrences of all ops in the set */ protected void resetOccurrences () { m_noopOccs = 0; m_endAtSubOccs = 0; m_endAtGapOccs = 0; m_gapStartOccs = 0; m_gapExtendOccs = 0; m_gapEndOccs = 0; m_subOccs = 0; for (int i = 0; i < m_usedChars.length; i++) { m_editopOccs[blank][m_usedChars[i]] = 0; for (int j = 0; j < m_usedChars.length; j++) { m_editopOccs[m_usedChars[i]][m_usedChars[j]] = 0; } } } /** * initialize the probabilities to some startup values */ protected void initProbs () { double probDeletionUniform = 1.0 / m_usedChars.length; double probSubUniform = probDeletionUniform * probDeletionUniform; m_endAtSubProb = 0.05; m_endAtGapProb = 0.1; m_gapStartProb = 0.05; m_gapExtendProb = 0.5; m_gapEndProb = 0.4; m_subProb = 0.9; m_noopProb = 0.9;// m_endAtSubProb = 0.1;// m_endAtGapProb = 0.1;// m_gapStartProb = 0.4;// m_gapExtendProb = 0.6;// m_gapEndProb = 0.3;// m_subProb = 0.5;// m_noopProb = 0.3; for (int i = 0; i < m_usedChars.length; i++) { m_editopProbs[blank][m_usedChars[i]] = probDeletionUniform; for (int j = 0; j < m_usedChars.length; j++) { m_editopProbs[m_usedChars[i]][m_usedChars[j]] = probSubUniform; } } } /** * initialize the costs using current values of the probabilities */ protected void initCosts () { m_gapStartCost = -m_gapStartLogProb; m_gapExtendCost = -m_gapExtendLogProb; m_endAtSubCost = -m_endAtSubLogProb; m_endAtGapCost = -m_endAtGapLogProb; m_gapEndCost = -m_gapEndLogProb; m_subCost = -m_subLogProb; m_noopCost = -m_noopLogProb; if (m_verbose) { System.out.println("\nScaled by extend cost:\nGapStrt=" + (m_gapStartCost/m_gapExtendCost) + "\tGapExt=" + (m_gapExtendCost/m_gapExtendCost) + "\tGapEnd=" + (m_gapEndCost/m_gapExtendCost) + "\tSub=" + (m_subCost/m_gapExtendCost) + "\tNoop=" + (m_noopCost/m_gapExtendCost)); System.out.println("\nActual costs:\nGapStrt=" + (m_gapStartCost) + "\tGapExt=" + (m_gapExtendCost) + "\tGapEnd=" + (m_gapEndCost) + "\tSub=" + (m_subCost) + "\tNoop=" + (m_noopCost)); } if (!m_useGenerativeModel) { for (int i = 0; i < m_usedChars.length; i++) { m_editopCosts[blank][m_usedChars[i]] = -m_editopLogProbs[blank][m_usedChars[i]]; for (int j = 0; j < m_usedChars.length; j++) { m_editopCosts[m_usedChars[i]][m_usedChars[j]] = -m_editopLogProbs[m_usedChars[i]][m_usedChars[j]]; } } } else { m_editopCosts = new double[128][128]; } } /** * store logs of all probabilities in m_editopLogProbs */ protected void updateLogProbs() { for (int i = 0; i < 128; i++) { for (int j = 0; j < 128; j++) { m_editopLogProbs[i][j] = (m_editopProbs[i][j] == 0) ? -Double.MAX_VALUE : Math.log(m_editopProbs[i][j]); } } m_noopLogProb = Math.log(m_noopProb); m_endAtSubLogProb = Math.log(m_endAtSubProb); m_endAtGapLogProb = Math.log(m_endAtGapProb); m_gapStartLogProb = Math.log(m_gapStartProb); m_gapExtendLogProb = Math.log(m_gapExtendProb); m_gapEndLogProb = Math.log(m_gapEndProb); m_subLogProb = Math.log(m_subProb); DecimalFormat fmt = new DecimalFormat ("0.0000"); System.out.println("After update:\tNOOP=" + fmt.format(m_noopProb) + "=" + fmt.format(m_noopLogProb) + "\tSUB=" + fmt.format(m_subProb) + "=" + fmt.format(m_subLogProb) + "\n\t\tGAPst=" + fmt.format(m_gapStartProb) + "=" + fmt.format(m_gapStartLogProb) + "\tGAPcont=" + fmt.format(m_gapExtendProb) + "=" + fmt.format(m_gapExtendLogProb) + "\tGAPend=" + fmt.format(m_gapEndProb) + "=" + fmt.format(m_gapEndLogProb) + "\n\t\tendAtGap=" + fmt.format(m_endAtGapProb) + "=" + fmt.format(m_endAtGapLogProb) + "\tendAtSub=" + fmt.format(m_endAtSubProb) + "=" + fmt.format(m_endAtSubLogProb)); } /** * Get the distance between two strings * @param s1 first string * @param s2 second string * @return a value of this distance between these two strings */ public double distance (String s1, String s2) { if (m_useGenerativeModel) { double d = backward(s1,s2)[0][0][0]; if (m_normalized) { // for (int i = 0; i < (s1.length() + s2.length()); i++) // TODO: fix the posteriors; don't care for now - we always use the additive model // d -= m_noopLogProb + m_subLogProb; // for (int i = 0; i < s1.length(); i++) { // d -= m_editopLogProbs[blank][s1.charAt(i)]; // } // for (int i = 0; i < s2.length(); i++) { // d -= m_editopLogProbs[blank][s2.charAt(i)]; // } } return -d; } else { return costDistance(s1, s2); } } /** Method: recordCosts Record probability matrix for further MatLab use */ void recordCosts(int id) { try { FileOutputStream fstr = new FileOutputStream ("/tmp/probs/ProbAffineCosts.txt", true); DataOutputStream out = new DataOutputStream (fstr); char s, t; DecimalFormat fmt = new DecimalFormat ("0.00"); out.writeBytes(id + " " + m_noopCost + "\n" + "char\tblank "); for (int i = 0; i < m_usedChars.length; i++) { out.writeBytes("\'" + m_usedChars[i] + "\'" + "\t"); } out.writeBytes("\n"); for (int i = 0; i < m_usedChars.length; i++) { out.writeBytes("\'" + m_usedChars[i] + "\'" + "\t" + fmt.format(m_editopCosts[blank][m_usedChars[i]]) + "\t"); for (int j = 0; j < i; j++) { out.writeBytes(fmt.format(m_editopCosts[m_usedChars[i]][m_usedChars[j]]) + "\t"); } out.writeBytes("\n"); } out.close(); } catch (Exception x) {} } static String MatrixToString (double matrix[][]) { DecimalFormat fmt = new DecimalFormat ("0.00"); String s = ""; for (int i = 0; i < matrix.length; i++) { for (int j = 0; j < matrix[0].length; j++) s = s + fmt.format(matrix[i][j]) + " "; s = s + "\n"; } return s;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -