blast_psi_priv.c
来自「ncbi源码」· C语言 代码 · 共 1,546 行 · 第 1/4 页
C
1,546 行
/* Calculate row_sigma */ for (asi = 0; asi < num_aligned_seqs; asi++) { const Uint4 seq_idx = aligned_seqs[asi]; const Uint1 residue = alignment->desc_matrix[seq_idx][i].letter; seq_weights->row_sigma[seq_idx] += (1.0 / (double) (residue_counts[residue] * num_distinct_residues) ); } } return;}/** Calculates the normalized sequence weights for the requested position */static void_PSICalculateNormalizedSequenceWeights( const PsiAlignedBlock* aligned_blocks, /* [in] */ Uint4 position, /* [in] */ const Uint4* aligned_seqs, /* [in] */ Uint4 num_aligned_seqs, /* [in] */ double sigma, /* [in] */ PsiSequenceWeights* seq_weights) /* [out] */{ Uint4 asi = 0; /**< index into array of aligned sequences */ if (sigma > 0) { double weight_sum = 0.0; for (asi = 0; asi < num_aligned_seqs; asi++) { const Uint4 seq_idx = aligned_seqs[asi]; seq_weights->norm_seq_weights[seq_idx] = seq_weights->row_sigma[seq_idx] / (aligned_blocks->pos_extnt[position].right - aligned_blocks->pos_extnt[position].left + 1);#ifndef PSI_IGNORE_GAPS_IN_COLUMNS weight_sum += seq_weights->norm_seq_weights[seq_idx];#endif } for (asi = 0; asi < num_aligned_seqs; asi++) { const Uint4 seq_idx = aligned_seqs[asi]; seq_weights->norm_seq_weights[seq_idx] /= weight_sum; } } else { for (asi = 0; asi < num_aligned_seqs; asi++) { const Uint4 seq_idx = aligned_seqs[asi]; seq_weights->norm_seq_weights[seq_idx] = (1.0/(double) num_aligned_seqs); } }}static void_PSICalculateMatchWeights( const PsiAlignmentData* alignment, /* [in] */ Uint4 position, /* [in] */ const Uint4* aligned_seqs, /* [in] */ Uint4 num_aligned_seqs, /* [in] */ PsiSequenceWeights* seq_weights) /* [out] */{ const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-']; Uint4 asi = 0; /**< index into array of aligned sequences */ for (asi = 0; asi < num_aligned_seqs; asi++) { const Uint4 seq_idx = aligned_seqs[asi]; const Uint1 residue = alignment->desc_matrix[seq_idx][position].letter; seq_weights->match_weights[position][residue] += seq_weights->norm_seq_weights[seq_idx]; /* FIXME: this field is populated but never used */ if (residue == GAP) { /*seq_weights->gapless_column_weights[position] += * seq_weights->a[seq_idx]; */ ; } }}/** Finds the sequences aligned in a given position. * @param alignment Multiple-alignment data [in] * @param position position of interest [in] * @param aligned_sequences array which will contain the indices of the * sequences aligned at the requested position. This array must have size * greater than or equal to the number of sequences + 1 in multiple alignment * data structure (alignment->dimensions->num_seqs + 1) [out] * @return number of sequences aligned at the requested position */static Uint4_PSIGetAlignedSequencesForPosition(const PsiAlignmentData* alignment, Uint4 position, Uint4* aligned_sequences){#ifdef PSI_IGNORE_GAPS_IN_COLUMNS const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];#endif Uint4 retval = 0; Uint4 i = 0; ASSERT(alignment); ASSERT(position < alignment->dimensions->query_sz); ASSERT(aligned_sequences); for (i = 0; i < alignment->dimensions->num_seqs + 1; i++) { if ( !alignment->use_sequences[i] ) { continue; }#ifdef PSI_IGNORE_GAPS_IN_COLUMNS if (alignment->desc_matrix[i][position].used && alignment->desc_matrix[i][position].letter != GAP) {#else if (alignment->desc_matrix[i][position].used) {#endif aligned_sequences[retval++] = i; } } return retval;}/* The second parameter is not really const, it's updated! */static int_PSICheckSequenceWeights(const PsiAlignmentData* alignment, PsiSequenceWeights* seq_weights){ const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-']; const Uint1 X = AMINOACID_TO_NCBISTDAA['X']; Uint4 pos = 0; /* residue position (ie: column number) */ Uint4 res = 0; /* residue */ ASSERT(alignment); ASSERT(seq_weights); for (pos = 0; pos < alignment->dimensions->query_sz; pos++) { double running_total = 0.0; if (alignment->match_seqs[pos] <= 1 || alignment->desc_matrix[kQueryIndex][pos].letter == X) { continue; } for (res = 0; res < PSI_ALPHABET_SIZE; res++) { running_total += seq_weights->match_weights[pos][res]; } ASSERT(running_total < 0.99 || running_total > 1.01);#ifndef PSI_IGNORE_GAPS_IN_COLUMNS /* Disperse method of spreading the gap weights */ for (res = 0; res < PSI_ALPHABET_SIZE; res++) { if (seq_weights->std_prob[res] > kEpsilon) { seq_weights->match_weights[pos][res] += (seq_weights->match_weights[pos][GAP] * seq_weights->std_prob[res]); } }#endif seq_weights->match_weights[pos][GAP] = 0.0; running_total = 0.0; for (res = 0; res < PSI_ALPHABET_SIZE; res++) { running_total += seq_weights->match_weights[pos][res]; } if (running_total < 0.99 || running_total > 1.01) { return PSIERR_BADSEQWEIGHTS; } } return PSI_SUCCESS;}/****************************************************************************//******* Compute residue frequencies stage of PSSM creation *****************//* port of posComputePseudoFreqs */intPSIComputeResidueFrequencies(const PsiAlignmentData* alignment, /* [in] */ const PsiSequenceWeights* seq_weights, /* [in] */ const BlastScoreBlk* sbp, /* [in] */ const PsiAlignedBlock* aligned_blocks, /* [in] */ const PSIBlastOptions* opts, /* [in] */ PsiMatrix* score_matrix) /* [out] */{ const Uint1 X = AMINOACID_TO_NCBISTDAA['X']; Uint4 i = 0; /* loop index into query positions */ SFreqRatios* freq_ratios; /* matrix-specific frequency ratios */ if ( !alignment || !seq_weights || !sbp || !aligned_blocks || !opts || !score_matrix ) { return PSIERR_BADPARAM; } freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name); for (i = 0; i < alignment->dimensions->query_sz; i++) { Uint4 j = 0; /* loop index into alphabet */ double info_sum = 0.0; /* for information content - FIXME calculate separately */ /* If there is an 'X' in the query sequence at position i... */ if (alignment->desc_matrix[kQueryIndex][i].letter == X) { for (j = 0; j < (Uint4) sbp->alphabet_size; j++) { score_matrix->res_freqs[i][j] = 0.0; } } else { for (j = 0; j < (Uint4) sbp->alphabet_size; j++) { if (seq_weights->std_prob[j] > kEpsilon) { Uint4 interval_size = 0; /* length of a block */ Uint4 idx = 0; /* loop index */ double sigma = 0.0; /* number of chars in an interval */ double pseudo = 0.0; /* intermediate term */ double numerator = 0.0; /* intermediate term */ double denominator = 0.0; /* intermediate term */ double qOverPEstimate = 0.0; /* intermediate term */ /* changed to matrix specific ratios here May 2000 */ for (idx = 0; idx < (Uint4) sbp->alphabet_size; idx++) { if (sbp->matrix[j][idx] != BLAST_SCORE_MIN) { pseudo += (seq_weights->match_weights[i][idx] * freq_ratios->data[j][idx]); } } pseudo *= opts->pseudo_count; /* FIXME: document where this formula is coming from * (probably 2001 paper, p 2996) */ sigma = seq_weights->sigma[i]; interval_size = aligned_blocks->size[i]; numerator = pseudo + ((sigma/interval_size-1) * seq_weights->match_weights[i][j] / seq_weights->std_prob[j]); denominator = (sigma/interval_size-1) + opts->pseudo_count; qOverPEstimate = numerator/denominator; /* Note artificial multiplication by standard probability * to normalize */ score_matrix->res_freqs[i][j] = qOverPEstimate * seq_weights->std_prob[j]; if ( qOverPEstimate != 0.0 && (seq_weights->std_prob[j] > kEpsilon) ) { info_sum += qOverPEstimate * seq_weights->std_prob[j] * log(qOverPEstimate)/NCBIMATH_LN2; } } else { score_matrix->res_freqs[i][j] = 0.0; } /* END: if (seq_weights->std_prob[j] > kEpsilon) { */ } /* END: for (j = 0; j < sbp->alphabet_size; j++) */ } /* FIXME: Should move out the calculation of information content to its * own function (see posFreqsToInformation)! */ seq_weights->info_content[i] = info_sum; } freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios); return PSI_SUCCESS;}/****************************************************************************//**************** Convert residue frequencies to PSSM stage *****************//* FIXME: Answer questions FIXME: need ideal_labmda, regular scoring matrix, length of query*/intPSIConvertResidueFreqsToPSSM(PsiMatrix* score_matrix, /* [in|out] */ const Uint1* query, /* [in] */ const BlastScoreBlk* sbp, /* [in] */ const double* std_probs) /* [in] */{ const Uint4 X = AMINOACID_TO_NCBISTDAA['X']; const Uint4 Star = AMINOACID_TO_NCBISTDAA['*']; Uint4 i = 0; Uint4 j = 0; SFreqRatios* std_freq_ratios = NULL; /* only needed when there are not residue frequencies for a given column */ double ideal_lambda; if ( !score_matrix || !sbp || !std_probs ) return PSIERR_BADPARAM; std_freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name); ideal_lambda = sbp->kbp_ideal->Lambda; /* Each column is a position in the query */ for (i = 0; i < score_matrix->ncols; i++) { /* True if all frequencies in column i are zero */ Boolean is_unaligned_column = TRUE; const Uint4 query_res = query[i]; for (j = 0; j < (Uint4) sbp->alphabet_size; j++) { double qOverPEstimate = 0.0; /* Division compensates for multiplication in previous function */ if (std_probs[j] > kEpsilon) { qOverPEstimate = score_matrix->res_freqs[i][j] / std_probs[j]; } if (is_unaligned_column && qOverPEstimate != 0.0) { is_unaligned_column = FALSE; } /* Populate scaled matrix */ if (qOverPEstimate == 0.0 || std_probs[j] < kEpsilon) { score_matrix->scaled_pssm[i][j] = BLAST_SCORE_MIN; } else { double tmp = log(qOverPEstimate)/ideal_lambda; score_matrix->scaled_pssm[i][j] = (int) BLAST_Nint(kPsiScaleFactor * tmp); } if ( (j == X || j == Star) && (sbp->matrix[query_res][X] != BLAST_SCORE_MIN) ) { score_matrix->scaled_pssm[i][j] = sbp->matrix[query_res][j] * kPsiScaleFactor; } } if (is_unaligned_column) { for (j = 0; j < (Uint4) sbp->alphabet_size; j++) { score_matrix->pssm[i][j] = sbp->matrix[query_res][j]; if (sbp->matrix[query_res][j] != BLAST_SCORE_MIN) { double tmp = kPsiScaleFactor * std_freq_ratios->bit_scale_factor * log(std_freq_ratios->data[query_res][j])/NCBIMATH_LN2; score_matrix->scaled_pssm[i][j] = BLAST_Nint(tmp); } else { score_matrix->scaled_pssm[i][j] = BLAST_SCORE_MIN; } } } } std_freq_ratios = _PSIMatrixFrequencyRatiosFree(std_freq_ratios); /* Set the last column of the matrix to BLAST_SCORE_MIN (why?) */ for (j = 0; j < (Uint4) sbp->alphabet_size; j++) { score_matrix->scaled_pssm[score_matrix->ncols-1][j] = BLAST_SCORE_MIN; } return PSI_SUCCESS;}/****************************************************************************//************************* Scaling of PSSM stage ****************************//** * @param initial_lambda_guess value to be used when calculating lambda if this * is not null [in] * @param sbp Score block structure where the calculated lambda and K will be * returned */void_PSIUpdateLambdaK(const int** pssm, /* [in] */ const Uint1* query, /* [in] */ Uint4 query_length, /* [in] */ const double* std_probs, /* [in] */ double* initial_lambda_guess, /* [in] */ BlastScoreBlk* sbp); /* [in|out] */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?