blast_psi_priv.c

来自「ncbi源码」· C语言 代码 · 共 1,546 行 · 第 1/4 页

C
1,546
字号
        /* Calculate row_sigma */        for (asi = 0; asi < num_aligned_seqs; asi++) {            const Uint4 seq_idx = aligned_seqs[asi];            const Uint1 residue =                alignment->desc_matrix[seq_idx][i].letter;            seq_weights->row_sigma[seq_idx] +=                 (1.0 / (double)                  (residue_counts[residue] * num_distinct_residues) );        }    }    return;}/** Calculates the normalized sequence weights for the requested position */static void_PSICalculateNormalizedSequenceWeights(    const PsiAlignedBlock* aligned_blocks, /* [in] */    Uint4 position,                        /* [in] */    const Uint4* aligned_seqs,             /* [in] */    Uint4 num_aligned_seqs,                /* [in] */    double sigma,                          /* [in] */    PsiSequenceWeights* seq_weights)       /* [out] */{    Uint4 asi = 0;   /**< index into array of aligned sequences */    if (sigma > 0) {        double weight_sum = 0.0;        for (asi = 0; asi < num_aligned_seqs; asi++) {            const Uint4 seq_idx = aligned_seqs[asi];            seq_weights->norm_seq_weights[seq_idx] =                 seq_weights->row_sigma[seq_idx] /                 (aligned_blocks->pos_extnt[position].right -                 aligned_blocks->pos_extnt[position].left + 1);#ifndef PSI_IGNORE_GAPS_IN_COLUMNS            weight_sum += seq_weights->norm_seq_weights[seq_idx];#endif        }        for (asi = 0; asi < num_aligned_seqs; asi++) {            const Uint4 seq_idx = aligned_seqs[asi];            seq_weights->norm_seq_weights[seq_idx] /= weight_sum;        }    } else {        for (asi = 0; asi < num_aligned_seqs; asi++) {            const Uint4 seq_idx = aligned_seqs[asi];            seq_weights->norm_seq_weights[seq_idx] =                 (1.0/(double) num_aligned_seqs);        }    }}static void_PSICalculateMatchWeights(    const PsiAlignmentData* alignment,  /* [in] */    Uint4 position,                     /* [in] */    const Uint4* aligned_seqs,          /* [in] */    Uint4 num_aligned_seqs,             /* [in] */    PsiSequenceWeights* seq_weights)    /* [out] */{    const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];    Uint4 asi = 0;   /**< index into array of aligned sequences */    for (asi = 0; asi < num_aligned_seqs; asi++) {        const Uint4 seq_idx = aligned_seqs[asi];        const Uint1 residue =            alignment->desc_matrix[seq_idx][position].letter;        seq_weights->match_weights[position][residue] +=             seq_weights->norm_seq_weights[seq_idx];        /* FIXME: this field is populated but never used */        if (residue == GAP) {            /*seq_weights->gapless_column_weights[position] +=             * seq_weights->a[seq_idx]; */            ;        }    }}/** Finds the sequences aligned in a given position. * @param alignment Multiple-alignment data [in] * @param position position of interest [in] * @param aligned_sequences array which will contain the indices of the * sequences aligned at the requested position. This array must have size * greater than or equal to the number of sequences + 1 in multiple alignment  * data structure (alignment->dimensions->num_seqs + 1) [out] * @return number of sequences aligned at the requested position */static Uint4_PSIGetAlignedSequencesForPosition(const PsiAlignmentData* alignment,                                    Uint4 position,                                   Uint4* aligned_sequences){#ifdef PSI_IGNORE_GAPS_IN_COLUMNS    const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];#endif    Uint4 retval = 0;    Uint4 i = 0;    ASSERT(alignment);    ASSERT(position < alignment->dimensions->query_sz);    ASSERT(aligned_sequences);    for (i = 0; i < alignment->dimensions->num_seqs + 1; i++) {        if ( !alignment->use_sequences[i] ) {            continue;        }#ifdef PSI_IGNORE_GAPS_IN_COLUMNS        if (alignment->desc_matrix[i][position].used &&            alignment->desc_matrix[i][position].letter != GAP) {#else        if (alignment->desc_matrix[i][position].used) {#endif            aligned_sequences[retval++] = i;        }    }    return retval;}/* The second parameter is not really const, it's updated! */static int_PSICheckSequenceWeights(const PsiAlignmentData* alignment,                         PsiSequenceWeights* seq_weights){    const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];    const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];    Uint4 pos = 0;   /* residue position (ie: column number) */    Uint4 res = 0;   /* residue */    ASSERT(alignment);    ASSERT(seq_weights);    for (pos = 0; pos < alignment->dimensions->query_sz; pos++) {        double running_total = 0.0;        if (alignment->match_seqs[pos] <= 1 ||            alignment->desc_matrix[kQueryIndex][pos].letter == X) {            continue;        }        for (res = 0; res < PSI_ALPHABET_SIZE; res++) {            running_total += seq_weights->match_weights[pos][res];        }        ASSERT(running_total < 0.99 || running_total > 1.01);#ifndef PSI_IGNORE_GAPS_IN_COLUMNS        /* Disperse method of spreading the gap weights */        for (res = 0; res < PSI_ALPHABET_SIZE; res++) {            if (seq_weights->std_prob[res] > kEpsilon) {                seq_weights->match_weights[pos][res] +=                     (seq_weights->match_weights[pos][GAP] *                      seq_weights->std_prob[res]);            }        }#endif        seq_weights->match_weights[pos][GAP] = 0.0;        running_total = 0.0;        for (res = 0; res < PSI_ALPHABET_SIZE; res++) {            running_total += seq_weights->match_weights[pos][res];        }        if (running_total < 0.99 || running_total > 1.01) {            return PSIERR_BADSEQWEIGHTS;        }    }    return PSI_SUCCESS;}/****************************************************************************//******* Compute residue frequencies stage of PSSM creation *****************//* port of posComputePseudoFreqs */intPSIComputeResidueFrequencies(const PsiAlignmentData* alignment,     /* [in] */                             const PsiSequenceWeights* seq_weights, /* [in] */                             const BlastScoreBlk* sbp,              /* [in] */                             const PsiAlignedBlock* aligned_blocks, /* [in] */                             const PSIBlastOptions* opts,           /* [in] */                             PsiMatrix* score_matrix)               /* [out] */{    const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];    Uint4 i = 0;             /* loop index into query positions */    SFreqRatios* freq_ratios;       /* matrix-specific frequency ratios */    if ( !alignment || !seq_weights || !sbp ||          !aligned_blocks || !opts || !score_matrix ) {        return PSIERR_BADPARAM;    }    freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);    for (i = 0; i < alignment->dimensions->query_sz; i++) {        Uint4 j = 0;     /* loop index into alphabet */        double info_sum = 0.0;  /* for information content - FIXME calculate                                   separately */        /* If there is an 'X' in the query sequence at position i... */        if (alignment->desc_matrix[kQueryIndex][i].letter == X) {            for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {                score_matrix->res_freqs[i][j] = 0.0;            }        } else {            for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {                if (seq_weights->std_prob[j] > kEpsilon) {                    Uint4 interval_size = 0; /* length of a block */                    Uint4 idx = 0;  /* loop index */                    double sigma = 0.0;    /* number of chars in an interval */                    double pseudo = 0.0;            /* intermediate term */                    double numerator = 0.0;         /* intermediate term */                    double denominator = 0.0;       /* intermediate term */                    double qOverPEstimate = 0.0;    /* intermediate term */                    /* changed to matrix specific ratios here May 2000 */                    for (idx = 0; idx < (Uint4) sbp->alphabet_size; idx++) {                        if (sbp->matrix[j][idx] != BLAST_SCORE_MIN) {                            pseudo += (seq_weights->match_weights[i][idx] *                                       freq_ratios->data[j][idx]);                        }                    }                    pseudo *= opts->pseudo_count;                    /* FIXME: document where this formula is coming from                     * (probably 2001 paper, p 2996) */                    sigma = seq_weights->sigma[i];                    interval_size = aligned_blocks->size[i];                    numerator = pseudo +                         ((sigma/interval_size-1) *                          seq_weights->match_weights[i][j] /                          seq_weights->std_prob[j]);                    denominator = (sigma/interval_size-1) +                        opts->pseudo_count;                    qOverPEstimate = numerator/denominator;                    /* Note artificial multiplication by standard probability                     * to normalize */                    score_matrix->res_freqs[i][j] = qOverPEstimate *                        seq_weights->std_prob[j];                    if ( qOverPEstimate != 0.0 &&                         (seq_weights->std_prob[j] > kEpsilon) ) {                        info_sum += qOverPEstimate * seq_weights->std_prob[j] *                            log(qOverPEstimate)/NCBIMATH_LN2;                    }                } else {                    score_matrix->res_freqs[i][j] = 0.0;                } /* END: if (seq_weights->std_prob[j] > kEpsilon) { */            } /* END: for (j = 0; j < sbp->alphabet_size; j++) */        }        /* FIXME: Should move out the calculation of information content to its         * own function (see posFreqsToInformation)! */        seq_weights->info_content[i] = info_sum;    }    freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);    return PSI_SUCCESS;}/****************************************************************************//**************** Convert residue frequencies to PSSM stage *****************//* FIXME: Answer questions   FIXME: need ideal_labmda, regular scoring matrix, length of query*/intPSIConvertResidueFreqsToPSSM(PsiMatrix* score_matrix,           /* [in|out] */                             const Uint1* query,                /* [in] */                             const BlastScoreBlk* sbp,          /* [in] */                             const double* std_probs)           /* [in] */{    const Uint4 X = AMINOACID_TO_NCBISTDAA['X'];    const Uint4 Star = AMINOACID_TO_NCBISTDAA['*'];    Uint4 i = 0;    Uint4 j = 0;    SFreqRatios* std_freq_ratios = NULL;    /* only needed when there are not                                               residue frequencies for a given                                                column */    double ideal_lambda;    if ( !score_matrix || !sbp || !std_probs )        return PSIERR_BADPARAM;    std_freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);    ideal_lambda = sbp->kbp_ideal->Lambda;    /* Each column is a position in the query */    for (i = 0; i < score_matrix->ncols; i++) {        /* True if all frequencies in column i are zero */        Boolean is_unaligned_column = TRUE;        const Uint4 query_res = query[i];        for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {            double qOverPEstimate = 0.0;            /* Division compensates for multiplication in previous function */            if (std_probs[j] > kEpsilon) {                qOverPEstimate =                     score_matrix->res_freqs[i][j] / std_probs[j];            }            if (is_unaligned_column && qOverPEstimate != 0.0) {                is_unaligned_column = FALSE;            }            /* Populate scaled matrix */            if (qOverPEstimate == 0.0 || std_probs[j] < kEpsilon) {                score_matrix->scaled_pssm[i][j] = BLAST_SCORE_MIN;            } else {                double tmp = log(qOverPEstimate)/ideal_lambda;                score_matrix->scaled_pssm[i][j] = (int)                    BLAST_Nint(kPsiScaleFactor * tmp);            }            if ( (j == X || j == Star) &&                 (sbp->matrix[query_res][X] != BLAST_SCORE_MIN) ) {                score_matrix->scaled_pssm[i][j] =                     sbp->matrix[query_res][j] * kPsiScaleFactor;            }        }        if (is_unaligned_column) {            for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {                score_matrix->pssm[i][j] = sbp->matrix[query_res][j];                if (sbp->matrix[query_res][j] != BLAST_SCORE_MIN) {                    double tmp =                         kPsiScaleFactor * std_freq_ratios->bit_scale_factor *                        log(std_freq_ratios->data[query_res][j])/NCBIMATH_LN2;                    score_matrix->scaled_pssm[i][j] = BLAST_Nint(tmp);                } else {                    score_matrix->scaled_pssm[i][j] = BLAST_SCORE_MIN;                }            }        }    }    std_freq_ratios = _PSIMatrixFrequencyRatiosFree(std_freq_ratios);    /* Set the last column of the matrix to BLAST_SCORE_MIN (why?) */    for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {        score_matrix->scaled_pssm[score_matrix->ncols-1][j] = BLAST_SCORE_MIN;    }    return PSI_SUCCESS;}/****************************************************************************//************************* Scaling of PSSM stage ****************************//** * @param initial_lambda_guess value to be used when calculating lambda if this * is not null [in] * @param sbp Score block structure where the calculated lambda and K will be * returned */void_PSIUpdateLambdaK(const int** pssm,              /* [in] */                  const Uint1* query,            /* [in] */                  Uint4 query_length,            /* [in] */                  const double* std_probs,       /* [in] */                  double* initial_lambda_guess,  /* [in] */                  BlastScoreBlk* sbp);           /* [in|out] */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?