blast_psi_priv.c

来自「ncbi源码」· C语言 代码 · 共 1,546 行 · 第 1/4 页

C
1,546
字号
        /* percentage of similarity of an aligned region between seq1 and            seq2 */        {            double percent_identity = (double) nidentical / align_length;                fprintf(stderr, "%f%% for seqs %d and %d\n",                        percent_identity, seq_index1, seq_index2);            if (percent_identity >= max_percent_identity) {                int rv = _PSIPurgeAlignedRegion(alignment, seq_index2,                                                 align_start,                                                 align_start+align_length);                ASSERT(rv == PSI_SUCCESS);            }        }    }}/****************************************************************************//* Function prototypes */static void_PSIGetLeftExtents(const PsiAlignmentData* alignment, Uint4 seq_index);static void_PSIGetRightExtents(const PsiAlignmentData* alignment, Uint4 seq_index);static void_PSIComputePositionExtents(const PsiAlignmentData* alignment,                            Uint4 seq_index,                           PsiAlignedBlock* aligned_blocks);static void_PSIComputeAlignedRegionLengths(const PsiAlignmentData* alignment,                                PsiAlignedBlock* aligned_blocks);/****************************************************************************//******* Compute alignment extents stage of PSSM creation *******************//* posComputeExtents in posit.c */intPSIComputeAlignmentBlocks(const PsiAlignmentData* alignment,        /* [in] */                          PsiAlignedBlock* aligned_blocks)          /* [out] */{    Uint4 s = 0;     /* index on aligned sequences */    if ( !alignment || !aligned_blocks ) {        return PSIERR_BADPARAM;    }    /* no need to compute extents for query sequence */    for (s = kQueryIndex + 1; s < alignment->dimensions->num_seqs + 1; s++) {        if ( !alignment->use_sequences[s] )            continue;        _PSIGetLeftExtents(alignment, s);        _PSIGetRightExtents(alignment, s);        _PSIComputePositionExtents(alignment, s, aligned_blocks);    }    _PSIComputeAlignedRegionLengths(alignment, aligned_blocks);    return PSI_SUCCESS;}static void_PSIGetLeftExtents(const PsiAlignmentData* alignment, Uint4 seq_index){    const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];    PsiDesc* sequence_position = NULL;    Uint4 prev = 0;  /* index for the first and previous position */    Uint4 curr = 0;  /* index for the current position */    ASSERT(alignment);    ASSERT(seq_index < alignment->dimensions->num_seqs + 1);    ASSERT(alignment->use_sequences[seq_index]);    sequence_position = alignment->desc_matrix[seq_index];    if (sequence_position[prev].used && sequence_position[prev].letter != GAP) {        sequence_position[prev].extents.left = prev;    }    for (curr = prev + 1; curr < alignment->dimensions->query_sz;          curr++, prev++) {        if ( !sequence_position[curr].used ) {            continue;        }        if (sequence_position[prev].used) {            sequence_position[curr].extents.left =                sequence_position[prev].extents.left;        } else {            sequence_position[curr].extents.left = curr;        }    }}static void_PSIGetRightExtents(const PsiAlignmentData* alignment, Uint4 seq_index){    const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];    PsiDesc* sequence_position = NULL;    Uint4 last = 0;      /* index for the last position */    Uint4 curr = 0;      /* index for the current position */    ASSERT(alignment);    ASSERT(seq_index < alignment->dimensions->num_seqs + 1);    ASSERT(alignment->use_sequences[seq_index]);    sequence_position = alignment->desc_matrix[seq_index];    last = alignment->dimensions->query_sz - 1;    if (sequence_position[last].used && sequence_position[last].letter != GAP) {        sequence_position[last].extents.right = last;    }    for (curr = last - 1; curr >= 0; curr--, last--) {        if ( !sequence_position[curr].used ) {            continue;        }        if (sequence_position[last].used) {            sequence_position[curr].extents.right =                sequence_position[last].extents.right;        } else {            sequence_position[curr].extents.right = curr;        }    }}static void_PSIComputePositionExtents(const PsiAlignmentData* alignment,                            Uint4 seq_index,                           PsiAlignedBlock* aligned_blocks){#ifdef PSI_IGNORE_GAPS_IN_COLUMNS    const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];#endif    PsiDesc* sequence_position = NULL;    Uint4 i = 0;    ASSERT(aligned_blocks);    ASSERT(alignment);    ASSERT(seq_index < alignment->dimensions->num_seqs + 1);    ASSERT(alignment->use_sequences[seq_index]);    sequence_position = alignment->desc_matrix[seq_index];    for (i = 0; i < alignment->dimensions->query_sz; i++) {#ifdef PSI_IGNORE_GAPS_IN_COLUMNS        if (sequence_position[i].used &&             sequence_position[i].letter != GAP) {#else        if (sequence_position[i].used) {#endif            aligned_blocks->pos_extnt[i].left =                 MAX(aligned_blocks->pos_extnt[i].left,                     sequence_position[i].extents.left);            aligned_blocks->pos_extnt[i].right =                 MIN(aligned_blocks->pos_extnt[i].right,                     sequence_position[i].extents.right);        }    }}static void_PSIComputeAlignedRegionLengths(const PsiAlignmentData* alignment,                                PsiAlignedBlock* aligned_blocks){    const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];    PsiDesc* query_seq = NULL;    Uint4 i = 0;        ASSERT(alignment);    ASSERT(aligned_blocks);    for (i = 0; i < alignment->dimensions->query_sz; i++) {        aligned_blocks->size[i] = aligned_blocks->pos_extnt[i].right -                                    aligned_blocks->pos_extnt[i].left;    }    query_seq = alignment->desc_matrix[kQueryIndex];    /* Do not include X's in aligned region lengths */    for (i = 0; i < alignment->dimensions->query_sz; i++) {        if (query_seq[i].letter == X) {            Uint4 idx = 0;            for (idx = 0; idx < i; idx++) {                if ((Uint4)aligned_blocks->pos_extnt[idx].right >= i &&                    query_seq[idx].letter != X) {                    aligned_blocks->size[idx]--;                }            }            for (idx = alignment->dimensions->query_sz - 1; idx > i; idx--) {                if ((Uint4)aligned_blocks->pos_extnt[idx].left <= i &&                    query_seq[idx].letter != X) {                    aligned_blocks->size[idx]--;                }            }        }    }}/****************************************************************************/static Uint4_PSIGetAlignedSequencesForPosition(    const PsiAlignmentData* alignment,     Uint4 position,    Uint4* aligned_sequences);static void_PSICalculatePositionWeightsAndIntervalSigmas(    const PsiAlignmentData* alignment,     /* [in] */    const PsiAlignedBlock* aligned_blocks, /* [in] */    Uint4 position,                        /* [in] */    const Uint4* aligned_seqs,             /* [in] */    Uint4 num_aligned_seqs,                /* [in] */    PsiSequenceWeights* seq_weights,       /* [out] */    double* sigma,                         /* [out] */    double* interval_sigma);               /* [out] */static void_PSICalculateNormalizedSequenceWeights(    const PsiAlignedBlock* aligned_blocks, /* [in] */    Uint4 position,                        /* [in] */    const Uint4* aligned_seqs,             /* [in] */    Uint4 num_aligned_seqs,                /* [in] */    double sigma,                          /* [in] */    PsiSequenceWeights* seq_weights);      /* [out] */static void_PSICalculateMatchWeights(    const PsiAlignmentData* alignment,  /* [in] */    Uint4 position,                     /* [in] */    const Uint4* aligned_seqs,          /* [in] */    Uint4 num_aligned_seqs,             /* [in] */    PsiSequenceWeights* seq_weights);   /* [out] */static int_PSICheckSequenceWeights(    const PsiAlignmentData* alignment,         /* [in] */    PsiSequenceWeights* seq_weights);          /* [in] *//****************************************************************************//******* Calculate sequence weights stage of PSSM creation ******************//* Needs the PsiAlignedBlock structure calculated in previous stage as well * as PsiAlignmentData structure */intPSIComputeSequenceWeights(const PsiAlignmentData* alignment,        /* [in] */                          const PsiAlignedBlock* aligned_blocks,    /* [in] */                          PsiSequenceWeights* seq_weights)          /* [out] */{    Uint4* aligned_seqs = NULL;     /* list of indices of sequences                                              which participate in an                                              aligned position */    Uint4 pos = 0;                  /* position index */    ASSERT(alignment);    ASSERT(aligned_blocks);    ASSERT(seq_weights);    aligned_seqs = (Uint4*) calloc(alignment->dimensions->num_seqs + 1,                                          sizeof(Uint4));    if ( !aligned_seqs ) {        return PSIERR_OUTOFMEM;    }    for (pos = 0; pos < alignment->dimensions->query_sz; pos++) {        Uint4 num_aligned_seqs = 0;        double sigma = 0.0; /**< number of different characters                                  occurring in matches within a multiple                                  alignment block, excluding identical                                  columns */        double interval_sigma = 0.0; /**< same as sigma but includes identical                                  columns */        /* ignore positions of no interest */        if (aligned_blocks->size[pos] == 0 || alignment->match_seqs[pos] <= 1) {            continue;        }        memset((void*)aligned_seqs, 0,                sizeof(Uint4) * (alignment->dimensions->num_seqs + 1));        num_aligned_seqs = _PSIGetAlignedSequencesForPosition(alignment, pos,                                                              aligned_seqs);        if (num_aligned_seqs <= 1) {            continue;        }        /* Skipping optimization about redundant sets */        /* if (newSequenceSet) in posit.c */        memset((void*)seq_weights->norm_seq_weights, 0,                sizeof(double)*(alignment->dimensions->num_seqs+1));        memset((void*)seq_weights->row_sigma, 0,               sizeof(double)*(alignment->dimensions->num_seqs+1));        _PSICalculatePositionWeightsAndIntervalSigmas(alignment,                                      aligned_blocks, pos, aligned_seqs,                                      num_aligned_seqs, seq_weights,                                      &sigma, &interval_sigma);        seq_weights->sigma[pos] = interval_sigma;        /* Populates norm_seq_weights */        _PSICalculateNormalizedSequenceWeights(aligned_blocks, pos,                                  aligned_seqs, num_aligned_seqs, sigma,                                 seq_weights);        /* Uses seq_weights->norm_seq_weights to populate match_weights */        _PSICalculateMatchWeights(alignment, pos, aligned_seqs,                                  num_aligned_seqs, seq_weights);    }    sfree(aligned_seqs);    /* Check that the sequence weights add up to 1 in each column */    _PSICheckSequenceWeights(alignment, seq_weights);    /* Return seq_weights->match_weigths, should free others? FIXME: need to     * keep sequence weights for diagnostics for structure group */    return PSI_SUCCESS;}/* Calculates the position based weights using a modified version of the * Henikoff's algorithm presented in "Position-based sequence weights". * More documentation pending */static void_PSICalculatePositionWeightsAndIntervalSigmas(    const PsiAlignmentData* alignment,     /* [in] */    const PsiAlignedBlock* aligned_blocks, /* [in] */    Uint4 position,                        /* [in] */    const Uint4* aligned_seqs,             /* [in] */    Uint4 num_aligned_seqs,                /* [in] */    PsiSequenceWeights* seq_weights,       /* [out] */    double* sigma,                         /* [out] */    double* interval_sigma)                /* [out] */{    /** keeps track of how many occurrences of each residue was found at a     * given position */    Uint4 residue_counts[PSI_ALPHABET_SIZE];    Uint4 num_distinct_residues = 0; /**< number of distinct                                               residues found at a given                                               position */    Uint4 i = 0;         /**< index into aligned block for requested                                   position */    ASSERT(seq_weights);    ASSERT(sigma);    ASSERT(interval_sigma);    *sigma = 0.0;    *interval_sigma = 0.0;    for (i = 0; i < sizeof(residue_counts); i++) {        residue_counts[i] = 0;    }    for (i = (Uint4) aligned_blocks->pos_extnt[position].left;          i <= (Uint4) aligned_blocks->pos_extnt[position].right; i++) {        Uint4 asi = 0;   /**< index into array of aligned sequences */        /**** Count number of occurring residues at a position ****/        for (asi = 0; asi < num_aligned_seqs; asi++) {            const Uint4 seq_idx = aligned_seqs[asi];            const Uint1 residue =                 alignment->desc_matrix[seq_idx][i].letter;            if (residue_counts[residue] == 0) {                num_distinct_residues++;            }            residue_counts[residue]++;        }        /**** END: Count number of occurring residues at a position ****/        /* FIXME: see Alejandro's email about this */        (*interval_sigma) += num_distinct_residues;        if (num_distinct_residues > 1) {    /* if this is not 1 */            (*sigma) += num_distinct_residues;        }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?