blast_psi_priv.c
来自「ncbi源码」· C语言 代码 · 共 1,546 行 · 第 1/4 页
C
1,546 行
/* percentage of similarity of an aligned region between seq1 and seq2 */ { double percent_identity = (double) nidentical / align_length; fprintf(stderr, "%f%% for seqs %d and %d\n", percent_identity, seq_index1, seq_index2); if (percent_identity >= max_percent_identity) { int rv = _PSIPurgeAlignedRegion(alignment, seq_index2, align_start, align_start+align_length); ASSERT(rv == PSI_SUCCESS); } } }}/****************************************************************************//* Function prototypes */static void_PSIGetLeftExtents(const PsiAlignmentData* alignment, Uint4 seq_index);static void_PSIGetRightExtents(const PsiAlignmentData* alignment, Uint4 seq_index);static void_PSIComputePositionExtents(const PsiAlignmentData* alignment, Uint4 seq_index, PsiAlignedBlock* aligned_blocks);static void_PSIComputeAlignedRegionLengths(const PsiAlignmentData* alignment, PsiAlignedBlock* aligned_blocks);/****************************************************************************//******* Compute alignment extents stage of PSSM creation *******************//* posComputeExtents in posit.c */intPSIComputeAlignmentBlocks(const PsiAlignmentData* alignment, /* [in] */ PsiAlignedBlock* aligned_blocks) /* [out] */{ Uint4 s = 0; /* index on aligned sequences */ if ( !alignment || !aligned_blocks ) { return PSIERR_BADPARAM; } /* no need to compute extents for query sequence */ for (s = kQueryIndex + 1; s < alignment->dimensions->num_seqs + 1; s++) { if ( !alignment->use_sequences[s] ) continue; _PSIGetLeftExtents(alignment, s); _PSIGetRightExtents(alignment, s); _PSIComputePositionExtents(alignment, s, aligned_blocks); } _PSIComputeAlignedRegionLengths(alignment, aligned_blocks); return PSI_SUCCESS;}static void_PSIGetLeftExtents(const PsiAlignmentData* alignment, Uint4 seq_index){ const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-']; PsiDesc* sequence_position = NULL; Uint4 prev = 0; /* index for the first and previous position */ Uint4 curr = 0; /* index for the current position */ ASSERT(alignment); ASSERT(seq_index < alignment->dimensions->num_seqs + 1); ASSERT(alignment->use_sequences[seq_index]); sequence_position = alignment->desc_matrix[seq_index]; if (sequence_position[prev].used && sequence_position[prev].letter != GAP) { sequence_position[prev].extents.left = prev; } for (curr = prev + 1; curr < alignment->dimensions->query_sz; curr++, prev++) { if ( !sequence_position[curr].used ) { continue; } if (sequence_position[prev].used) { sequence_position[curr].extents.left = sequence_position[prev].extents.left; } else { sequence_position[curr].extents.left = curr; } }}static void_PSIGetRightExtents(const PsiAlignmentData* alignment, Uint4 seq_index){ const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-']; PsiDesc* sequence_position = NULL; Uint4 last = 0; /* index for the last position */ Uint4 curr = 0; /* index for the current position */ ASSERT(alignment); ASSERT(seq_index < alignment->dimensions->num_seqs + 1); ASSERT(alignment->use_sequences[seq_index]); sequence_position = alignment->desc_matrix[seq_index]; last = alignment->dimensions->query_sz - 1; if (sequence_position[last].used && sequence_position[last].letter != GAP) { sequence_position[last].extents.right = last; } for (curr = last - 1; curr >= 0; curr--, last--) { if ( !sequence_position[curr].used ) { continue; } if (sequence_position[last].used) { sequence_position[curr].extents.right = sequence_position[last].extents.right; } else { sequence_position[curr].extents.right = curr; } }}static void_PSIComputePositionExtents(const PsiAlignmentData* alignment, Uint4 seq_index, PsiAlignedBlock* aligned_blocks){#ifdef PSI_IGNORE_GAPS_IN_COLUMNS const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];#endif PsiDesc* sequence_position = NULL; Uint4 i = 0; ASSERT(aligned_blocks); ASSERT(alignment); ASSERT(seq_index < alignment->dimensions->num_seqs + 1); ASSERT(alignment->use_sequences[seq_index]); sequence_position = alignment->desc_matrix[seq_index]; for (i = 0; i < alignment->dimensions->query_sz; i++) {#ifdef PSI_IGNORE_GAPS_IN_COLUMNS if (sequence_position[i].used && sequence_position[i].letter != GAP) {#else if (sequence_position[i].used) {#endif aligned_blocks->pos_extnt[i].left = MAX(aligned_blocks->pos_extnt[i].left, sequence_position[i].extents.left); aligned_blocks->pos_extnt[i].right = MIN(aligned_blocks->pos_extnt[i].right, sequence_position[i].extents.right); } }}static void_PSIComputeAlignedRegionLengths(const PsiAlignmentData* alignment, PsiAlignedBlock* aligned_blocks){ const Uint1 X = AMINOACID_TO_NCBISTDAA['X']; PsiDesc* query_seq = NULL; Uint4 i = 0; ASSERT(alignment); ASSERT(aligned_blocks); for (i = 0; i < alignment->dimensions->query_sz; i++) { aligned_blocks->size[i] = aligned_blocks->pos_extnt[i].right - aligned_blocks->pos_extnt[i].left; } query_seq = alignment->desc_matrix[kQueryIndex]; /* Do not include X's in aligned region lengths */ for (i = 0; i < alignment->dimensions->query_sz; i++) { if (query_seq[i].letter == X) { Uint4 idx = 0; for (idx = 0; idx < i; idx++) { if ((Uint4)aligned_blocks->pos_extnt[idx].right >= i && query_seq[idx].letter != X) { aligned_blocks->size[idx]--; } } for (idx = alignment->dimensions->query_sz - 1; idx > i; idx--) { if ((Uint4)aligned_blocks->pos_extnt[idx].left <= i && query_seq[idx].letter != X) { aligned_blocks->size[idx]--; } } } }}/****************************************************************************/static Uint4_PSIGetAlignedSequencesForPosition( const PsiAlignmentData* alignment, Uint4 position, Uint4* aligned_sequences);static void_PSICalculatePositionWeightsAndIntervalSigmas( const PsiAlignmentData* alignment, /* [in] */ const PsiAlignedBlock* aligned_blocks, /* [in] */ Uint4 position, /* [in] */ const Uint4* aligned_seqs, /* [in] */ Uint4 num_aligned_seqs, /* [in] */ PsiSequenceWeights* seq_weights, /* [out] */ double* sigma, /* [out] */ double* interval_sigma); /* [out] */static void_PSICalculateNormalizedSequenceWeights( const PsiAlignedBlock* aligned_blocks, /* [in] */ Uint4 position, /* [in] */ const Uint4* aligned_seqs, /* [in] */ Uint4 num_aligned_seqs, /* [in] */ double sigma, /* [in] */ PsiSequenceWeights* seq_weights); /* [out] */static void_PSICalculateMatchWeights( const PsiAlignmentData* alignment, /* [in] */ Uint4 position, /* [in] */ const Uint4* aligned_seqs, /* [in] */ Uint4 num_aligned_seqs, /* [in] */ PsiSequenceWeights* seq_weights); /* [out] */static int_PSICheckSequenceWeights( const PsiAlignmentData* alignment, /* [in] */ PsiSequenceWeights* seq_weights); /* [in] *//****************************************************************************//******* Calculate sequence weights stage of PSSM creation ******************//* Needs the PsiAlignedBlock structure calculated in previous stage as well * as PsiAlignmentData structure */intPSIComputeSequenceWeights(const PsiAlignmentData* alignment, /* [in] */ const PsiAlignedBlock* aligned_blocks, /* [in] */ PsiSequenceWeights* seq_weights) /* [out] */{ Uint4* aligned_seqs = NULL; /* list of indices of sequences which participate in an aligned position */ Uint4 pos = 0; /* position index */ ASSERT(alignment); ASSERT(aligned_blocks); ASSERT(seq_weights); aligned_seqs = (Uint4*) calloc(alignment->dimensions->num_seqs + 1, sizeof(Uint4)); if ( !aligned_seqs ) { return PSIERR_OUTOFMEM; } for (pos = 0; pos < alignment->dimensions->query_sz; pos++) { Uint4 num_aligned_seqs = 0; double sigma = 0.0; /**< number of different characters occurring in matches within a multiple alignment block, excluding identical columns */ double interval_sigma = 0.0; /**< same as sigma but includes identical columns */ /* ignore positions of no interest */ if (aligned_blocks->size[pos] == 0 || alignment->match_seqs[pos] <= 1) { continue; } memset((void*)aligned_seqs, 0, sizeof(Uint4) * (alignment->dimensions->num_seqs + 1)); num_aligned_seqs = _PSIGetAlignedSequencesForPosition(alignment, pos, aligned_seqs); if (num_aligned_seqs <= 1) { continue; } /* Skipping optimization about redundant sets */ /* if (newSequenceSet) in posit.c */ memset((void*)seq_weights->norm_seq_weights, 0, sizeof(double)*(alignment->dimensions->num_seqs+1)); memset((void*)seq_weights->row_sigma, 0, sizeof(double)*(alignment->dimensions->num_seqs+1)); _PSICalculatePositionWeightsAndIntervalSigmas(alignment, aligned_blocks, pos, aligned_seqs, num_aligned_seqs, seq_weights, &sigma, &interval_sigma); seq_weights->sigma[pos] = interval_sigma; /* Populates norm_seq_weights */ _PSICalculateNormalizedSequenceWeights(aligned_blocks, pos, aligned_seqs, num_aligned_seqs, sigma, seq_weights); /* Uses seq_weights->norm_seq_weights to populate match_weights */ _PSICalculateMatchWeights(alignment, pos, aligned_seqs, num_aligned_seqs, seq_weights); } sfree(aligned_seqs); /* Check that the sequence weights add up to 1 in each column */ _PSICheckSequenceWeights(alignment, seq_weights); /* Return seq_weights->match_weigths, should free others? FIXME: need to * keep sequence weights for diagnostics for structure group */ return PSI_SUCCESS;}/* Calculates the position based weights using a modified version of the * Henikoff's algorithm presented in "Position-based sequence weights". * More documentation pending */static void_PSICalculatePositionWeightsAndIntervalSigmas( const PsiAlignmentData* alignment, /* [in] */ const PsiAlignedBlock* aligned_blocks, /* [in] */ Uint4 position, /* [in] */ const Uint4* aligned_seqs, /* [in] */ Uint4 num_aligned_seqs, /* [in] */ PsiSequenceWeights* seq_weights, /* [out] */ double* sigma, /* [out] */ double* interval_sigma) /* [out] */{ /** keeps track of how many occurrences of each residue was found at a * given position */ Uint4 residue_counts[PSI_ALPHABET_SIZE]; Uint4 num_distinct_residues = 0; /**< number of distinct residues found at a given position */ Uint4 i = 0; /**< index into aligned block for requested position */ ASSERT(seq_weights); ASSERT(sigma); ASSERT(interval_sigma); *sigma = 0.0; *interval_sigma = 0.0; for (i = 0; i < sizeof(residue_counts); i++) { residue_counts[i] = 0; } for (i = (Uint4) aligned_blocks->pos_extnt[position].left; i <= (Uint4) aligned_blocks->pos_extnt[position].right; i++) { Uint4 asi = 0; /**< index into array of aligned sequences */ /**** Count number of occurring residues at a position ****/ for (asi = 0; asi < num_aligned_seqs; asi++) { const Uint4 seq_idx = aligned_seqs[asi]; const Uint1 residue = alignment->desc_matrix[seq_idx][i].letter; if (residue_counts[residue] == 0) { num_distinct_residues++; } residue_counts[residue]++; } /**** END: Count number of occurring residues at a position ****/ /* FIXME: see Alejandro's email about this */ (*interval_sigma) += num_distinct_residues; if (num_distinct_residues > 1) { /* if this is not 1 */ (*sigma) += num_distinct_residues; }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?