📄 phi_lookup.c
字号:
/* * =========================================================================== * PRODUCTION $Log: phi_lookup.c,v $ * PRODUCTION Revision 1000.2 2004/06/01 18:08:29 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.14 * PRODUCTION * =========================================================================== *//* $Id: phi_lookup.c,v 1000.2 2004/06/01 18:08:29 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Ilya Dondoshansky * *//** @file phi_lookup.c * Functions for accessing the lookup table for PHI-BLAST * @todo FIXME needs doxygen comments and lines shorter than 80 characters */static char const rcsid[] = "$Id: phi_lookup.c,v 1000.2 2004/06/01 18:08:29 gouriano Exp $";#include <algo/blast/core/blast_def.h>#include <algo/blast/core/blast_util.h>#include <algo/blast/core/pattern.h>#include <algo/blast/core/phi_lookup.h>#include <algo/blast/core/blast_message.h>#define seedepsilon 0.00001#define allone ((1 << ALPHABET_SIZE) - 1)typedef struct seedSearchItems { double charMultiple[ALPHABET_SIZE]; double paramC; /*used in e-value computation*/ double paramLambda; /*used in e-value computation*/ double paramK; /*used in the bit score computation*/ Int4 cutoffScore; /*lower bound for what is a hit*/ double standardProb[ALPHABET_SIZE]; /*probability of each letter*/ char order[ASCII_SIZE]; char pchars[ALPHABET_SIZE+1]; char name_space[BUF_SIZE]; /*name of a pattern*/ char pat_space[PATTERN_SPACE_SIZE]; /*string description of pattern*/} seedSearchItems;/*Initialize the order of letters in the alphabet, the score matrix,and the row sums of the score matrix. matrixToFill is thescore matrix, program_flag says which variant of the program isused; is_dna tells whether the strings are DNA or protein*/static void init_order(Int4 **matrix, Int4 program_flag, Boolean is_dna, seedSearchItems *seedSearch){ Uint1 i, j; /*loop indices for matrix*/ Int4 *matrixRow; /*row of matrixToFill*/ double rowSum; /*sum of scaled substitution probabilities on matrixRow*/ if (is_dna) { seedSearch->order['A'] = 0; seedSearch->order['C'] = 1; seedSearch->order['G'] = 2; seedSearch->order['T'] = 3; } else { for (i = 0; i < ALPHABET_SIZE; i++) seedSearch->order[(Uint1)seedSearch->pchars[i]] = i; } if (program_flag == SEED_FLAG) { for (i = 0; i < ALPHABET_SIZE; i++) seedSearch->charMultiple[i] = 0; for (i = 0; i < ALPHABET_SIZE; i++) { if (seedSearch->standardProb[i] > seedepsilon) { matrixRow = matrix[i]; rowSum= 0; for (j = 0; j < ALPHABET_SIZE; j++) { if (seedSearch->standardProb[j] > seedepsilon) rowSum += seedSearch->standardProb[j]*exp(-(seedSearch->paramLambda)*matrixRow[j]); } seedSearch->charMultiple[i] = rowSum; } } }}/*Initialize occurrence probabilities for each amino acid*/static void initProbs(seedSearchItems * seedSearch){ double totalCount; /*for Robinson frequencies*/ seedSearch->pchars[0] = '-'; seedSearch->pchars[1] = 'A'; seedSearch->pchars[2] = 'B'; seedSearch->pchars[3] = 'C'; seedSearch->pchars[4] = 'D'; seedSearch->pchars[5] = 'E'; seedSearch->pchars[6] = 'F'; seedSearch->pchars[7] = 'G'; seedSearch->pchars[8] = 'H'; seedSearch->pchars[9] = 'I'; seedSearch->pchars[10] = 'K'; seedSearch->pchars[11] = 'L'; seedSearch->pchars[12] = 'M'; seedSearch->pchars[13] = 'N'; seedSearch->pchars[14] = 'P'; seedSearch->pchars[15] = 'Q'; seedSearch->pchars[16] = 'R'; seedSearch->pchars[17] = 'S'; seedSearch->pchars[18] = 'T'; seedSearch->pchars[19] = 'V'; seedSearch->pchars[20] = 'W'; seedSearch->pchars[21] = 'X'; seedSearch->pchars[22] = 'Y'; seedSearch->pchars[23] = 'Z'; seedSearch->pchars[24] = 'U'; seedSearch->pchars[25] = '*'; totalCount = 78.0 + 19.0 + 54.0 + 63.0 + 39.0 + 74.0 + 22.0 + 52.0 + 57.0 + 90.0 + 22.0 + 45.0 + 52.0 + 43.0 + 51.0 + 71.0 + 59.0 + 64.0 + 13.0 + 32.0; seedSearch->standardProb[0] = 0.0; seedSearch->standardProb[1] = 78.0/totalCount; /*A*/ seedSearch->standardProb[2] = 0.0; seedSearch->standardProb[3] = 19.0/totalCount; /*C*/ seedSearch->standardProb[4] = 54.0/totalCount; /*D*/ seedSearch->standardProb[5] = 63.0/totalCount; /*E*/ seedSearch->standardProb[6] = 39.0/totalCount; /*F*/ seedSearch->standardProb[7] = 74.0/totalCount; /*G*/ seedSearch->standardProb[8] = 22.0/totalCount; /*H*/ seedSearch->standardProb[9] = 52.0/totalCount; /*I*/ seedSearch->standardProb[10] = 57.0/totalCount; /*K*/ seedSearch->standardProb[11] = 90.0/totalCount; /*L*/ seedSearch->standardProb[12] = 22.0/totalCount; /*M*/ seedSearch->standardProb[13] = 45.0/totalCount; /*N*/ seedSearch->standardProb[14] = 52.0/totalCount; /*P*/ seedSearch->standardProb[15] = 43.0/totalCount; /*Q*/ seedSearch->standardProb[16] = 51.0/totalCount; /*R*/ seedSearch->standardProb[17] = 71.0/totalCount; /*S*/ seedSearch->standardProb[18] = 59.0/totalCount; /*T*/ seedSearch->standardProb[19] = 64.0/totalCount; /*V*/ seedSearch->standardProb[20] = 13.0/totalCount; /*W*/ seedSearch->standardProb[21] = 0.0; /*X*/ seedSearch->standardProb[22] = 32.0/totalCount; /*Y*/ seedSearch->standardProb[23] = 0.0; /*Z*/ seedSearch->standardProb[24] = 0.0; /*U*/}/*set uo matches for words that encode 4 DNA characters; figure out for each of 256 possible DNA 4-mers, where a prefix matches the pattern and where a suffix matches the pattern; store in prefixPos and suffixPos; mask has 1 bits for whatever lengths of stringthe pattern can match, mask2 has 4 1 bits corresponding tothe last 4 positions of a match; they are used todo the prefixPos and suffixPos claculations with bit arithmetic*/static void setting_tt(Int4* S, Int4 mask, Int4 mask2, Uint4* prefixPos, Uint4* suffixPos){ Int4 i; /*index over possible DNA encoded words, 4 bases per word*/ Int4 tmp; /*holds different mask combinations*/ Int4 maskLeftPlusOne; /*mask shifted left 1 plus 1; guarantees 1 1 character match effectively */ Uint1 a1, a2, a3, a4; /*four bases packed into an integer*/ maskLeftPlusOne = (mask << 1)+1; for (i = 0; i < ASCII_SIZE; i++) { /*find out the 4 bases packed in integer i*/ a1 = NCBI2NA_UNPACK_BASE(i, 3); a2 = NCBI2NA_UNPACK_BASE(i, 2); a3 = NCBI2NA_UNPACK_BASE(i, 1); a4 = NCBI2NA_UNPACK_BASE(i, 0); /*what positions match a prefix of a4 followed by a3*/ tmp = ((S[a4]>>1) | mask) & S[a3]; /*what positions match a prefix of a4 followed by a3 followed by a2*/ tmp = ((tmp >>1) | mask) & S[a2]; /*what positions match a prefix of a4, a3, a2,a1*/ prefixPos[i] = mask2 & ((tmp >>1) | mask) & S[a1]; /*what positions match a suffix of a2, a1*/ tmp = ((S[a1]<<1) | maskLeftPlusOne) & S[a2]; /* what positions match a suffix of a3, a2, a1*/ tmp = ((tmp <<1) | maskLeftPlusOne) & S[a3]; /*what positions match a suffix of a4, a3, a2, a1*/ suffixPos[i] = ((((tmp <<1) | maskLeftPlusOne) & S[a4]) << 1) | maskLeftPlusOne; }}/*initialize mask and other arrays for DNA patterns*/static void init_pattern_DNA(patternSearchItems *patternSearch){ Int4 mask1; /*mask for one word in a set position*/ Int4 compositeMask; /*superimposed mask1 in 4 adjacent positions*/ Int4 wordIndex; /*index over words in pattern*/ if (patternSearch->flagPatternLength != ONE_WORD_PATTERN) { for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) { mask1 = patternSearch->match_maskL[wordIndex]; compositeMask = mask1 + (mask1>>1)+(mask1>>2)+(mask1>>3); setting_tt(patternSearch->SLL[wordIndex], patternSearch->match_maskL[wordIndex], compositeMask, patternSearch->DNAprefixSLL[wordIndex], patternSearch->DNAsuffixSLL[wordIndex]); } } else { compositeMask = patternSearch->match_mask + (patternSearch->match_mask>>1) + (patternSearch->match_mask>>2) + (patternSearch->match_mask>>3); patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAwhichPrefixPositions; patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAwhichSuffixPositions; setting_tt(patternSearch->whichPositionsByCharacter, patternSearch->match_mask, compositeMask, patternSearch->DNAwhichPrefixPositions, patternSearch->DNAwhichSuffixPositions); }}/*length is the length of inputPattern, maxLength is a limit on how long inputPattern can get; return the final length of the pattern or -1 if too long*/static Int4 expanding(Int4 *inputPatternMasked, Uint1 *inputPattern, Int4 length, Int4 maxLength){ Int4 i, j; /*pattern indices*/ Int4 numPos; /*number of positions index*/ Int4 k, t; /*loop indices*/ Int4 recReturnValue1, recReturnValue2; /*values returned from recursive calls*/ Int4 thisPlaceMasked; /*value of one place in inputPatternMasked*/ Int4 tempPatternMask[MaxP]; /*used as a local representation of part of inputPatternMasked*/ Uint1 tempPattern[MaxP]; /*used as a local representation of part of inputPattern*/ for (i = 0; i < length; i++) { thisPlaceMasked = -inputPatternMasked[i]; if (thisPlaceMasked > 0) { /*represented variable wildcard*/ inputPatternMasked[i] = allone; for (j = 0; j < length; j++) { /*use this to keep track of pattern*/ tempPatternMask[j] = inputPatternMasked[j]; tempPattern[j] = inputPattern[j]; } recReturnValue2 = recReturnValue1 = expanding(inputPatternMasked, inputPattern, length, maxLength); if (recReturnValue1 == -1) return -1; for (numPos = 0; numPos <= thisPlaceMasked; numPos++) { if (numPos == 1) continue; for (k = 0; k < length; k++) { if (k == i) { for (t = 0; t < numPos; t++) { inputPatternMasked[recReturnValue1++] = allone; if (recReturnValue1 >= maxLength) return(-1); } } else { inputPatternMasked[recReturnValue1] = tempPatternMask[k]; inputPattern[recReturnValue1++] = tempPattern[k]; if (recReturnValue1 >= maxLength) return(-1); } if (recReturnValue1 >= maxLength) return (-1); } recReturnValue1 = expanding(&inputPatternMasked[recReturnValue2], &inputPattern[recReturnValue2], length + numPos - 1, maxLength - recReturnValue2); if (recReturnValue1 == -1) return -1; recReturnValue2 += recReturnValue1; recReturnValue1 = recReturnValue2; } return recReturnValue1; } } return length;}/*Pack the next length bytes of inputPattern into a bit vector where the bit is 1 if and only if the byte is non-0 Returns packed bit vector*/static Int4 packing(Uint1 *inputPattern, Int4 length){ Int4 i; /*loop index*/ Int4 returnValue = 0; /*value to return*/ for (i = 0; i < length; i++) { if (inputPattern[i]) returnValue += (1 << i); } return returnValue;}/*Pack the bit representation of the inputPattern into the array patternSearch->match_maskL numPlaces is the number of positions in inputPattern Also packs patternSearch->bitPatternByLetter */static void longpacking(Int4 numPlaces, Uint1 *inputPattern, patternSearchItems *patternSearch){ Int4 charIndex; /*index over characters in alphabet*/ Int4 bitPattern; /*bit pattern for one word to pack*/ Int4 i; /*loop index over places*/ Int4 wordIndex; /*loop counter over words to pack into*/ patternSearch->numWords = (numPlaces-1) / BITS_PACKED_PER_WORD +1; for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) { bitPattern = 0; for (i = 0; i < BITS_PACKED_PER_WORD; i++) { if (inputPattern[wordIndex*BITS_PACKED_PER_WORD+i]) bitPattern += (1 << i); } patternSearch->match_maskL[wordIndex] = bitPattern; } for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) { for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) { bitPattern = 0; for (i = 0; i < BITS_PACKED_PER_WORD; i++) { if ((1<< charIndex) & patternSearch->inputPatternMasked[wordIndex*BITS_PACKED_PER_WORD + i]) bitPattern = bitPattern | (1 << i); } patternSearch->bitPatternByLetter[charIndex][wordIndex] = bitPattern; } }}/*Return the number of 1 bits in the base 2 representation of a*/static Int4 num_of_one(Int4 a){ Int4 returnValue; returnValue = 0; while (a > 0) { if (a % 2 == 1) returnValue++; a = (a >> 1); } return returnValue;}/*Sets up field in patternSearch when pattern is very long*/static void longpacking2(Int4 *inputPatternMasked, Int4 numPlacesInPattern, patternSearchItems *patternSearch){ Int4 placeIndex; /*index over places in pattern rep.*/ Int4 wordIndex; /*index over words*/ Int4 placeInWord, placeInWord2; /*index for places in a single word*/ Int4 charIndex; /*index over characters in alphabet*/ Int4 oneWordMask; /*mask of matching characters for one word in pattern representation*/ double patternWordProbability; double most_specific; /*lowest probability of a word in the pattern*/ Int4 *oneWordSLL; /*holds patternSearch->SLL for one word*/ most_specific = 1.0; patternSearch->whichMostSpecific = 0; patternWordProbability = 1.0; for (placeIndex = 0, wordIndex = 0, placeInWord=0; placeIndex <= numPlacesInPattern; placeIndex++, placeInWord++) { if (placeIndex==numPlacesInPattern || inputPatternMasked[placeIndex] < 0 || placeInWord == BITS_PACKED_PER_WORD ) { patternSearch->match_maskL[wordIndex] = 1 << (placeInWord-1); oneWordSLL = patternSearch->SLL[wordIndex]; for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) { oneWordMask = 0; for (placeInWord2 = 0; placeInWord2 < placeInWord; placeInWord2++) { if ((1<< charIndex) & inputPatternMasked[placeIndex-placeInWord+placeInWord2]) oneWordMask |= (1 << placeInWord2); } oneWordSLL[charIndex] = oneWordMask; } patternSearch->numPlacesInWord[wordIndex] = placeInWord; if (patternWordProbability < most_specific) { most_specific = patternWordProbability; patternSearch->whichMostSpecific = wordIndex;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -