⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 phi_lookup.c

📁 ncbi源码
💻 C
📖 第 1 页 / 共 2 页
字号:
/* * =========================================================================== * PRODUCTION $Log: phi_lookup.c,v $ * PRODUCTION Revision 1000.2  2004/06/01 18:08:29  gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.14 * PRODUCTION * =========================================================================== *//* $Id: phi_lookup.c,v 1000.2 2004/06/01 18:08:29 gouriano Exp $ * =========================================================================== * *                            PUBLIC DOMAIN NOTICE *               National Center for Biotechnology Information * *  This software/database is a "United States Government Work" under the *  terms of the United States Copyright Act.  It was written as part of *  the author's offical duties as a United States Government employee and *  thus cannot be copyrighted.  This software/database is freely available *  to the public for use. The National Library of Medicine and the U.S. *  Government have not placed any restriction on its use or reproduction. * *  Although all reasonable efforts have been taken to ensure the accuracy *  and reliability of the software and data, the NLM and the U.S. *  Government do not and cannot warrant the performance or results that *  may be obtained by using this software or data. The NLM and the U.S. *  Government disclaim all warranties, express or implied, including *  warranties of performance, merchantability or fitness for any particular *  purpose. * *  Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Ilya Dondoshansky * *//** @file phi_lookup.c * Functions for accessing the lookup table for PHI-BLAST * @todo FIXME needs doxygen comments and lines shorter than 80 characters */static char const rcsid[] =     "$Id: phi_lookup.c,v 1000.2 2004/06/01 18:08:29 gouriano Exp $";#include <algo/blast/core/blast_def.h>#include <algo/blast/core/blast_util.h>#include <algo/blast/core/pattern.h>#include <algo/blast/core/phi_lookup.h>#include <algo/blast/core/blast_message.h>#define seedepsilon 0.00001#define allone  ((1 << ALPHABET_SIZE) - 1)typedef struct seedSearchItems {    double  charMultiple[ALPHABET_SIZE];    double  paramC; /*used in e-value computation*/    double  paramLambda; /*used in e-value computation*/    double  paramK; /*used in the bit score computation*/    Int4         cutoffScore; /*lower bound for what is a hit*/    double  standardProb[ALPHABET_SIZE]; /*probability of each letter*/    char         order[ASCII_SIZE];    char         pchars[ALPHABET_SIZE+1];    char         name_space[BUF_SIZE];  /*name of a pattern*/    char         pat_space[PATTERN_SPACE_SIZE];  /*string description                                                   of pattern*/} seedSearchItems;/*Initialize the order of letters in the alphabet, the score matrix,and the row sums of the score matrix. matrixToFill is thescore matrix, program_flag says which variant of the program isused; is_dna tells whether the strings are DNA or protein*/static void init_order(Int4 **matrix, Int4 program_flag, Boolean is_dna, seedSearchItems *seedSearch){    Uint1 i, j; /*loop indices for matrix*/     Int4 *matrixRow; /*row of matrixToFill*/     double rowSum; /*sum of scaled substitution probabilities on matrixRow*/        if (is_dna) {      seedSearch->order['A'] = 0;       seedSearch->order['C'] = 1;      seedSearch->order['G'] = 2;       seedSearch->order['T'] = 3;    }     else {      for (i = 0; i < ALPHABET_SIZE; i++) 	seedSearch->order[(Uint1)seedSearch->pchars[i]] = i;    }    if (program_flag == SEED_FLAG) {      for (i = 0; i < ALPHABET_SIZE; i++)         seedSearch->charMultiple[i] = 0;      for (i = 0; i < ALPHABET_SIZE; i++) {	if (seedSearch->standardProb[i] > seedepsilon) {	  matrixRow = matrix[i];	  rowSum= 0;	  for (j = 0; j < ALPHABET_SIZE; j++) {	    if (seedSearch->standardProb[j] > seedepsilon) 	      rowSum += seedSearch->standardProb[j]*exp(-(seedSearch->paramLambda)*matrixRow[j]);	  }	  seedSearch->charMultiple[i] = rowSum;	}      }    }}/*Initialize occurrence probabilities for each amino acid*/static void initProbs(seedSearchItems * seedSearch){   double totalCount;  /*for Robinson frequencies*/   seedSearch->pchars[0] = '-';   seedSearch->pchars[1] = 'A';   seedSearch->pchars[2] = 'B';   seedSearch->pchars[3] = 'C';   seedSearch->pchars[4] = 'D';   seedSearch->pchars[5] = 'E';   seedSearch->pchars[6] = 'F';   seedSearch->pchars[7] = 'G';   seedSearch->pchars[8] = 'H';   seedSearch->pchars[9] = 'I';   seedSearch->pchars[10] = 'K';   seedSearch->pchars[11] = 'L';   seedSearch->pchars[12] = 'M';   seedSearch->pchars[13] = 'N';   seedSearch->pchars[14] = 'P';   seedSearch->pchars[15] = 'Q';   seedSearch->pchars[16] = 'R';   seedSearch->pchars[17] = 'S';   seedSearch->pchars[18] = 'T';   seedSearch->pchars[19] = 'V';   seedSearch->pchars[20] = 'W';   seedSearch->pchars[21] = 'X';   seedSearch->pchars[22] = 'Y';   seedSearch->pchars[23] = 'Z';   seedSearch->pchars[24] = 'U';   seedSearch->pchars[25] = '*';   totalCount = 78.0 + 19.0 + 54.0 + 63.0 + 39.0 +    74.0 + 22.0 + 52.0 + 57.0 + 90.0 + 22.0 + 45.0 + 52.0 +     43.0 + 51.0 + 71.0 + 59.0 + 64.0 + 13.0 + 32.0;   seedSearch->standardProb[0] = 0.0;   seedSearch->standardProb[1] = 78.0/totalCount; /*A*/   seedSearch->standardProb[2] = 0.0;   seedSearch->standardProb[3] = 19.0/totalCount; /*C*/   seedSearch->standardProb[4] = 54.0/totalCount; /*D*/   seedSearch->standardProb[5] = 63.0/totalCount; /*E*/   seedSearch->standardProb[6] = 39.0/totalCount; /*F*/   seedSearch->standardProb[7] = 74.0/totalCount; /*G*/   seedSearch->standardProb[8] = 22.0/totalCount; /*H*/   seedSearch->standardProb[9] = 52.0/totalCount; /*I*/   seedSearch->standardProb[10] = 57.0/totalCount; /*K*/   seedSearch->standardProb[11] = 90.0/totalCount; /*L*/   seedSearch->standardProb[12] = 22.0/totalCount; /*M*/   seedSearch->standardProb[13] = 45.0/totalCount; /*N*/   seedSearch->standardProb[14] = 52.0/totalCount; /*P*/   seedSearch->standardProb[15] = 43.0/totalCount; /*Q*/   seedSearch->standardProb[16] = 51.0/totalCount; /*R*/   seedSearch->standardProb[17] = 71.0/totalCount; /*S*/   seedSearch->standardProb[18] = 59.0/totalCount; /*T*/   seedSearch->standardProb[19] = 64.0/totalCount; /*V*/   seedSearch->standardProb[20] = 13.0/totalCount; /*W*/   seedSearch->standardProb[21] = 0.0;   /*X*/   seedSearch->standardProb[22] = 32.0/totalCount;   /*Y*/   seedSearch->standardProb[23] = 0.0;   /*Z*/   seedSearch->standardProb[24] = 0.0;   /*U*/}/*set uo matches for words that encode 4 DNA characters; figure out  for each of 256 possible DNA 4-mers, where a prefix matches the pattern and where a suffix matches the pattern; store in prefixPos and suffixPos; mask has 1 bits for whatever lengths of stringthe pattern can match, mask2 has 4 1 bits corresponding tothe last 4 positions of a match; they are used todo the prefixPos and suffixPos claculations with bit arithmetic*/static void setting_tt(Int4* S, Int4 mask, Int4 mask2, Uint4* prefixPos, 		       Uint4* suffixPos){  Int4 i; /*index over possible DNA encoded words, 4 bases per word*/  Int4 tmp; /*holds different mask combinations*/  Int4 maskLeftPlusOne; /*mask shifted left 1 plus 1; guarantees 1                           1 character match effectively */  Uint1 a1, a2, a3, a4;  /*four bases packed into an integer*/  maskLeftPlusOne = (mask << 1)+1;  for (i = 0; i < ASCII_SIZE; i++) {    /*find out the 4 bases packed in integer i*/    a1 = NCBI2NA_UNPACK_BASE(i, 3);    a2 = NCBI2NA_UNPACK_BASE(i, 2);    a3 = NCBI2NA_UNPACK_BASE(i, 1);    a4 = NCBI2NA_UNPACK_BASE(i, 0);    /*what positions match a prefix of a4 followed by a3*/    tmp = ((S[a4]>>1) | mask) & S[a3];    /*what positions match a prefix of a4 followed by a3 followed by a2*/    tmp = ((tmp >>1) | mask) & S[a2];    /*what positions match a prefix of a4, a3, a2,a1*/    prefixPos[i] = mask2 & ((tmp >>1) | mask) & S[a1];        /*what positions match a suffix of a2, a1*/    tmp = ((S[a1]<<1) | maskLeftPlusOne) & S[a2];    /* what positions match a suffix of a3, a2, a1*/    tmp = ((tmp <<1) | maskLeftPlusOne) & S[a3];    /*what positions match a suffix of a4, a3, a2, a1*/    suffixPos[i] = ((((tmp <<1) | maskLeftPlusOne) & S[a4]) << 1) | maskLeftPlusOne;  }}/*initialize mask and other arrays for DNA patterns*/static void init_pattern_DNA(patternSearchItems *patternSearch){  Int4 mask1; /*mask for one word in a set position*/  Int4 compositeMask; /*superimposed mask1 in 4 adjacent positions*/  Int4 wordIndex; /*index over words in pattern*/  if (patternSearch->flagPatternLength != ONE_WORD_PATTERN) {    for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {      mask1 = patternSearch->match_maskL[wordIndex];      compositeMask = mask1 + (mask1>>1)+(mask1>>2)+(mask1>>3);      setting_tt(patternSearch->SLL[wordIndex],       patternSearch->match_maskL[wordIndex], 	 compositeMask, patternSearch->DNAprefixSLL[wordIndex], patternSearch->DNAsuffixSLL[wordIndex]);    }  }   else {    compositeMask = patternSearch->match_mask +       (patternSearch->match_mask>>1) +       (patternSearch->match_mask>>2) + (patternSearch->match_mask>>3);     patternSearch->DNAwhichPrefixPosPtr = patternSearch->DNAwhichPrefixPositions;     patternSearch->DNAwhichSuffixPosPtr = patternSearch->DNAwhichSuffixPositions;    setting_tt(patternSearch->whichPositionsByCharacter,     patternSearch->match_mask, compositeMask,     patternSearch->DNAwhichPrefixPositions, patternSearch->DNAwhichSuffixPositions);  }}/*length is the length of inputPattern, maxLength is a limit on   how long inputPattern can get;   return the final length of the pattern or -1 if too long*/static Int4 expanding(Int4 *inputPatternMasked, Uint1 *inputPattern, 		      Int4 length, Int4 maxLength){    Int4 i, j; /*pattern indices*/    Int4 numPos; /*number of positions index*/    Int4  k, t; /*loop indices*/    Int4 recReturnValue1, recReturnValue2; /*values returned from                                             recursive calls*/    Int4 thisPlaceMasked; /*value of one place in inputPatternMasked*/    Int4 tempPatternMask[MaxP]; /*used as a local representation of                               part of inputPatternMasked*/    Uint1 tempPattern[MaxP]; /*used as a local representation of part of                               inputPattern*/    for (i = 0; i < length; i++) {      thisPlaceMasked = -inputPatternMasked[i];      if (thisPlaceMasked > 0) {  /*represented variable wildcard*/	inputPatternMasked[i] = allone;	for (j = 0; j < length; j++) {	  /*use this to keep track of pattern*/	  tempPatternMask[j] = inputPatternMasked[j]; 	  tempPattern[j] = inputPattern[j];	}	recReturnValue2 = recReturnValue1 = 	  expanding(inputPatternMasked, inputPattern, length, maxLength);	if (recReturnValue1 == -1)	  return -1;	for (numPos = 0; numPos <= thisPlaceMasked; numPos++) {	  if (numPos == 1)	    continue;	  for (k = 0; k < length; k++) {	    if (k == i) {	      for (t = 0; t < numPos; t++) {		inputPatternMasked[recReturnValue1++] = allone;                if (recReturnValue1 >= maxLength)                  return(-1);	      }	    }	    else {	      inputPatternMasked[recReturnValue1] = tempPatternMask[k];	      inputPattern[recReturnValue1++] = tempPattern[k];              if (recReturnValue1 >= maxLength)                  return(-1);	    }	    if (recReturnValue1 >= maxLength) 	      return (-1);	  }	  recReturnValue1 = 	    expanding(&inputPatternMasked[recReturnValue2], 		      &inputPattern[recReturnValue2], 		      length + numPos - 1, 		      maxLength - recReturnValue2);	  if (recReturnValue1 == -1) 	    return -1;	  recReturnValue2 += recReturnValue1; 	  recReturnValue1 = recReturnValue2;	}	return recReturnValue1;      }    }    return length;}/*Pack the next length bytes of inputPattern into a bit vector  where the bit is 1 if and only if the byte is non-0   Returns packed bit vector*/static Int4 packing(Uint1 *inputPattern, Int4 length){    Int4 i; /*loop index*/    Int4 returnValue = 0; /*value to return*/    for (i = 0; i < length; i++) {      if (inputPattern[i])	returnValue += (1 << i);    }    return returnValue;}/*Pack the bit representation of the inputPattern into   the array patternSearch->match_maskL   numPlaces is the number of positions in   inputPattern   Also packs patternSearch->bitPatternByLetter  */static void longpacking(Int4 numPlaces, Uint1 *inputPattern, patternSearchItems *patternSearch){    Int4 charIndex; /*index over characters in alphabet*/    Int4 bitPattern; /*bit pattern for one word to pack*/    Int4 i;  /*loop index over places*/    Int4 wordIndex; /*loop counter over words to pack into*/        patternSearch->numWords = (numPlaces-1) / BITS_PACKED_PER_WORD +1;    for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {      bitPattern = 0;      for (i = 0; i < BITS_PACKED_PER_WORD; i++) {	if (inputPattern[wordIndex*BITS_PACKED_PER_WORD+i]) 	  bitPattern += (1 << i);      }      patternSearch->match_maskL[wordIndex] = bitPattern;    }    for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {      for (wordIndex = 0; wordIndex < patternSearch->numWords; wordIndex++) {	bitPattern = 0;	for (i = 0; i < BITS_PACKED_PER_WORD; i++) {	  if ((1<< charIndex) & patternSearch->inputPatternMasked[wordIndex*BITS_PACKED_PER_WORD + i]) 	    bitPattern = bitPattern | (1 << i);	}	patternSearch->bitPatternByLetter[charIndex][wordIndex] = 	  bitPattern;	}    }}/*Return the number of 1 bits in the base 2 representation of a*/static Int4 num_of_one(Int4 a){  Int4 returnValue;  returnValue = 0;  while (a > 0) {    if (a % 2 == 1)       returnValue++;    a = (a >> 1);  }  return returnValue;}/*Sets up field in patternSearch when pattern is very long*/static void longpacking2(Int4 *inputPatternMasked, Int4 numPlacesInPattern, patternSearchItems *patternSearch){    Int4 placeIndex; /*index over places in pattern rep.*/    Int4 wordIndex; /*index over words*/    Int4 placeInWord, placeInWord2;  /*index for places in a single word*/    Int4 charIndex; /*index over characters in alphabet*/    Int4 oneWordMask; /*mask of matching characters for one word in                        pattern representation*/    double patternWordProbability;    double  most_specific; /*lowest probability of a word in the pattern*/    Int4 *oneWordSLL; /*holds patternSearch->SLL for one word*/    most_specific = 1.0;     patternSearch->whichMostSpecific = 0;     patternWordProbability = 1.0;    for (placeIndex = 0, wordIndex = 0, placeInWord=0; 	 placeIndex <= numPlacesInPattern; 	 placeIndex++, placeInWord++) {      if (placeIndex==numPlacesInPattern || inputPatternMasked[placeIndex] < 0 	  || placeInWord == BITS_PACKED_PER_WORD ) {	patternSearch->match_maskL[wordIndex] = 1 << (placeInWord-1);	oneWordSLL = patternSearch->SLL[wordIndex];	for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {	  oneWordMask = 0;	  for (placeInWord2 = 0; placeInWord2 < placeInWord; placeInWord2++) {	    if ((1<< charIndex) & 		inputPatternMasked[placeIndex-placeInWord+placeInWord2]) 	      oneWordMask |= (1 << placeInWord2);	  }	  oneWordSLL[charIndex] = oneWordMask;	}	patternSearch->numPlacesInWord[wordIndex] = placeInWord;	if (patternWordProbability < most_specific) {	  most_specific = patternWordProbability;	  patternSearch->whichMostSpecific = wordIndex;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -