📄 phi_lookup.c

📁 ncbi源码
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
	}	if (placeIndex == numPlacesInPattern) 	  patternSearch->spacing[wordIndex++] = 0; 	else 	  if (inputPatternMasked[placeIndex] < 0) { 	    patternSearch->spacing[wordIndex++] = -inputPatternMasked[placeIndex];	  }	  else { 	    placeIndex--; 	    patternSearch->spacing[wordIndex++] = 0;	  }	placeInWord = -1; 	patternWordProbability = 1.0;      }      else {	patternWordProbability *= (double) 	  num_of_one(inputPatternMasked[placeIndex])/ (double) ALPHABET_SIZE;	}    }    patternSearch->numWords = wordIndex;}/*pattern is a string describing the pattern to search for;  is_dna is a boolean describing the strings are DNA or protein*/static Int4 init_pattern(Uint1 *pattern, Boolean is_dna, BlastScoreBlk* sbp,              patternSearchItems* *pattern_info,             Blast_Message* *error_msg){    Uint4 i; /*index over string describing the pattern*/    Uint4 j; /*index for position in pattern*/    Int4 charIndex; /*index over characters in alphabet*/    Int4 secondIndex; /*second index into pattern*/    Int4 numIdentical; /*number of consec. positions with identical specification*/    Uint4 charSetMask;  /*index over masks for specific characters*/    Int4 currentSetMask, prevSetMask ; /*mask for current and previous character positions*/        Int4 thisMask;    /*integer representing a bit pattern for a                         set of characters*/    Int4 minWildcard, maxWildcard; /*used for variable number of wildcard                                     positions*/    Uint4  tj=0; /*temporary copy of j*/    Int4 tempInputPatternMasked[MaxP]; /*local copy of parts            of inputPatternMasked*/    Uint1 c;  /*character occurring in pattern*/    Uint1 localPattern[MaxP]; /*local variable to hold                               for each position whether it is                               last in pattern (1) or not (0) */    double positionProbability; /*probability of a set of characters                                    allowed in one position*/    Int4 currentWildcardProduct; /*product of wildcard lengths for                                   consecutive character positions that                                   overlap*/    seedSearchItems *seedSearch;    patternSearchItems* patternSearch;    seedSearch = (seedSearchItems*) calloc(1, sizeof(seedSearchItems));    patternSearch = *pattern_info =        (patternSearchItems*) calloc(1, sizeof(patternSearchItems));    initProbs(seedSearch);    init_order(sbp->matrix, PAT_SEED_FLAG, FALSE, seedSearch);    patternSearch->flagPatternLength = ONE_WORD_PATTERN;     patternSearch->patternProbability = 1.0;    patternSearch->minPatternMatchLength = 0;    patternSearch->wildcardProduct = 1;    currentWildcardProduct = 1;    prevSetMask = 0;    currentSetMask = 0;    for (i = 0 ; i < MaxP; i++) {      patternSearch->inputPatternMasked[i] = 0;       localPattern[i] = 0;    }    for (i = 0, j = 0; i < strlen((Char *) pattern); i++) {      if ((c=pattern[i]) == '-' || c == '\n' || c == '.' || c =='>' || c ==' ' || c == '<')  /*spacers that mean nothing*/	continue;      if ( c != '[' && c != '{') { /*not the start of a set of characters*/	if (c == 'x' || c== 'X') {  /*wild-card character matches anything*/          /*next line checks to see if wild card is for multiple positions*/	  if (pattern[i+1] == '(') {	    i++;	    secondIndex = i;            /*find end of description of how many positions are wildcarded               will look like x(2) or x(2,5) */	    while (pattern[secondIndex] != ',' && pattern[secondIndex] != ')')	      secondIndex++;	    if (pattern[secondIndex] == ')') {  /*fixed number of positions wildcarded*/	      i -= 1;               /*wildcard, so all characters are allowed*/	      charSetMask=allone; 	      positionProbability = 1;	    }	    else { /*variable number of positions wildcarded*/	  	      sscanf((Char*) &pattern[++i], "%d,%d", &minWildcard, &maxWildcard);	      maxWildcard = maxWildcard - minWildcard;         currentWildcardProduct *= (maxWildcard + 1);         if (currentWildcardProduct > patternSearch->wildcardProduct)            patternSearch->wildcardProduct = currentWildcardProduct;         patternSearch->minPatternMatchLength += minWildcard;	      while (minWildcard-- > 0) {             /*use one position each for the minimum number of              wildcard spaces required */            patternSearch->inputPatternMasked[j++] = allone;             if (j >= MaxP) {               Blast_MessageWrite(error_msg, BLAST_SEV_WARNING, 2, 1,                                   "pattern too long");               return(-1);            }	      }	      if (maxWildcard != 0) {            /*negative masking used to indicate variability              in number of wildcard spaces; e.g., if pattern looks              like x(3,5) then variability is 2 and there will              be three wildcard positions with mask allone followed              by a single position with mask -2*/            patternSearch->inputPatternMasked[j++] = -maxWildcard;            patternSearch->patternProbability *= maxWildcard;	      }         /*now skip over wildcard description with the i index*/	      while (pattern[++i] != ')') ; 	      continue;	    }	  }	  else {  /*wild card is for one position only*/	    charSetMask=allone; 	    positionProbability =1;	  }	} 	else {	  if (c == 'U') {   /*look for special U character*/	    charSetMask = allone*2+1;	    positionProbability = 1; 	  }	  else {         /*exactly one character matches*/        prevSetMask = currentSetMask;        currentSetMask =  charSetMask = (1 << seedSearch->order[c]);        if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/           currentWildcardProduct = 1;        positionProbability =            seedSearch->standardProb[(Uint1)seedSearch->order[c]];	  }	}      }      else {	if (c == '[') {  /*start of a set of characters allowed*/	  charSetMask = 0;	  positionProbability = 0;	  /*For each character in the set add it to the mask and            add its probability to positionProbability*/	  while ((c=pattern[++i]) != ']') { /*end of set*/        if ((c < 'A') || (c > 'Z') || (c == '\0')) {           Blast_MessageWrite(error_msg, BLAST_SEV_WARNING, 2, 1,                               "pattern description has a non-alphabetic"                              "character inside a bracket");                      return(-1);        }        charSetMask = charSetMask | (1 << seedSearch->order[c]);        positionProbability += seedSearch->standardProb[(Uint1)seedSearch->order[c]];	  }     prevSetMask = currentSetMask;     currentSetMask = charSetMask;	  if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/	      currentWildcardProduct = 1; 	} 	else {   /*start of a set of characters forbidden*/	  /*For each character forbidden remove it to the mask and            subtract its probability from positionProbability*/	  charSetMask = allone; 	  positionProbability = 1;	  while ((c=pattern[++i]) != '}') { /*end of set*/	    charSetMask = charSetMask -  (charSetMask & (1 << seedSearch->order[c]));	    positionProbability -= seedSearch->standardProb[(Uint1)seedSearch->order[c]];	  }          prevSetMask = currentSetMask;          currentSetMask = charSetMask;	  if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/	      currentWildcardProduct = 1;	}      }      /*handle a number of positions that are the same */      if (pattern[i+1] == '(') {  /*read opening paren*/	i++;	numIdentical = atoi((Char *) &pattern[++i]);  /*get number of positions*/        patternSearch->minPatternMatchLength += numIdentical;	while (pattern[++i] != ')') ;  /*skip over piece in pattern*/	while ((numIdentical--) > 0) {	  /*set up mask for these positions*/	  patternSearch->inputPatternMasked[j++] = charSetMask;	  patternSearch->patternProbability *= positionProbability; 	}      }       else {   /*specification is for one posiion only*/         patternSearch->inputPatternMasked[j++] = charSetMask;         patternSearch->minPatternMatchLength++;         patternSearch->patternProbability *= positionProbability;      }      if (j >= MaxP) {         Blast_MessageWrite(error_msg, BLAST_SEV_WARNING, 2, 1,                             "pattern too long");      }    }    localPattern[j-1] = 1;    if (patternSearch->patternProbability > 1.0)      patternSearch->patternProbability = 1.0;    for (i = 0; i < j; i++) {      tempInputPatternMasked[i] = patternSearch->inputPatternMasked[i];       tj = j;    }    j = expanding(patternSearch->inputPatternMasked, localPattern, j, MaxP);    if ((j== -1) || ((j > BITS_PACKED_PER_WORD) && is_dna)) {      patternSearch->flagPatternLength = PATTERN_TOO_LONG;      longpacking2(tempInputPatternMasked, tj, patternSearch);      for (i = 0; i < tj; i++)          patternSearch->inputPatternMasked[i] = tempInputPatternMasked[i];      patternSearch->highestPlace = tj;      if (is_dna)          init_pattern_DNA(patternSearch);      return 1;    }    if (j > BITS_PACKED_PER_WORD) {      patternSearch->flagPatternLength = MULTI_WORD_PATTERN;      longpacking(j, localPattern, patternSearch);      return j;    }     /*make a bit mask out of local pattern of length j*/    patternSearch->match_mask = packing(localPattern, j);    /*store for each character a bit mask of which positions      that character can occur in*/    for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) {      thisMask = 0;      for (charSetMask = 0; charSetMask < j; charSetMask++) {	if ((1<< charIndex) & patternSearch->inputPatternMasked[charSetMask]) 	  thisMask |= (1 << charSetMask);      }      patternSearch->whichPositionsByCharacter[charIndex] = thisMask;    }    patternSearch->whichPositionPtr = patternSearch->whichPositionsByCharacter;    if (is_dna)       init_pattern_DNA(patternSearch);    return j; /*return number of places for pattern representation*/}Int2 PHILookupTableNew(const LookupTableOptions* opt, PHILookupTable* * lut,                       Boolean is_dna, BlastScoreBlk* sbp){   PHILookupTable* lookup = *lut =       (PHILookupTable*) malloc(sizeof(PHILookupTable));   Blast_Message* error_msg = NULL;   if (!lookup)      return -1;   lookup->is_dna = is_dna;   init_pattern((Uint1*)opt->phi_pattern, is_dna, sbp, &lookup->pattern_info,                &error_msg);    lookup->num_matches = 0;   lookup->allocated_size = MIN_PHI_LOOKUP_SIZE;   if ((lookup->start_offsets =        (Int4*) malloc(MIN_PHI_LOOKUP_SIZE*sizeof(Int4))) == NULL)      return -1;   if ((lookup->lengths = (Int4*) malloc(MIN_PHI_LOOKUP_SIZE*sizeof(Int4)))        == NULL)      return -1;   return 0;}PHILookupTable* PHILookupTableDestruct(PHILookupTable* lut){   sfree(lut->pattern_info);   sfree(lut->start_offsets);   sfree(lut->lengths);   sfree(lut);   return NULL;}static Int2 PHIBlastAddPatternHit(PHILookupTable* lookup, Int4 offset,                                   Int4 length){   if (lookup->num_matches >= lookup->allocated_size) {      lookup->start_offsets = (Int4*) realloc(lookup->start_offsets,                                               2*lookup->allocated_size);      lookup->lengths = (Int4*) realloc(lookup->lengths,                                               2*lookup->allocated_size);      if (!lookup->start_offsets || !lookup->lengths)         return -1;      lookup->allocated_size *= 2;   }               lookup->start_offsets[lookup->num_matches] = offset;   lookup->lengths[lookup->num_matches] = length;   ++lookup->num_matches;   return 0;}Int4 PHIBlastIndexQuery(PHILookupTable* lookup,         BLAST_SequenceBlk* query, ListNode* location, Boolean is_dna){   ListNode* loc;   Int4 from, to;   Int4 loc_length;   Uint1* sequence;   patternSearchItems* pattern_info = lookup->pattern_info;   Int4* hitArray;   Int4 i, twiceNumHits;      hitArray = (Int4 *) calloc(2*query->length, sizeof(Int4));   for(loc=location; loc; loc=loc->next) {      from = ((SSeqRange*) loc->ptr)->left;      to = ((SSeqRange*) loc->ptr)->right;      loc_length = to - from + 1;      sequence = query->sequence + from;            twiceNumHits = FindPatternHits(hitArray, sequence, loc_length, is_dna,                                     pattern_info);            for (i = 0; i < twiceNumHits; i += 2) {         PHIBlastAddPatternHit(lookup, hitArray[i+1]+from,                                hitArray[i]-hitArray[i+1]+1);      }   }   return lookup->num_matches;}static Boolean PHIBlastMatchPatterns(Uint1* subject, Uint1* query, Int4 length){   Int4 index;   for (index = 0; index < length; ++index) {      if (subject[index] != query[index])         break;   }   return (index == length);}Int4 PHIBlastScanSubject(const LookupTableWrap* lookup_wrap,        const BLAST_SequenceBlk *query_blk,         const BLAST_SequenceBlk *subject_blk,         Int4* offset_ptr, Uint4 * query_offsets, Uint4 * subject_offsets,         Int4 array_size){   Uint1* subject, *query;   PHILookupTable* lookup = (PHILookupTable*) lookup_wrap->lut;   Int4 index, count = 0, twiceNumHits, i;   Int4 *start_offsets = lookup->start_offsets;   Int4 *pat_lengths = lookup->lengths;   Int4 offset, length;   Int4 hitArray[MAX_HIT];   query = query_blk->sequence;   subject = subject_blk->sequence;   /* It must be guaranteed that all pattern matches for a given       subject sequence are processed in one call to this function.   */   *offset_ptr = subject_blk->length;   twiceNumHits = FindPatternHits(hitArray, subject, subject_blk->length,                                   lookup->is_dna, lookup->pattern_info);   for (i = 0; i < twiceNumHits; i += 2) {#if 0      if (count > array_size - lookup->num_matches)            break;#endif      length = hitArray[i] - hitArray[i+1] + 1;      offset = hitArray[i+1];      for (index = 0; index < lookup->num_matches; ++index) {         /* Match pattern lengths in subject in query first; then             check for identical match of pattern */         if (length == pat_lengths[index] &&             PHIBlastMatchPatterns(subject+offset, query+start_offsets[index],                                    length))         {            /* Pattern has matched completely. Save index into the array               of pattern start offsets in query (so pattern length will               be accessible in the word finder later), and the subject               offset. */            query_offsets[count] = index;            subject_offsets[count] = offset;            ++count;         }      }   }   return count;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -