📄 phi_lookup.c
字号:
} if (placeIndex == numPlacesInPattern) patternSearch->spacing[wordIndex++] = 0; else if (inputPatternMasked[placeIndex] < 0) { patternSearch->spacing[wordIndex++] = -inputPatternMasked[placeIndex]; } else { placeIndex--; patternSearch->spacing[wordIndex++] = 0; } placeInWord = -1; patternWordProbability = 1.0; } else { patternWordProbability *= (double) num_of_one(inputPatternMasked[placeIndex])/ (double) ALPHABET_SIZE; } } patternSearch->numWords = wordIndex;}/*pattern is a string describing the pattern to search for; is_dna is a boolean describing the strings are DNA or protein*/static Int4 init_pattern(Uint1 *pattern, Boolean is_dna, BlastScoreBlk* sbp, patternSearchItems* *pattern_info, Blast_Message* *error_msg){ Uint4 i; /*index over string describing the pattern*/ Uint4 j; /*index for position in pattern*/ Int4 charIndex; /*index over characters in alphabet*/ Int4 secondIndex; /*second index into pattern*/ Int4 numIdentical; /*number of consec. positions with identical specification*/ Uint4 charSetMask; /*index over masks for specific characters*/ Int4 currentSetMask, prevSetMask ; /*mask for current and previous character positions*/ Int4 thisMask; /*integer representing a bit pattern for a set of characters*/ Int4 minWildcard, maxWildcard; /*used for variable number of wildcard positions*/ Uint4 tj=0; /*temporary copy of j*/ Int4 tempInputPatternMasked[MaxP]; /*local copy of parts of inputPatternMasked*/ Uint1 c; /*character occurring in pattern*/ Uint1 localPattern[MaxP]; /*local variable to hold for each position whether it is last in pattern (1) or not (0) */ double positionProbability; /*probability of a set of characters allowed in one position*/ Int4 currentWildcardProduct; /*product of wildcard lengths for consecutive character positions that overlap*/ seedSearchItems *seedSearch; patternSearchItems* patternSearch; seedSearch = (seedSearchItems*) calloc(1, sizeof(seedSearchItems)); patternSearch = *pattern_info = (patternSearchItems*) calloc(1, sizeof(patternSearchItems)); initProbs(seedSearch); init_order(sbp->matrix, PAT_SEED_FLAG, FALSE, seedSearch); patternSearch->flagPatternLength = ONE_WORD_PATTERN; patternSearch->patternProbability = 1.0; patternSearch->minPatternMatchLength = 0; patternSearch->wildcardProduct = 1; currentWildcardProduct = 1; prevSetMask = 0; currentSetMask = 0; for (i = 0 ; i < MaxP; i++) { patternSearch->inputPatternMasked[i] = 0; localPattern[i] = 0; } for (i = 0, j = 0; i < strlen((Char *) pattern); i++) { if ((c=pattern[i]) == '-' || c == '\n' || c == '.' || c =='>' || c ==' ' || c == '<') /*spacers that mean nothing*/ continue; if ( c != '[' && c != '{') { /*not the start of a set of characters*/ if (c == 'x' || c== 'X') { /*wild-card character matches anything*/ /*next line checks to see if wild card is for multiple positions*/ if (pattern[i+1] == '(') { i++; secondIndex = i; /*find end of description of how many positions are wildcarded will look like x(2) or x(2,5) */ while (pattern[secondIndex] != ',' && pattern[secondIndex] != ')') secondIndex++; if (pattern[secondIndex] == ')') { /*fixed number of positions wildcarded*/ i -= 1; /*wildcard, so all characters are allowed*/ charSetMask=allone; positionProbability = 1; } else { /*variable number of positions wildcarded*/ sscanf((Char*) &pattern[++i], "%d,%d", &minWildcard, &maxWildcard); maxWildcard = maxWildcard - minWildcard; currentWildcardProduct *= (maxWildcard + 1); if (currentWildcardProduct > patternSearch->wildcardProduct) patternSearch->wildcardProduct = currentWildcardProduct; patternSearch->minPatternMatchLength += minWildcard; while (minWildcard-- > 0) { /*use one position each for the minimum number of wildcard spaces required */ patternSearch->inputPatternMasked[j++] = allone; if (j >= MaxP) { Blast_MessageWrite(error_msg, BLAST_SEV_WARNING, 2, 1, "pattern too long"); return(-1); } } if (maxWildcard != 0) { /*negative masking used to indicate variability in number of wildcard spaces; e.g., if pattern looks like x(3,5) then variability is 2 and there will be three wildcard positions with mask allone followed by a single position with mask -2*/ patternSearch->inputPatternMasked[j++] = -maxWildcard; patternSearch->patternProbability *= maxWildcard; } /*now skip over wildcard description with the i index*/ while (pattern[++i] != ')') ; continue; } } else { /*wild card is for one position only*/ charSetMask=allone; positionProbability =1; } } else { if (c == 'U') { /*look for special U character*/ charSetMask = allone*2+1; positionProbability = 1; } else { /*exactly one character matches*/ prevSetMask = currentSetMask; currentSetMask = charSetMask = (1 << seedSearch->order[c]); if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/ currentWildcardProduct = 1; positionProbability = seedSearch->standardProb[(Uint1)seedSearch->order[c]]; } } } else { if (c == '[') { /*start of a set of characters allowed*/ charSetMask = 0; positionProbability = 0; /*For each character in the set add it to the mask and add its probability to positionProbability*/ while ((c=pattern[++i]) != ']') { /*end of set*/ if ((c < 'A') || (c > 'Z') || (c == '\0')) { Blast_MessageWrite(error_msg, BLAST_SEV_WARNING, 2, 1, "pattern description has a non-alphabetic" "character inside a bracket"); return(-1); } charSetMask = charSetMask | (1 << seedSearch->order[c]); positionProbability += seedSearch->standardProb[(Uint1)seedSearch->order[c]]; } prevSetMask = currentSetMask; currentSetMask = charSetMask; if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/ currentWildcardProduct = 1; } else { /*start of a set of characters forbidden*/ /*For each character forbidden remove it to the mask and subtract its probability from positionProbability*/ charSetMask = allone; positionProbability = 1; while ((c=pattern[++i]) != '}') { /*end of set*/ charSetMask = charSetMask - (charSetMask & (1 << seedSearch->order[c])); positionProbability -= seedSearch->standardProb[(Uint1)seedSearch->order[c]]; } prevSetMask = currentSetMask; currentSetMask = charSetMask; if (!(prevSetMask & currentSetMask)) /*character sets don't overlap*/ currentWildcardProduct = 1; } } /*handle a number of positions that are the same */ if (pattern[i+1] == '(') { /*read opening paren*/ i++; numIdentical = atoi((Char *) &pattern[++i]); /*get number of positions*/ patternSearch->minPatternMatchLength += numIdentical; while (pattern[++i] != ')') ; /*skip over piece in pattern*/ while ((numIdentical--) > 0) { /*set up mask for these positions*/ patternSearch->inputPatternMasked[j++] = charSetMask; patternSearch->patternProbability *= positionProbability; } } else { /*specification is for one posiion only*/ patternSearch->inputPatternMasked[j++] = charSetMask; patternSearch->minPatternMatchLength++; patternSearch->patternProbability *= positionProbability; } if (j >= MaxP) { Blast_MessageWrite(error_msg, BLAST_SEV_WARNING, 2, 1, "pattern too long"); } } localPattern[j-1] = 1; if (patternSearch->patternProbability > 1.0) patternSearch->patternProbability = 1.0; for (i = 0; i < j; i++) { tempInputPatternMasked[i] = patternSearch->inputPatternMasked[i]; tj = j; } j = expanding(patternSearch->inputPatternMasked, localPattern, j, MaxP); if ((j== -1) || ((j > BITS_PACKED_PER_WORD) && is_dna)) { patternSearch->flagPatternLength = PATTERN_TOO_LONG; longpacking2(tempInputPatternMasked, tj, patternSearch); for (i = 0; i < tj; i++) patternSearch->inputPatternMasked[i] = tempInputPatternMasked[i]; patternSearch->highestPlace = tj; if (is_dna) init_pattern_DNA(patternSearch); return 1; } if (j > BITS_PACKED_PER_WORD) { patternSearch->flagPatternLength = MULTI_WORD_PATTERN; longpacking(j, localPattern, patternSearch); return j; } /*make a bit mask out of local pattern of length j*/ patternSearch->match_mask = packing(localPattern, j); /*store for each character a bit mask of which positions that character can occur in*/ for (charIndex = 0; charIndex < ALPHABET_SIZE; charIndex++) { thisMask = 0; for (charSetMask = 0; charSetMask < j; charSetMask++) { if ((1<< charIndex) & patternSearch->inputPatternMasked[charSetMask]) thisMask |= (1 << charSetMask); } patternSearch->whichPositionsByCharacter[charIndex] = thisMask; } patternSearch->whichPositionPtr = patternSearch->whichPositionsByCharacter; if (is_dna) init_pattern_DNA(patternSearch); return j; /*return number of places for pattern representation*/}Int2 PHILookupTableNew(const LookupTableOptions* opt, PHILookupTable* * lut, Boolean is_dna, BlastScoreBlk* sbp){ PHILookupTable* lookup = *lut = (PHILookupTable*) malloc(sizeof(PHILookupTable)); Blast_Message* error_msg = NULL; if (!lookup) return -1; lookup->is_dna = is_dna; init_pattern((Uint1*)opt->phi_pattern, is_dna, sbp, &lookup->pattern_info, &error_msg); lookup->num_matches = 0; lookup->allocated_size = MIN_PHI_LOOKUP_SIZE; if ((lookup->start_offsets = (Int4*) malloc(MIN_PHI_LOOKUP_SIZE*sizeof(Int4))) == NULL) return -1; if ((lookup->lengths = (Int4*) malloc(MIN_PHI_LOOKUP_SIZE*sizeof(Int4))) == NULL) return -1; return 0;}PHILookupTable* PHILookupTableDestruct(PHILookupTable* lut){ sfree(lut->pattern_info); sfree(lut->start_offsets); sfree(lut->lengths); sfree(lut); return NULL;}static Int2 PHIBlastAddPatternHit(PHILookupTable* lookup, Int4 offset, Int4 length){ if (lookup->num_matches >= lookup->allocated_size) { lookup->start_offsets = (Int4*) realloc(lookup->start_offsets, 2*lookup->allocated_size); lookup->lengths = (Int4*) realloc(lookup->lengths, 2*lookup->allocated_size); if (!lookup->start_offsets || !lookup->lengths) return -1; lookup->allocated_size *= 2; } lookup->start_offsets[lookup->num_matches] = offset; lookup->lengths[lookup->num_matches] = length; ++lookup->num_matches; return 0;}Int4 PHIBlastIndexQuery(PHILookupTable* lookup, BLAST_SequenceBlk* query, ListNode* location, Boolean is_dna){ ListNode* loc; Int4 from, to; Int4 loc_length; Uint1* sequence; patternSearchItems* pattern_info = lookup->pattern_info; Int4* hitArray; Int4 i, twiceNumHits; hitArray = (Int4 *) calloc(2*query->length, sizeof(Int4)); for(loc=location; loc; loc=loc->next) { from = ((SSeqRange*) loc->ptr)->left; to = ((SSeqRange*) loc->ptr)->right; loc_length = to - from + 1; sequence = query->sequence + from; twiceNumHits = FindPatternHits(hitArray, sequence, loc_length, is_dna, pattern_info); for (i = 0; i < twiceNumHits; i += 2) { PHIBlastAddPatternHit(lookup, hitArray[i+1]+from, hitArray[i]-hitArray[i+1]+1); } } return lookup->num_matches;}static Boolean PHIBlastMatchPatterns(Uint1* subject, Uint1* query, Int4 length){ Int4 index; for (index = 0; index < length; ++index) { if (subject[index] != query[index]) break; } return (index == length);}Int4 PHIBlastScanSubject(const LookupTableWrap* lookup_wrap, const BLAST_SequenceBlk *query_blk, const BLAST_SequenceBlk *subject_blk, Int4* offset_ptr, Uint4 * query_offsets, Uint4 * subject_offsets, Int4 array_size){ Uint1* subject, *query; PHILookupTable* lookup = (PHILookupTable*) lookup_wrap->lut; Int4 index, count = 0, twiceNumHits, i; Int4 *start_offsets = lookup->start_offsets; Int4 *pat_lengths = lookup->lengths; Int4 offset, length; Int4 hitArray[MAX_HIT]; query = query_blk->sequence; subject = subject_blk->sequence; /* It must be guaranteed that all pattern matches for a given subject sequence are processed in one call to this function. */ *offset_ptr = subject_blk->length; twiceNumHits = FindPatternHits(hitArray, subject, subject_blk->length, lookup->is_dna, lookup->pattern_info); for (i = 0; i < twiceNumHits; i += 2) {#if 0 if (count > array_size - lookup->num_matches) break;#endif length = hitArray[i] - hitArray[i+1] + 1; offset = hitArray[i+1]; for (index = 0; index < lookup->num_matches; ++index) { /* Match pattern lengths in subject in query first; then check for identical match of pattern */ if (length == pat_lengths[index] && PHIBlastMatchPatterns(subject+offset, query+start_offsets[index], length)) { /* Pattern has matched completely. Save index into the array of pattern start offsets in query (so pattern length will be accessible in the word finder later), and the subject offset. */ query_offsets[count] = index; subject_offsets[count] = offset; ++count; } } } return count;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -