📄 pattern.c
字号:
/* * =========================================================================== * PRODUCTION $Log: pattern.c,v $ * PRODUCTION Revision 1000.2 2004/06/01 18:08:25 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.9 * PRODUCTION * =========================================================================== *//* $Id: pattern.c,v 1000.2 2004/06/01 18:08:25 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Ilya Dondoshansky * *//** @file pattern.c * Functions for finding pattern matches in sequence. * @todo FIXME needs doxygen comments and lines shorter than 80 characters */static char const rcsid[] = "$Id: pattern.c,v 1000.2 2004/06/01 18:08:25 gouriano Exp $";#include <algo/blast/core/blast_def.h>#include <algo/blast/core/pattern.h>/*Looks for 1 bits in the same position of s and mask Let rightOne be the rightmost position where s and mask both have a 1. Let rightMaskOnly < rightOne be the rightmost position where mask has a 1, if any or -1 otherwise returns (rightOne - rightMaskOnly) */ static Int4 lenof(Int4 s, Int4 mask){ Int4 checkingMatches = s & mask; /*look for 1 bits in same position*/ Int4 rightOne; /*loop index looking for 1 in checkingMatches*/ Int4 rightMaskOnly; /*rightnost bit that is 1 in the mask only*/ rightMaskOnly = -1; /*AAS Changed upper bound on loop here*/ for (rightOne = 0; rightOne < BITS_PACKED_PER_WORD; rightOne++) { if ((checkingMatches >> rightOne) % 2 == 1) return rightOne - rightMaskOnly; if ((mask >> rightOne) %2 == 1) rightMaskOnly = rightOne; } /*ErrPostEx(SEV_FATAL, 1, 0, "wrong\n");*/ return(-1);}/* routine to find hits of pattern to sequence when sequence is proteins hitArray is an array of matches to pass back seq is the input sequence len1 is the length of the input sequence patternSearch carries variables that keep track of search parameters returns the number of matches*/static Int4 find_hitsS(Int4 *hitArray, const Uint1* seq, Int4 len1, patternSearchItems *patternSearch){ Int4 i; /*loop index on sequence*/ Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns with seq; e.g., if value is 9 = 0101 then last 3 chars of seq match first 3 positions in pattern and last 1 char of seq matches 1 position of pattern*/ Int4 numMatches = 0; /*number of matches found*/ Int4 mask; /*mask of input pattern positions after which a match can be declared*/ Int4 maskShiftPlus1; /*mask shifted left 1 plus 1 */ mask = patternSearch->match_mask; maskShiftPlus1 = (mask << 1) + 1; for (i = 0; i < len1; i++) { /*shift the positions matched by 1 and try to match up against the next character, also allow next character to match the first position*/ prefixMatchedBitPattern = ((prefixMatchedBitPattern << 1) | maskShiftPlus1) & patternSearch->whichPositionPtr[seq[i]]; if (prefixMatchedBitPattern & mask) { /*first part of pair is index of place in seq where match ends; second part is where match starts*/ hitArray[numMatches++] = i; hitArray[numMatches++] = i - lenof(prefixMatchedBitPattern, mask)+1; if (numMatches == MAX_HIT) { /*ErrPostEx(SEV_WARNING, 0, 0, "%ld matches saved, discarding others", numMatches);*/ break; } } } return numMatches;}/*find hits when sequence is DNA and pattern is short returns twice the number of hits pos indicates the starting position len is length of sequence seq hitArray stores the results*/static Int4 find_hitsS_DNA(Int4* hitArray, const Uint1* seq, Int4 pos, Int4 len, patternSearchItems *patternSearch){ /*Some variables and the algorithm are similar to what is used in find_hits() above; see more detailed comments there*/ Uint4 prefixMatchedBitPattern; /*indicates where pattern aligns with sequence*/ Uint4 mask2; /*mask to match agaist*/ Int4 maskShiftPlus1; /*mask2 shifted plus 1*/ Uint4 tmp; /*intermediate result of masked comparisons*/ Int4 i; /*index on seq*/ Int4 end; /*count of number of 4-mer iterations needed*/ Int4 remain; /*0,1,2,3 DNA letters left over*/ Int4 j; /*index on suffixRemnant*/ Int4 twiceNumHits = 0; /*twice the number of hits*/ mask2 = patternSearch->match_mask*BITS_PACKED_PER_WORD+15; maskShiftPlus1 = (patternSearch->match_mask << 1)+1; if (pos != 0) { pos = 4 - pos; prefixMatchedBitPattern = ((patternSearch->match_mask * ((1 << (pos+1))-1)*2) + (1 << (pos+1))-1)& patternSearch->DNAwhichSuffixPosPtr[seq[0]]; seq++; end = (len-pos)/4; remain = (len-pos) % 4; } else { prefixMatchedBitPattern = maskShiftPlus1; end = len/4; remain = len % 4; } for (i = 0; i < end; i++) { if ( (tmp = (prefixMatchedBitPattern & patternSearch->DNAwhichPrefixPosPtr[seq[i]]))) { for (j = 0; j < 4; j++) { if (tmp & patternSearch->match_mask) { hitArray[twiceNumHits++] = i*4+j + pos; hitArray[twiceNumHits++] = i*4+j + pos -lenof(tmp & patternSearch->match_mask, patternSearch->match_mask)+1; } tmp = (tmp << 1); } } prefixMatchedBitPattern = (((prefixMatchedBitPattern << 4) | mask2) & patternSearch->DNAwhichSuffixPosPtr[seq[i]]); } /* In the last byte check bits only up to 'remain' */ if ( (tmp = (prefixMatchedBitPattern & patternSearch->DNAwhichPrefixPosPtr[seq[i]]))) { for (j = 0; j < remain; j++) { if (tmp & patternSearch->match_mask) { hitArray[twiceNumHits++] = i*4+j + pos; hitArray[twiceNumHits++] = i*4+j + pos - lenof(tmp & patternSearch->match_mask, patternSearch->match_mask)+1; } tmp = (tmp << 1); } } return twiceNumHits;}/*Top level routine when pattern has a short description hitArray is to return the hits seq is the input sequence start is what position to start at in seq len is the length of seq is_dna is 1 if and only if seq is a DNA sequence the return value from the appropriate lower level routine is passed back*/static Int4 find_hitsS_head(Int4* hitArray, const Uint1* seq, Int4 start, Int4 len, Uint1 is_dna, patternSearchItems *patternSearch){ if (is_dna) return find_hitsS_DNA(hitArray, &seq[start/4], start % 4, len, patternSearch); return find_hitsS(hitArray, &seq[start], len, patternSearch);}/*Shift each word in the array left by 1 bit and add bit b, if the new values is bigger that OVERFLOW1, then subtract OVERFLOW1 */static void lmove(Int4 *a, Uint1 b, patternSearchItems *patternSearch){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -