📄 pattern.h
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: pattern.h,v $
 * PRODUCTION Revision 1000.2  2004/06/01 18:04:23  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.4
 * PRODUCTION
 * ===========================================================================
 */

/* $Id: pattern.h,v 1000.2 2004/06/01 18:04:23 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author: Ilya Dondoshansky
 *
 */

/** @file pattern.h
 * Functions for finding pattern matches in sequence (PHI-BLAST).
 */

#ifndef PATTERN__H
#define PATTERN__H

/* Project headers are pulled in inside the include guard so that a repeated
 * inclusion of this file is skipped in its entirety. */
#include <algo/blast/core/blast_def.h>
#include <algo/blast/core/blast_options.h>

#ifdef __cplusplus
extern "C" {
#endif

/** Number of entries in patternSearchItems::match_maskL. */
#define BUF_SIZE 100

/** Size of every table in patternSearchItems that has one entry per
 *  character code. */
#define ASCII_SIZE 256

/** Number of pattern bits packed into one word; the shift count used by
 *  OVERFLOW1 and the per-word factor of MaxP. */
#define BITS_PACKED_PER_WORD 30

/** Number of words per character in patternSearchItems::bitPatternByLetter. */
#define MaxW 11

/** Threshold pattern length. */
#define MaxP (BITS_PACKED_PER_WORD * MaxW)

/** Number of words available for the many-word pattern tables
 *  (SLL, DNAprefixSLL, DNAsuffixSLL, numPlacesInWord, spacing). */
#define MAX_WORDS_IN_PATTERN 100

/** NOTE(review): not used in this header; presumably an upper bound on the
 *  number of hits recorded per search - confirm against the implementation. */
#define MAX_HIT 20000

/** The first bit above the BITS_PACKED_PER_WORD bits packed in a word. */
#define OVERFLOW1 (1 << BITS_PACKED_PER_WORD)

/** NOTE(review): presumably a value of patternSearchItems::flagPatternLength
 *  meaning the pattern fits in one word - confirm against the implementation. */
#define ONE_WORD_PATTERN 0

/** NOTE(review): presumably a value of patternSearchItems::flagPatternLength
 *  meaning the pattern needs several words - confirm against the
 *  implementation. */
#define MULTI_WORD_PATTERN 1

/** NOTE(review): not used in this header; presumably the number of letters
 *  in the sequence alphabet - confirm against the implementation. */
#define ALPHABET_SIZE 25

/** NOTE(review): not used in this header; meaning must be confirmed against
 *  the implementation. */
#define PATTERN_SPACE_SIZE 1000

/** Working data describing one input pattern and its bit-mask encodings. */
typedef struct patternSearchItems {
    Int4 numWords;      /**< Number of words needed to hold the bit
                             representation of the pattern */
    Int4 match_mask;    /**< Bit mask representation of the input pattern,
                             for patterns that fit in a word */
    Int4 match_maskL[BUF_SIZE];
                        /**< Bit mask representation of the input pattern,
                             for long patterns */
    Int4 bitPatternByLetter[ASCII_SIZE][MaxW];
                        /**< Which positions a character can occur in, for
                             long patterns */
    Int4 *whichPositionPtr;
                        /**< Used to pass a piece of a row of the arrays */
    Uint4 *DNAwhichPrefixPosPtr;
                        /**< Prefix position array for DNA patterns */
    Uint4 *DNAwhichSuffixPosPtr;
                        /**< Suffix position array for DNA patterns */
    Int4 whichPositionsByCharacter[ASCII_SIZE];
                        /**< Which positions a character can occur in, for
                             short patterns */
    Uint4 DNAwhichPrefixPositions[ASCII_SIZE];
                        /**< For DNA sequence: where the prefix of a DNA
                             4-mer matches the pattern */
    Uint4 DNAwhichSuffixPositions[ASCII_SIZE];
                        /**< Same as DNAwhichPrefixPositions, for suffixes */
    Int4 SLL[MAX_WORDS_IN_PATTERN][ASCII_SIZE];
                        /**< For each letter in the alphabet and each word in
                             the masked pattern representation, holds a bit
                             pattern saying for which positions the letter
                             will match; similar to whichPositionsByCharacter,
                             for many-word patterns */
    Uint4 DNAprefixSLL[MAX_WORDS_IN_PATTERN][ASCII_SIZE];
                        /**< Similar to DNAwhichPrefixPositions, for
                             many-word patterns */
    Uint4 DNAsuffixSLL[MAX_WORDS_IN_PATTERN][ASCII_SIZE];
                        /**< Similar to DNAwhichSuffixPositions, for
                             many-word patterns */
    Char flagPatternLength;
                        /**< Indicates if the pattern fits in 1 word, some
                             words, or is too long */
    double patternProbability;
                        /**< Probability of this letter combination */
    Int4 whichMostSpecific;
                        /**< Which word in an extra long pattern has the
                             lowest probability of a match */
    Int4 numPlacesInWord[MAX_WORDS_IN_PATTERN];
                        /**< When the pattern has more than 7 words, keeps
                             track of how many places of the pattern are in
                             each word of the representation; was called
                             lening */
    Int4 spacing[MAX_WORDS_IN_PATTERN];
                        /**< Spaces until the next word due to wildcard */
    Int4 inputPatternMasked[MaxP];
                        /**< NOTE(review): undocumented in the original;
                             presumably the masked form of the input pattern,
                             one entry per pattern place - confirm against
                             the implementation */
    Int4 highestPlace;  /**< Number of places in the pattern representation
                             as computed in input_pattern; was called num */
    Int4 minPatternMatchLength;
                        /**< Minimum length of string to match this pattern */
    Int4 wildcardProduct;
                        /**< Product of wildcard lengths */
} patternSearchItems;

/** Find the places where the pattern matches seq;
 * 3 different methods are used depending on the length of the pattern.
 * @param hitArray Stores the results as pairs of positions in consecutive
 *                 entries [out]
 * @param seq Sequence [in]
 * @param len Length of the sequence [in]
 * @param is_dna Indicates whether seq is made of DNA or protein letters [in]
 * @param patternSearch Pattern information [in]
 * @return Twice the number of hits (length of hitArray filled in)
 */
Int4 FindPatternHits(Int4 *hitArray, const Uint1 *seq, Int4 len,
                     Boolean is_dna, patternSearchItems *patternSearch);

#ifdef __cplusplus
}
#endif

#endif /* PATTERN__H */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -