📄 blast_stat.c
字号:
/* * =========================================================================== * PRODUCTION $Log: blast_stat.c,v $ * PRODUCTION Revision 1000.6 2004/06/01 18:07:50 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.77 * PRODUCTION * =========================================================================== *//* $Id: blast_stat.c,v 1000.6 2004/06/01 18:07:50 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Tom Madden * *//** @file blast_stat.c * Functions to calculate BLAST probabilities etc. * Detailed Contents: * * - allocate and deallocate structures used by BLAST to calculate * probabilities etc. * * - calculate residue frequencies for query and "average" database. * * - read in matrix or load it from memory. * * - calculate sum-p from a collection of HSP's, for both the case * of a "small" gap and a "large" gap, when give a total score and the * number of HSP's. * * - calculate expect values for p-values. * * - calculate pseuod-scores from p-values. * * @todo FIXME needs doxygen comments */static char const rcsid[] = "$Id: blast_stat.c,v 1000.6 2004/06/01 18:07:50 gouriano Exp $";#include <algo/blast/core/blast_stat.h>#include <algo/blast/core/blast_util.h>#include <util/tables/raw_scoremat.h>#include <algo/blast/core/blast_encoding.h>#include "blast_psi_priv.h"/* OSF1 apparently doesn't like this. */#if defined(HUGE_VAL) && !defined(OS_UNIX_OSF1)#define BLASTKAR_HUGE_VAL HUGE_VAL#else#define BLASTKAR_HUGE_VAL 1.e30#endif/* Allocates and Deallocates the two-dimensional matrix. */static SBLASTMatrixStructure* BlastMatrixAllocate (Int2 alphabet_size);/* performs sump calculation, used by BlastSumPStd */static double BlastSumPCalc (int r, double s);#define COMMENT_CHR '#'#define TOKSTR " \t\n\r"#define BLAST_MAX_ALPHABET 40 /* ncbistdaa is only 26, this should be enough *//* How many of the first bases are not ambiguous (it's four, of course).*/#define NUMBER_NON_AMBIG_BP 4/* Used in BlastKarlinBlkGappedCalc */typedef double array_of_8[8];/* Used to temporarily store matrix values for retrieval. */typedef struct MatrixInfo { char* name; /* name of matrix (e.g., BLOSUM90). */ array_of_8 *values; /* The values (below). */ Int4 *prefs; /* Preferences for display. */ Int4 max_number_values; /* number of values (e.g., BLOSUM90_VALUES_MAX). */} MatrixInfo;/**************************************************************************************How the statistical parameters for the matrices are stored:-----------------------------------------------------------They parameters are stored in a two-dimensional array double (i.e., doubles, which has as it's first dimensions the number of different gap existence and extension combinations and as it's second dimension 8.The eight different columns specify:1.) gap existence penalty (INT2_MAX denotes infinite).2.) gap extension penalty (INT2_MAX denotes infinite).3.) decline to align penalty (INT2_MAX denotes infinite).4.) Lambda5.) K6.) H7.) alpha8.) beta(Items 4-8 are explained in:Altschul SF, Bundschuh R, Olsen R, Hwa T.The estimation of statistical parameters for local alignment score distributions.Nucleic Acids Res. 2001 Jan 15;29(2):351-61.).Take BLOSUM45 (below) as an example. Currently (5/17/02) there are14 different allowed combinations (specified by "#define BLOSUM45_VALUES_MAX 14").The first row in the array "blosum45_values" has INT2_MAX (i.e., 32767) for gap existence, extension, and decline-to-align penalties. For all practical purposesthis value is large enough to be infinite, so the alignments will be ungapped.BLAST may also use this value (INT2_MAX) as a signal to skip gapping, so adifferent value should not be used if the intent is to have gapless extensions.The next row is for the gap existence penalty 13 and the extension penalty 3.The decline-to-align penalty is only supported in a few cases, so it is normallyset to INT2_MAX.How to add a new matrix to blast_stat.c:--------------------------------------To add a new matrix to blast_stat.c it is necessary to complete four steps. As an example consider adding the matrixcalled TESTMATRIX1.) add a define specifying how many different existence and extensionspenalties are allowed, so it would be necessary to add the line:#define TESTMATRIX_VALUES_MAX 14if 14 values were to be allowed.2.) add a two-dimensional array to contain the statistical parameters:static double testmatrix_values[TESTMATRIX_VALUES_MAX][8] ={ ...3.) add a "prefs" array that should hint about the "optimal" gap existence and extension penalties:static Int4 testmatrix_prefs[TESTMATRIX_VALUES_MAX] = {BLAST_MATRIX_NOMINAL,...};4.) Go to the function BlastLoadMatrixValues (in this file) andadd two lines before the return at the end of the function: matrix_info = MatrixInfoNew("TESTMATRIX", testmatrix_values, testmatrix_prefs, TESTMATRIX_VALUES_MAX); ListNodeAddPointer(&retval, 0, matrix_info);***************************************************************************************/ #define BLOSUM45_VALUES_MAX 14static double blosum45_values[BLOSUM45_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2291, 0.0924, 0.2514, 0.9113, -5.7}, {13, 3, (double) INT2_MAX, 0.207, 0.049, 0.14, 1.5, -22}, {12, 3, (double) INT2_MAX, 0.199, 0.039, 0.11, 1.8, -34}, {11, 3, (double) INT2_MAX, 0.190, 0.031, 0.095, 2.0, -38}, {10, 3, (double) INT2_MAX, 0.179, 0.023, 0.075, 2.4, -51}, {16, 2, (double) INT2_MAX, 0.210, 0.051, 0.14, 1.5, -24}, {15, 2, (double) INT2_MAX, 0.203, 0.041, 0.12, 1.7, -31}, {14, 2, (double) INT2_MAX, 0.195, 0.032, 0.10, 1.9, -36}, {13, 2, (double) INT2_MAX, 0.185, 0.024, 0.084, 2.2, -45}, {12, 2, (double) INT2_MAX, 0.171, 0.016, 0.061, 2.8, -65}, {19, 1, (double) INT2_MAX, 0.205, 0.040, 0.11, 1.9, -43}, {18, 1, (double) INT2_MAX, 0.198, 0.032, 0.10, 2.0, -43}, {17, 1, (double) INT2_MAX, 0.189, 0.024, 0.079, 2.4, -57}, {16, 1, (double) INT2_MAX, 0.176, 0.016, 0.063, 2.8, -67},};static Int4 blosum45_prefs[BLOSUM45_VALUES_MAX] = {BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_BEST,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL};#define BLOSUM50_VALUES_MAX 16static double blosum50_values[BLOSUM50_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2318, 0.112, 0.3362, 0.6895, -4.0}, {13, 3, (double) INT2_MAX, 0.212, 0.063, 0.19, 1.1, -16}, {12, 3, (double) INT2_MAX, 0.206, 0.055, 0.17, 1.2, -18}, {11, 3, (double) INT2_MAX, 0.197, 0.042, 0.14, 1.4, -25}, {10, 3, (double) INT2_MAX, 0.186, 0.031, 0.11, 1.7, -34}, {9, 3, (double) INT2_MAX, 0.172, 0.022, 0.082, 2.1, -48}, {16, 2, (double) INT2_MAX, 0.215, 0.066, 0.20, 1.05, -15}, {15, 2, (double) INT2_MAX, 0.210, 0.058, 0.17, 1.2, -20}, {14, 2, (double) INT2_MAX, 0.202, 0.045, 0.14, 1.4, -27}, {13, 2, (double) INT2_MAX, 0.193, 0.035, 0.12, 1.6, -32}, {12, 2, (double) INT2_MAX, 0.181, 0.025, 0.095, 1.9, -41}, {19, 1, (double) INT2_MAX, 0.212, 0.057, 0.18, 1.2, -21}, {18, 1, (double) INT2_MAX, 0.207, 0.050, 0.15, 1.4, -28}, {17, 1, (double) INT2_MAX, 0.198, 0.037, 0.12, 1.6, -33}, {16, 1, (double) INT2_MAX, 0.186, 0.025, 0.10, 1.9, -42}, {15, 1, (double) INT2_MAX, 0.171, 0.015, 0.063, 2.7, -76},};static Int4 blosum50_prefs[BLOSUM50_VALUES_MAX] = {BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_BEST,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL};#define BLOSUM62_VALUES_MAX 12static double blosum62_values[BLOSUM62_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3176, 0.134, 0.4012, 0.7916, -3.2}, {11, 2, (double) INT2_MAX, 0.297, 0.082, 0.27, 1.1, -10}, {10, 2, (double) INT2_MAX, 0.291, 0.075, 0.23, 1.3, -15}, {9, 2, (double) INT2_MAX, 0.279, 0.058, 0.19, 1.5, -19}, {8, 2, (double) INT2_MAX, 0.264, 0.045, 0.15, 1.8, -26}, {7, 2, (double) INT2_MAX, 0.239, 0.027, 0.10, 2.5, -46}, {6, 2, (double) INT2_MAX, 0.201, 0.012, 0.061, 3.3, -58}, {13, 1, (double) INT2_MAX, 0.292, 0.071, 0.23, 1.2, -11}, {12, 1, (double) INT2_MAX, 0.283, 0.059, 0.19, 1.5, -19}, {11, 1, (double) INT2_MAX, 0.267, 0.041, 0.14, 1.9, -30}, {10, 1, (double) INT2_MAX, 0.243, 0.024, 0.10, 2.5, -44}, {9, 1, (double) INT2_MAX, 0.206, 0.010, 0.052, 4.0, -87},};static Int4 blosum62_prefs[BLOSUM62_VALUES_MAX] = { BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_BEST, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL,};#define BLOSUM80_VALUES_MAX 10static double blosum80_values[BLOSUM80_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3430, 0.177, 0.6568, 0.5222, -1.6}, {25, 2, (double) INT2_MAX, 0.342, 0.17, 0.66, 0.52, -1.6}, {13, 2, (double) INT2_MAX, 0.336, 0.15, 0.57, 0.59, -3}, {9, 2, (double) INT2_MAX, 0.319, 0.11, 0.42, 0.76, -6}, {8, 2, (double) INT2_MAX, 0.308, 0.090, 0.35, 0.89, -9}, {7, 2, (double) INT2_MAX, 0.293, 0.070, 0.27, 1.1, -14}, {6, 2, (double) INT2_MAX, 0.268, 0.045, 0.19, 1.4, -19}, {11, 1, (double) INT2_MAX, 0.314, 0.095, 0.35, 0.90, -9}, {10, 1, (double) INT2_MAX, 0.299, 0.071, 0.27, 1.1, -14}, {9, 1, (double) INT2_MAX, 0.279, 0.048, 0.20, 1.4, -19},};static Int4 blosum80_prefs[BLOSUM80_VALUES_MAX] = { BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_BEST, BLAST_MATRIX_NOMINAL};#define BLOSUM90_VALUES_MAX 8static double blosum90_values[BLOSUM90_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3346, 0.190, 0.7547, 0.4434, -1.4}, {9, 2, (double) INT2_MAX, 0.310, 0.12, 0.46, 0.67, -6}, {8, 2, (double) INT2_MAX, 0.300, 0.099, 0.39, 0.76, -7}, {7, 2, (double) INT2_MAX, 0.283, 0.072, 0.30, 0.93, -11}, {6, 2, (double) INT2_MAX, 0.259, 0.048, 0.22, 1.2, -16}, {11, 1, (double) INT2_MAX, 0.302, 0.093, 0.39, 0.78, -8}, {10, 1, (double) INT2_MAX, 0.290, 0.075, 0.28, 1.04, -15}, {9, 1, (double) INT2_MAX, 0.265, 0.044, 0.20, 1.3, -19},};static Int4 blosum90_prefs[BLOSUM90_VALUES_MAX] = { BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_NOMINAL, BLAST_MATRIX_BEST, BLAST_MATRIX_NOMINAL};#define PAM250_VALUES_MAX 16static double pam250_values[PAM250_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2252, 0.0868, 0.2223, 0.98, -5.0}, {15, 3, (double) INT2_MAX, 0.205, 0.049, 0.13, 1.6, -23}, {14, 3, (double) INT2_MAX, 0.200, 0.043, 0.12, 1.7, -26}, {13, 3, (double) INT2_MAX, 0.194, 0.036, 0.10, 1.9, -31}, {12, 3, (double) INT2_MAX, 0.186, 0.029, 0.085, 2.2, -41}, {11, 3, (double) INT2_MAX, 0.174, 0.020, 0.070, 2.5, -48}, {17, 2, (double) INT2_MAX, 0.204, 0.047, 0.12, 1.7, -28}, {16, 2, (double) INT2_MAX, 0.198, 0.038, 0.11, 1.8, -29}, {15, 2, (double) INT2_MAX, 0.191, 0.031, 0.087, 2.2, -44}, {14, 2, (double) INT2_MAX, 0.182, 0.024, 0.073, 2.5, -53}, {13, 2, (double) INT2_MAX, 0.171, 0.017, 0.059, 2.9, -64}, {21, 1, (double) INT2_MAX, 0.205, 0.045, 0.11, 1.8, -34}, {20, 1, (double) INT2_MAX, 0.199, 0.037, 0.10, 1.9, -35}, {19, 1, (double) INT2_MAX, 0.192, 0.029, 0.083, 2.3, -52}, {18, 1, (double) INT2_MAX, 0.183, 0.021, 0.070, 2.6, -60}, {17, 1, (double) INT2_MAX, 0.171, 0.014, 0.052, 3.3, -86},};static Int4 pam250_prefs[PAM250_VALUES_MAX] = {BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_BEST,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL};#define PAM30_VALUES_MAX 7static double pam30_values[PAM30_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3400, 0.283, 1.754, 0.1938, -0.3}, {7, 2, (double) INT2_MAX, 0.305, 0.15, 0.87, 0.35, -3}, {6, 2, (double) INT2_MAX, 0.287, 0.11, 0.68, 0.42, -4}, {5, 2, (double) INT2_MAX, 0.264, 0.079, 0.45, 0.59, -7}, {10, 1, (double) INT2_MAX, 0.309, 0.15, 0.88, 0.35, -3}, {9, 1, (double) INT2_MAX, 0.294, 0.11, 0.61, 0.48, -6}, {8, 1, (double) INT2_MAX, 0.270, 0.072, 0.40, 0.68, -10},};static Int4 pam30_prefs[PAM30_VALUES_MAX] = {BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_BEST,BLAST_MATRIX_NOMINAL,};#define PAM70_VALUES_MAX 7static double pam70_values[PAM70_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3345, 0.229, 1.029, 0.3250, -0.7}, {8, 2, (double) INT2_MAX, 0.301, 0.12, 0.54, 0.56, -5}, {7, 2, (double) INT2_MAX, 0.286, 0.093, 0.43, 0.67, -7}, {6, 2, (double) INT2_MAX, 0.264, 0.064, 0.29, 0.90, -12}, {11, 1, (double) INT2_MAX, 0.305, 0.12, 0.52, 0.59, -6}, {10, 1, (double) INT2_MAX, 0.291, 0.091, 0.41, 0.71, -9}, {9, 1, (double) INT2_MAX, 0.270, 0.060, 0.28, 0.97, -14},};static Int4 pam70_prefs[PAM70_VALUES_MAX] = {BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_NOMINAL,BLAST_MATRIX_BEST,BLAST_MATRIX_NOMINAL};#define BLOSUM62_20_VALUES_MAX 65static double blosum62_20_values[BLOSUM62_20_VALUES_MAX][8] = { {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.03391, 0.125, 0.4544, 0.07462, -3.2}, {100, 12, (double) INT2_MAX, 0.0300, 0.056, 0.21, 0.14, -15}, {95, 12, (double) INT2_MAX, 0.0291, 0.047, 0.18, 0.16, -20}, {90, 12, (double) INT2_MAX, 0.0280, 0.038, 0.15, 0.19, -28}, {85, 12, (double) INT2_MAX, 0.0267, 0.030, 0.13, 0.21, -31}, {80, 12, (double) INT2_MAX, 0.0250, 0.021, 0.10, 0.25, -39}, {105, 11, (double) INT2_MAX, 0.0301, 0.056, 0.22, 0.14, -16}, {100, 11, (double) INT2_MAX, 0.0294, 0.049, 0.20, 0.15, -17}, {95, 11, (double) INT2_MAX, 0.0285, 0.042, 0.16, 0.18, -25}, {90, 11, (double) INT2_MAX, 0.0271, 0.031, 0.14, 0.20, -28}, {85, 11, (double) INT2_MAX, 0.0256, 0.023, 0.10, 0.26, -46}, {115, 10, (double) INT2_MAX, 0.0308, 0.062, 0.22, 0.14, -20}, {110, 10, (double) INT2_MAX, 0.0302, 0.056, 0.19, 0.16, -26}, {105, 10, (double) INT2_MAX, 0.0296, 0.050, 0.17, 0.17, -27},
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -