📄 blast_stat.h
字号:
/* * =========================================================================== * PRODUCTION $Log: blast_stat.h,v $ * PRODUCTION Revision 1000.6 2004/06/01 18:03:52 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.45 * PRODUCTION * =========================================================================== *//* $Id: blast_stat.h,v 1000.6 2004/06/01 18:03:52 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Tom Madden * *//** @file blast_stat.h * Definitions and prototypes used by blast_stat.c to calculate BLAST * statistics. @todo FIXME: needs doxygen comments */#ifndef __BLAST_STAT__#define __BLAST_STAT__#include <algo/blast/core/blast_def.h>#include <algo/blast/core/blast_message.h>#ifdef __cplusplusextern "C" {#endif/* Defines for the matrix 'preferences' (as specified by S. Altschul).*/#define BLAST_MATRIX_NOMINAL 0#define BLAST_MATRIX_PREFERRED 1#define BLAST_MATRIX_BEST 2/****************************************************************************For more accuracy in the calculation of K, set K_SUMLIMIT to 0.00001.For high speed in the calculation of K, use a K_SUMLIMIT of 0.001Note: statistical significance is often not greatly affected by the valueof K, so high accuracy is generally unwarranted.*****************************************************************************//* K_SUMLIMIT_DEFAULT == sumlimit used in BlastKarlinLHtoK() */#define BLAST_KARLIN_K_SUMLIMIT_DEFAULT 0.0001/* LAMBDA_ACCURACY_DEFAULT == accuracy to which Lambda should be calc'd */#define BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT (1.e-5)/* LAMBDA_ITER_DEFAULT == no. of iterations in LambdaBis = ln(accuracy)/ln(2)*/#define BLAST_KARLIN_LAMBDA_ITER_DEFAULT 17/* Initial guess for the value of Lambda in BlastKarlinLambdaNR */#define BLAST_KARLIN_LAMBDA0_DEFAULT 0.5#define BLAST_KARLIN_K_ITER_MAX 100#define BLAST_SUMP_EPSILON_DEFAULT 0.002 /* accuracy for SumP calculations *//* Where are the BLAST matrices located?*/#define BLASTMAT_DIR "/usr/ncbi/blast/matrix"/************************************************************************* Structure to the Karlin-Blk parameters. This structure was (more or less) copied from the old karlin.h.**************************************************************************/typedef struct Blast_KarlinBlk { double Lambda; /* Lambda value used in statistics */ double K, logK; /* K value used in statistics */ double H; /* H value used in statistics */ double paramC; /* for use in seed. */ } Blast_KarlinBlk;/******************************************************************** Structures relating to scoring or the BlastScoreBlk********************************************************************//*SCORE_MIN is (-2**31 + 1)/2 because it has been observed more than once thata compiler did not properly calculate the quantity (-2**31)/2. The listof compilers which failed this operation have at least at some time included:NeXT and a version of AIX/370's MetaWare High C R2.1r.For this reason, SCORE_MIN is not simply defined to be LONG_MIN/2.*/#define BLAST_SCORE_MIN INT2_MIN#define BLAST_SCORE_MAX INT2_MAX#if defined(OS_DOS) || defined(OS_MAC)#define BLAST_SCORE_1MIN (-100)#define BLAST_SCORE_1MAX ( 100)#else#define BLAST_SCORE_1MIN (-10000)#define BLAST_SCORE_1MAX ( 10000)#endif#define BLAST_SCORE_RANGE_MAX (BLAST_SCORE_1MAX - BLAST_SCORE_1MIN)typedef struct Blast_ScoreFreq { Int4 score_min, score_max; Int4 obs_min, obs_max; double score_avg; double* sprob0,* sprob;} Blast_ScoreFreq;#define BLAST_MATRIX_SIZE 32/* Remove me */typedef struct SBLASTMatrixStructure { Int4 *matrix[BLAST_MATRIX_SIZE]; Int4 long_matrix[BLAST_MATRIX_SIZE*BLAST_MATRIX_SIZE]; /* not used */} SBLASTMatrixStructure;typedef struct BlastScoreBlk { Boolean protein_alphabet; /* TRUE if alphabet_code is for a protein alphabet (e.g., ncbistdaa etc.), FALSE for nt. alphabets. */ Uint1 alphabet_code; /* NCBI alphabet code. */ Int2 alphabet_size; /* size of alphabet. */ Int2 alphabet_start; /* numerical value of 1st letter. */ SBLASTMatrixStructure* matrix_struct; /* Holds info about matrix. */ Int4 **matrix; /* Substitution matrix */ Int4 **posMatrix; /* Sub matrix for position depend BLAST. */ double karlinK; /* Karlin-Altschul parameter associated with posMatrix */ Int2 mat_dim1, mat_dim2; /* dimensions of matrix. */ Int4 *maxscore; /* Max. score for each letter */ Int4 loscore, hiscore; /* Min. & max. substitution scores */ Int4 penalty, reward; /* penalty and reward for blastn. */ double scale_factor; /* multiplier for all cutoff and dropoff scores */ Boolean read_in_matrix; /* If TRUE, matrix is read in, otherwise produce one from penalty and reward above. */ Blast_ScoreFreq** sfp; /* score frequencies. */ double **posFreqs; /*matrix of position specific frequencies*/ /* kbp & kbp_gap are ptrs that should be set to kbp_std, kbp_psi, etc. */ Blast_KarlinBlk** kbp; /* Karlin-Altschul parameters. */ Blast_KarlinBlk** kbp_gap; /* K-A parameters for gapped alignments. */ /* Below are the Karlin-Altschul parameters for non-position based ('std') and position based ('psi') searches. */ Blast_KarlinBlk **kbp_std, **kbp_psi, **kbp_gap_std, **kbp_gap_psi; Blast_KarlinBlk* kbp_ideal; /* Ideal values (for query with average database composition). */ Int4 number_of_contexts; /* Used by sfp and kbp, how large are these*/ char* name; /* name of matrix. */ Uint1* ambiguous_res; /* Array of ambiguous res. (e.g, 'X', 'N')*/ Int2 ambig_size, /* size of array above. */ ambig_occupy; /* How many occupied? */ ListNode* comments; /* Comments about matrix. */ Int4 query_length; /* the length of the query. */ Int8 effective_search_sp; /* product of above two */} BlastScoreBlk;typedef struct Blast_ResFreq { Uint1 alphabet_code; double* prob; /* probs, (possible) non-zero offset. */ double* prob0; /* probs, zero offset. */} Blast_ResFreq;BlastScoreBlk* BlastScoreBlkNew (Uint1 alphabet, Int4 number_of_contexts);Int2 BlastScoreBlkMatrixLoad(BlastScoreBlk* sbp);BlastScoreBlk* BlastScoreBlkFree (BlastScoreBlk* sbp);Int2 BLAST_ScoreSetAmbigRes (BlastScoreBlk* sbp, char ambiguous_res);Int2 BLAST_ScoreBlkFill (BlastScoreBlk* sbp, char* string, Int4 length, Int4 context_number); /** This function fills in the BlastScoreBlk structure. * Tasks are: * -read in the matrix * -set maxscore * @param sbp Scoring block [in] [out] * @param matrix Full path to the matrix in the directory structure [in]*/Int2 BLAST_ScoreBlkMatFill (BlastScoreBlk* sbp, char* matrix); /* Functions taken from the OLD karlin.c*/Blast_KarlinBlk* Blast_KarlinBlkCreate (void);/** Deallocates the KarlinBlk * @param kbp KarlinBlk to be deallocated [in]*/Blast_KarlinBlk* Blast_KarlinBlkDestruct(Blast_KarlinBlk* kbp);Int2 Blast_KarlinBlkGappedCalc (Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, char* matrix_name, Blast_Message** error_return);/** Calculates the standard Karlin parameters. This is used * if the query is translated and the calculated (real) Karlin * parameters are bad, as they're calculated for non-coding regions. * @param sbp ScoreBlk used to calculate "ideal" values. [in]*/Blast_KarlinBlk* Blast_KarlinBlkIdealCalc(BlastScoreBlk* sbp);Int2 Blast_KarlinBlkStandardCalc(BlastScoreBlk* sbp, Int4 context_start, Int4 context_end);/* Attempts to fill KarlinBlk for given gap opening, extensions etc. Will return non-zero status if that fails. return values: -1 if matrix_name is NULL; 1 if matrix not found 2 if matrix found, but open, extend etc. values not supported.*/Int2 Blast_KarlinkGapBlkFill(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, char* matrix_name);/* Prints a messages about the allowed matrices, BlastKarlinkGapBlkFill should return 1 before this is called. */char* BLAST_PrintMatrixMessage(const char *matrix);/* Prints a messages about the allowed open etc values for the given matrix, BlastKarlinkGapBlkFill should return 2 before this is called. */char* BLAST_PrintAllowedValues(const char *matrix, Int4 gap_open, Int4 gap_extend, Int4 decline_align);/** Calculates the parameter Lambda given an initial guess for its value */doubleBlast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess);/** Calculates the Expect value based upon the search space and some Karlin-Altschul * parameters. It is "simple" as it does not use sum-statistics. * @param S the score of the alignment. [in] * @param kbp the Karlin-Altschul parameters. [in] * @param searchsp total search space to be used [in] * @return the expect value */double BLAST_KarlinStoE_simple (Int4 S, Blast_KarlinBlk* kbp, Int8 searchsp);double BLAST_GapDecayDivisor(double decayrate, unsigned nsegs );/** Calculate the cutoff score from the expected number of HSPs or vice versa. * @param S The calculated score [in] [out] * @param E The calculated e-value [in] [out] * @param kbp The Karlin-Altschul statistical parameters [in] * @param searchsp The effective search space [in] * @param dodecay Use gap decay feature? [in] * @param gap_decay_rate Gap decay rate to use, if dodecay is set [in] */Int2 BLAST_Cutoffs (Int4 *S, double* E, Blast_KarlinBlk* kbp, Int8 searchsp, Boolean dodecay, double gap_decay_rate);/* Functions to calculate SumE (for large and small gaps). */double BLAST_SmallGapSumE (Blast_KarlinBlk* kbp, Int4 gap, Int2 num, double xsum, Int4 query_length, Int4 subject_length, double weight_divisor);double BLAST_UnevenGapSumE (Blast_KarlinBlk* kbp, Int4 p_gap, Int4 n_gap, Int2 num, double xsum, Int4 query_length, Int4 subject_length, double weight_divisor);double BLAST_LargeGapSumE (Blast_KarlinBlk* kbp, Int2 num, double xsum, Int4 query_length, Int4 subject_length, double weight_divisor );/*Obtains arrays of the allowed opening and extension penalties for gapped BLAST forthe given matrix. Also obtains arrays of Lambda, K, and H. The pref_flags field isused for display purposes, with the defines: BLAST_MATRIX_NOMINAL, BLAST_MATRIX_PREFERRED, andBLAST_MATRIX_BEST.Any of these fields thatare not required should be set to NULL. The Int2 return value is the length of thearrays.*/void BLAST_GetAlphaBeta (char* matrixName, double *alpha,double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend);Int4 ** RPSCalculatePSSM(double scalingFactor, Int4 rps_query_length, Uint1 * rps_query_seq, Int4 db_seq_length, Int4 **posMatrix);Int4BLAST_ComputeLengthAdjustment(double K, double logK, double alpha_d_lambda, double beta, Int4 query_length, Int8 db_length, Int4 db_num_seqs, Int4 * length_adjustment);/** Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp. * @param sbp The BlastScoreBlk* used to init prob [in]*/Blast_ResFreq* Blast_ResFreqNew(const BlastScoreBlk* sbp);/** Deallocates Blast_ResFreq and prob0 element. * @param rfp the Blast_ResFreq to be deallocated.*/Blast_ResFreq* Blast_ResFreqDestruct(Blast_ResFreq* rfp);/** Calculates residues frequencies given a standard distribution. * @param sbp the BlastScoreBlk provides information on alphabet. * @param rfp the prob element on this Blast_ResFreq is used.*/Int2 Blast_ResFreqStdComp(const BlastScoreBlk* sbp, Blast_ResFreq* rfp);/** Creates a new structure to keep track of score frequencies for a scoring * system. * @param score_min Minimum score [in] * @param score_max Maximum score [in] */Blast_ScoreFreq*Blast_ScoreFreqNew(Int4 score_min, Int4 score_max);/** Deallocates the score frequencies structure * @param sfp the structure to deallocate [in] * @return NULL */Blast_ScoreFreq*Blast_ScoreFreqDestruct(Blast_ScoreFreq* sfp);/** Fills a buffer with the 'standard' alphabet * (given by STD_AMINO_ACID_FREQS[index].ch). * * @return Number of residues in alphabet or negative returns upon error. */Int2Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues, Uint4 residues_size);/* Please see comment on blast_stat.c */Int2Blast_KarlinBlkCalc(Blast_KarlinBlk* kbp, Blast_ScoreFreq* sfp);#ifdef __cplusplus}#endif#endif /* !__BLAST_STAT__ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -