📄 aligneval.c

📁 hmmer源程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved *  *     This source code is distributed under the terms of the *     GNU General Public License. See the files COPYING and LICENSE *     for details. *****************************************************************//* aligneval.c * RCS $Id: aligneval.c,v 1.6 2001/04/23 15:48:04 eddy Exp $ *  * Comparison of multiple alignments. Three functions are * provided, using subtly different scoring schemes: *    CompareMultAlignments()    - basic scoring scheme *    CompareRefMultAlignments() - only certain "canonical" columns  *                                 are scored *                                  * The similarity measure is a fractional alignment identity averaged * over all sequence pairs. The score for all pairs is: *      (identically aligned symbols) / (total aligned columns in  *      known alignment) *       * A column c is identically aligned for sequences i, j if: *    1) both i,j have a symbol aligned in column c, and the *       same pair of symbols is aligned somewhere in the test *       alignment *    2) S[i][c] is aligned to a gap in sequence j, and that symbol *       is aligned to a gap in the test alignment *    3) converse of 2) *     *     * The algorithm is as follows: *    1) For each known/test aligned pair of sequences (k1,k2 and t1,t2) *        construct a list for each sequence, in which for every *        counted symbol we record the raw index of the symbol in *        the other sequence that it aligns to, or -1 if it aligns *        to a gap or uncounted symbol. *         *    2)  Compare the list for k1 to the list for t1 and count an identity  *        for each correct alignment. *         *    3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym *       alignments count for 2; correct sym/gap alignments count for 1. *     *    4) The score is (identities from 2 + identities from 3) /  *       (totals from 2 + totals from 3). * * Written originally for koala's ss2 pairwise alignment package. *  * Sean Eddy, Sun Nov  1 12:45:11 1992 * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm * CVS $Id: aligneval.c,v 1.6 2001/04/23 15:48:04 eddy Exp $ */#include <stdio.h>#include <string.h>#include <ctype.h>#include "squid.h"#ifdef MEMDEBUG#include "dbmalloc.h"#endifstatic int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen);static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2, 			    int **ret_s1_list, int *ret_listlen);static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc);/* Function: ComparePairAlignments *  * Purpose:  Calculate and return a number representing how well two different alignments *           of a pair of sequences compare. The number is, roughly speaking, *           the fraction of columns which are identically aligned. *  *           For all columns c in which either known1[c] or known2[c]  *           is a non-gap, count an identity if those same symbols are *           aligned somewhere in calc1/calc2. The score is identities/total *           columns examined. (i.e. fully gapped columns don't count) *  *           more explicitly, identities come from: *             both known and test aligned pairs have the same symbol in the first sequence aligned to *               a gap in the second sequence; *             both known and test aligned pairs have the same symbol in the second sequence *               aligned to a gap in the first sequence; *             the known alignment has symbols aligned at this column, and the test *               alignment aligns the same two symbols. *  * Args:     known1, known2: trusted alignment of two sequences *           calc1, calc2:   test alignment of two sequences *   * Return:   Returns -1.0 on internal failure. */floatComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2){  int *klist1;  int *klist2;  int *tlist1;  int *tlist2;  int len1, len2;  float score;  if (! make_alilist(calc1,  calc2,  &tlist1, &len1)) return -1.0;  if (! make_alilist(calc2,  calc1,  &tlist2, &len2)) return -1.0;  if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0;  if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0;  if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;    free(klist1);  free(klist2);  free(tlist1);  free(tlist2);  return score;}/* Function: CompareRefPairAlignments() *  * Same as above, but the only columns that count are the ones * with indices in *refcoord. *refcoord and the known1, known2 * pair must be in sync with each other (come from the same * multiple sequence alignment) * * Args:     ref           - 0..alen-1 array of 1 or 0  *           known1,known2 - trusted alignment *           calc1, calc2  - test alignment            * * Return:  the fractional alignment identity on success, -1.0 on failure. */floatCompareRefPairAlignments(int  *ref, char *known1, char *known2, char *calc1, char *calc2){  int *klist1;  int *klist2;  int *tlist1;  int *tlist2;  int len1, len2;  float score;  if (! make_ref_alilist(ref, known1, known2, calc1,  calc2,  &tlist1, &len1)) return -1.0;  if (! make_ref_alilist(ref, known2, known1, calc2,  calc1,  &tlist2, &len2)) return -1.0;  if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0;  if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0;  if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;    free(klist1);  free(klist2);  free(tlist1);  free(tlist2);  return score;}/* Function: make_alilist() *  * Purpose:  Construct a list (array) mapping the raw symbols of s1 *           onto the indexes of the aligned symbols in s2 (or -1 *           for gaps in s2). The list (s1_list) will be of the *           length of s1's raw sequence. *            * Args:     s1          - sequence to construct the list for *           s2          - sequence s1 is aligned to *           ret_s1_list - RETURN: the constructed list (caller must free) *           ret_listlen - RETURN: length of the list *            * Returns:  1 on success, 0 on failure */static intmake_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen){  int *s1_list;  int  col;			/* column position in alignment */  int  r1, r2;			/* raw symbol index at current col in s1, s2 */    /* Malloc for s1_list. It can't be longer than s1 itself; we just malloc   * for that (and waste a wee bit of space)   */  s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));  r1 = r2 = 0;  for (col = 0; s1[col] != '\0'; col++)    {      /* symbol in s1? Record what it's aligned to, and bump       * the r1 counter.       */      if (! isgap(s1[col]))	{	  s1_list[r1] = isgap(s2[col]) ? -1 : r2;	  r1++;	}      /* symbol in s2? bump the r2 counter       */      if (! isgap(s2[col]))	r2++;    }  *ret_listlen = r1;  *ret_s1_list = s1_list;  return 1;}/* Function: make_ref_alilist() *  * Purpose:  Construct a list (array) mapping the raw symbols of s1 *           which are under canonical columns of the ref alignment *           onto the indexes of the aligned symbols in s2 (or -1 *           for gaps in s2 or noncanonical symbols in s2).  *            * Args:     ref:        - array of indices of canonical coords (1 canonical, 0 non) *           k1          - s1's known alignment (w/ respect to refcoords) *           k2          - s2's known alignment (w/ respect to refcoords) *           s1          - sequence to construct the list for *           s2          - sequence s1 is aligned to *           ret_s1_list - RETURN: the constructed list (caller must free) *           ret_listlen - RETURN: length of the list *            * Returns:  1 on success, 0 on failure *//*ARGSUSED*/static intmake_ref_alilist(int *ref, char *k1, char *k2,		 char *s1, char *s2, int **ret_s1_list, int *ret_listlen){  int *s1_list;  int  col;			/* column position in alignment */  int  r1, r2;			/* raw symbol index at current col in s1, s2 */  int *canons1;			/* flag array, 1 if position i in s1 raw seq is canonical */  int  lpos;			/* position in list */    /* Allocations. No arrays can exceed the length of their   * appropriate parent (s1 or s2)   */  s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));  canons1 = (int *) MallocOrDie (sizeof(int) * strlen(s1));  /* First we use refcoords and k1,k2 to construct an array of 1's    * and 0's, telling us whether s1's raw symbol number i is countable.   * It's countable simply if it's under a canonical column.   */  r1 =  0;  for (col = 0; k1[col] != '\0'; col++)    {      if (! isgap(k1[col]))	{	  canons1[r1] = ref[col] ? 1 : 0;	  r1++;	}    }  /* Now we can construct the list. We don't count pairs if the sym in s1   * is non-canonical.
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -