📄 malign.c

📁 生物序列比对程序clustw的源代码
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
#include <stdio.h>#include <string.h>#include <ctype.h>#include <stdlib.h>#include "clustalw.h"/* *       Prototypes *//* *       Global Variables */extern double  **tmat;extern Boolean no_weights;extern sint     debug;extern sint     max_aa;extern sint     nseqs;extern sint     profile1_nseqs;extern sint     nsets;extern sint     **sets;extern sint     divergence_cutoff;extern sint     *seq_weight;extern sint     output_order, *output_index;extern Boolean distance_tree;extern char    seqname[];extern sint     *seqlen_array;extern char    **seq_array;sint malign(sint istart,char *phylip_name) /* full progressive alignment*/{   static 	sint *aligned;   static 	sint *group;   static 	sint ix;   sint 	*maxid, max, sum;   sint		*tree_weight;   sint 		i,j,set,iseq=0;   sint 		status,entries;   lint		score = 0;   info("Start of Multiple Alignment");/* get the phylogenetic tree from *.ph */   if (nseqs >= 2)      {       status = read_tree(phylip_name, (sint)0, nseqs);       if (status == 0) return((sint)0);     }/* calculate sequence weights according to branch lengths of the tree -   weights in global variable seq_weight normalised to sum to 100 */   calc_seq_weights((sint)0, nseqs, seq_weight);/* recalculate tmat matrix as percent similarity matrix */   status = calc_similarities(nseqs);   if (status == 0) return((sint)0);/* for each sequence, find the most closely related sequence */   maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));   for (i=1;i<=nseqs;i++)     {         maxid[i] = -1;         for (j=1;j<=nseqs;j++)            if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];     }/* group the sequences according to their relative divergence */   if (istart == 0)     {        sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) );        for(i=0;i<=nseqs;i++)           sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );        create_sets((sint)0,nseqs);        info("There are %d groups",(pint)nsets);/* clear the memory used for the phylogenetic tree */        if (nseqs >= 2)             clear_tree(NULL);/* start the multiple alignments.........  */        info("Aligning...");/* first pass, align closely related sequences first.... */        ix = 0;        aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );        for (i=0;i<=nseqs;i++) aligned[i] = 0;        for(set=1;set<=nsets;++set)         {          entries=0;          for (i=1;i<=nseqs;i++)            {               if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))                 {                    entries++;                    if  (aligned[i] == 0)                       {                          if (output_order==INPUT)                            {                              ++ix;                              output_index[i] = i;                            }                          else output_index[++ix] = i;                          aligned[i] = 1;                       }                 }            }          if(entries > 0) score = prfalign(sets[set], aligned);          else score=0.0;/* negative score means fatal error... exit now!  */          if (score < 0)              {                return(-1);             }          if ((entries > 0) && (score > 0))             info("Group %d: Sequences:%4d      Score:%d",             (pint)set,(pint)entries,(pint)score);          else             info("Group %d:                     Delayed",             (pint)set);        }        for (i=0;i<=nseqs;i++)          sets[i]=ckfree((void *)sets[i]);        sets=ckfree(sets);     }   else     {/* clear the memory used for the phylogenetic tree */        if (nseqs >= 2)             clear_tree(NULL);        aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );        ix = 0;        for (i=1;i<=istart+1;i++)         {           aligned[i] = 1;           ++ix;           output_index[i] = i;         }        for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;     }/* second pass - align remaining, more divergent sequences..... *//* if not all sequences were aligned, for each unaligned sequence,   find it's closest pair amongst the aligned sequences.  */    group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));    tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );    for (i=0;i<nseqs;i++)   		tree_weight[i] = seq_weight[i];/* if we haven't aligned any sequences, in the first pass - align thetwo most closely related sequences now */   if(ix==0)     {        max = -1;	iseq = 0;        for (i=1;i<=nseqs;i++)          {             for (j=i+1;j<=nseqs;j++)               {                  if (max < tmat[i][j])		  {                     max = tmat[i][j];                     iseq = i;                  }              }          }        aligned[iseq]=1;        if (output_order == INPUT)          {            ++ix;            output_index[iseq] = iseq;          }         else            output_index[++ix] = iseq;     }    while (ix < nseqs)      {             for (i=1;i<=nseqs;i++) {                if (aligned[i] == 0)                  {                     maxid[i] = -1;                     for (j=1;j<=nseqs;j++)                         if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))                            maxid[i] = tmat[i][j];                  }              }/* find the most closely related sequence to those already aligned */            max = -1;	    iseq = 0;            for (i=1;i<=nseqs;i++)              {                if ((aligned[i] == 0) && (maxid[i] > max))                  {                     max = maxid[i];                     iseq = i;                  }              }/* align this sequence to the existing alignment *//* weight sequences with percent identity with profile*//* OR...., multiply sequence weights from tree by percent identity with new sequence */   if(no_weights==FALSE) {   for (j=0;j<nseqs;j++)       if (aligned[j+1] != 0)              seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];/*  Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR*/         sum = 0;         for (j=0;j<nseqs;j++)           if (aligned[j+1] != 0)            sum += seq_weight[j];         if (sum == 0)          {           for (j=0;j<nseqs;j++)                seq_weight[j] = 1;                sum = j;          }         for (j=0;j<nseqs;j++)           if (aligned[j+1] != 0)             {               seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;               if (seq_weight[j] < 1) seq_weight[j] = 1;             }	}         entries = 0;         for (j=1;j<=nseqs;j++)           if (aligned[j] != 0)              {                 group[j] = 1;                 entries++;              }           else if (iseq==j)              {                 group[j] = 2;                 entries++;              }         aligned[iseq] = 1;         score = prfalign(group, aligned);         info("Sequence:%d     Score:%d",(pint)iseq,(pint)score);         if (output_order == INPUT)          {            ++ix;            output_index[iseq] = iseq;          }         else            output_index[++ix] = iseq;      }   group=ckfree((void *)group);   aligned=ckfree((void *)aligned);   maxid=ckfree((void *)maxid);   tree_weight=ckfree((void *)tree_weight);   aln_score();/* make the rest (output stuff) into routine clustal_out in file amenu.c */   return(nseqs);}sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */{   static 	sint *aligned, *tree_weight;   static 	sint *group;   static 	sint ix;   sint 	*maxid, max;   sint 		i,j,status,iseq;   sint 		sum,entries;   lint		score = 0;   info("Start of Multiple Alignment");/* get the phylogenetic tree from *.ph */   if (nseqs >= 2)      {       status = read_tree(phylip_name, (sint)0, nseqs);       if (status == 0) return(0);     }/* calculate sequence weights according to branch lengths of the tree -   weights in global variable seq_weight normalised to sum to 100 */   calc_seq_weights((sint)0, nseqs, seq_weight);      tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );   for (i=0;i<nseqs;i++)   		tree_weight[i] = seq_weight[i];/* recalculate tmat matrix as percent similarity matrix */   status = calc_similarities(nseqs);   if (status == 0) return((sint)0);/* for each sequence, find the most closely related sequence */
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -