⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trees.c

📁 clustalw1.83.DOS.ZIP,用于多序列比对的软件
💻 C
📖 第 1 页 / 共 5 页
字号:
/*.................compute SMATij values and find the smallest one ........*/

		for(jj=2; jj<=last_seq-first_seq+1; ++jj) 
			if(tkill[jj] != 1) 
				for(ii=1; ii<jj; ++ii)
					if(tkill[ii] != 1) {
						diq = djq = 0.0;

						for(i=1; i<=last_seq-first_seq+1; ++i) {
							diq = diq + tmat[i][ii];
							djq = djq + tmat[i][jj];
						}

						dij = tmat[ii][jj];
						d2r = diq + djq - (2.0*dij);
						dr  = sumd - dij -d2r;
						fnseqs2 = fnseqs - 2.0;
					        total= d2r+ fnseqs2*dij +dr*2.0;
						total= total / (2.0*fnseqs2);

						if(total < tmin) {
							tmin = total;
							mini = ii;
							minj = jj;
						}
					}
		

/*.................compute branch lengths and print the results ........*/


		dio = djo = 0.0;
		for(i=1; i<=last_seq-first_seq+1; ++i) {
			dio = dio + tmat[i][mini];
			djo = djo + tmat[i][minj];
		}

		dmin = tmat[mini][minj];
		dio = (dio - dmin) / fnseqs2;
		djo = (djo - dmin) / fnseqs2;
		bi = (dmin + dio - djo) * 0.5;
		bj = dmin - bi;
		bi = bi - av[mini];
		bj = bj - av[minj];

		if( av[mini] > 0.0 )
			typei = 0;
		else
			typei = 1;
		if( av[minj] > 0.0 )
			typej = 0;
		else
			typej = 1;

		if(verbose) 
	 	    fprintf(tree,"\n Cycle%4d     = ",(pint)nc);

/* 
   set negative branch lengths to zero.  Also set any tiny positive
   branch lengths to zero.
*/		if( fabs(bi) < 0.0001) bi = 0.0;
		if( fabs(bj) < 0.0001) bj = 0.0;

	    	if(verbose) {
		    if(typei == 0) 
			fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);
		    else 
			fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);

		    if(typej == 0) 
			fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);
		    else 
			fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);

		    fprintf(tree,"\n");
	    	}	


	    	left_branch[nc] = bi;
	    	right_branch[nc] = bj;

		for(i=1; i<=last_seq-first_seq+1; i++)
			tree_description[nc][i] = 0;

	     	if(typei == 0) { 
			for(i=nc-1; i>=1; i--)
				if(tree_description[i][mini] == 1) {
					for(j=1; j<=last_seq-first_seq+1; j++)  
					     if(tree_description[i][j] == 1)
						    tree_description[nc][j] = 1;
					break;
				}
		}
		else
			tree_description[nc][mini] = 1;

		if(typej == 0) {
			for(i=nc-1; i>=1; i--) 
				if(tree_description[i][minj] == 1) {
					for(j=1; j<=last_seq-first_seq+1; j++)  
					     if(tree_description[i][j] == 1)
						    tree_description[nc][j] = 1;
					break;
				}
		}
		else
			tree_description[nc][minj] = 1;
			

/* 
   Here is where the -0.00005 branch lengths come from for 3 or more
   identical seqs.
*/
/*		if(dmin <= 0.0) dmin = 0.0001; */
                if(dmin <= 0.0) dmin = 0.000001;
		av[mini] = dmin * 0.5;

/*........................Re-initialisation................................*/

		fnseqs = fnseqs - 1.0;
		tkill[minj] = 1;

		for(j=1; j<=last_seq-first_seq+1; ++j) 
			if( tkill[j] != 1 ) {
				da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;
				if( (mini - j) < 0 ) 
					tmat[mini][j] = da;
				if( (mini - j) > 0)
					tmat[j][mini] = da;
			}

		for(j=1; j<=last_seq-first_seq+1; ++j)
			tmat[minj][j] = tmat[j][minj] = 0.0;


/****/	}						/**end main cycle**/

/******************************Last Cycle (3 Seqs. left)********************/

	nude = 1;

	for(i=1; i<=last_seq-first_seq+1; ++i)
		if( tkill[i] != 1 ) {
			l[nude] = i;
			nude = nude + 1;
		}

	b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
	b2 =  tmat[l[1]][l[2]] - b1;
	b3 =  tmat[l[1]][l[3]] - b1;
 
	branch[1] = b1 - av[l[1]];
	branch[2] = b2 - av[l[2]];
	branch[3] = b3 - av[l[3]];

/* Reset tiny negative and positive branch lengths to zero */
	if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
	if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
	if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;

	left_branch[last_seq-first_seq+1-2] = branch[1];
	left_branch[last_seq-first_seq+1-1] = branch[2];
	left_branch[last_seq-first_seq+1]   = branch[3];

	for(i=1; i<=last_seq-first_seq+1; i++)
		tree_description[last_seq-first_seq+1-2][i] = 0;

	if(verbose)
		fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);

	for(i=1; i<=3; ++i) {
	   if( av[l[i]] > 0.0) {
	      	if(verbose)
	      	    fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);
		for(k=last_seq-first_seq+1-3; k>=1; k--)
			if(tree_description[k][l[i]] == 1) {
				for(j=1; j<=last_seq-first_seq+1; j++)
				 	if(tree_description[k][j] == 1)
					    tree_description[last_seq-first_seq+1-2][j] = i;
				break;
			}
	   }
	   else  {
	      	if(verbose)
	   	    fprintf(tree,"\n\t\t  SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);
		tree_description[last_seq-first_seq+1-2][l[i]] = i;
	   }
	   if(i < 3) {
	      	if(verbose)
	            fprintf(tree,"joins");
	   }
	}

	if(verbose)
		fprintf(tree,"\n");

}

#else /* ORIGINAL_NJ_TREE */

void nj_tree(char **tree_description, FILE *tree) {
	void fast_nj_tree();
 
	/*fprintf(stderr, "****** call fast_nj_tree() !!!! ******\n");*/
	fast_nj_tree(tree_description, tree);
}


/****************************************************************************
 * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
 *						written by Tadashi Koike
 *						(takoike@genes.nig.ac.jp)
 *******************
 * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
 *                   and use again and again.
 *
 *	In the main cycle, these are calculated again and again :
 *	    diq = sum of tmat[n][ii]   (n:1 to last_seq-first_seq+1),
 *	    djq = sum of tmat[n][jj]   (n:1 to last_seq-first_seq+1),
 *	    dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
 *	    djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
 *		// 'last_seq' and 'first_seq' are both constant values //
 *	and the result of above calculations is always same until 
 *	a best pair of neighbour nodes is joined.
 *
 *	So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
 *	(n:1 to last_seq-first_seq+1)) and store it to array, before
 *	beginning to find a best pair of neighbour nodes, and after that 
 *	we use them again and again.
 *
 *	    tmat[i][j]
 *	              1   2   3   4   5
 *	            +---+---+---+---+---+
 *	          1 |   |   |   |   |   |
 *	            +---+---+---+---+---+
 *	          2 |   |   |   |   |   |  1) calculate sum of tmat[n][i]
 *	            +---+---+---+---+---+        (n: 1 to last_seq-first_seq+1)
 *	          3 |   |   |   |   |   |  2) store that sum value to sum[i]
 *	            +---+---+---+---+---+
 *	          4 |   |   |   |   |   |  3) use sum[i] during finding a best
 *	            +---+---+---+---+---+     pair of neibour nodes.
 *	          5 |   |   |   |   |   |
 *	            +---+---+---+---+---+
 *	              |   |   |   |   |
 *	              V   V   V   V   V  Calculate sum , and store it to sum[i]
 *	            +---+---+---+---+---+
 *	     sum[i] |   |   |   |   |   |
 *	            +---+---+---+---+---+
 *
 *	At this time, we thought that we use upper triangle of the matrix
 *	because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal 
 *	to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead 
 *	of sum[i] for storing the sum value.
 *
 *	    tmat[i][j]
 *	              1   2   3   4   5     sum_cols[i]
 *	            +---+---+---+---+---+     +---+
 *	          1     | # | # | # | # | --> |   | ... sum of tmat[1][2..5]
 *	            + - +---+---+---+---+     +---+
 *	          2         | # | # | # | --> |   | ... sum of tmat[2][3..5]
 *	            + - + - +---+---+---+     +---+
 *	          3             | # | # | --> |   | ... sum of tmat[3][4..5]
 *	            + - + - + - +---+---+     +---+
 *	          4                 | # | --> |   | ... sum of tmat[4][5]
 *	            + - + - + - + - +---+     +---+
 *	          5                     | --> |   | ... zero
 *	            + - + - + - + - + - +     +---+
 *	              |   |   |   |   |
 *	              V   V   V   V   V  Calculate sum , sotre to sum[i]
 *	            +---+---+---+---+---+
 *	sum_rows[i] |   |   |   |   |   |
 *	            +---+---+---+---+---+
 *	              |   |   |   |   |
 *	              |   |   |   |   +----- sum of tmat[1..4][5]
 *	              |   |   |   +--------- sum of tmat[1..3][4]
 *	              |   |   +------------- sum of tmat[1..2][3]
 *	              |   +----------------- sum of tmat[1][2]
 *	              +--------------------- zero
 *
 *	And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
 *
 *******************
 * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
 *                   tkill[i] flag array.
 *
 *	In original logic, invalid(killed?) nodes after nodes-joining
 *	are managed with tkill[i] flag array (set to 1 when killed).
 *	By this method, it is conspicuous to try next node but skip it
 *	at the latter of finding a best pair of neighbor nodes.
 *
 *	So, we thought that we managed valid nodes by using a chain list 
 *	as below:
 *
 *	1) declare the list structure.
 *		struct {
 *		    sint n;		// entry number of node.
 *		    void *prev;		// pointer to previous entry.
 *		    void *next;		// pointer to next entry.
 *		}
 *	2) construct a valid node list.
 *
 *       +-----+    +-----+    +-----+    +-----+        +-----+
 * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
 *       |  0  |    |  1  |    |  2  |    |  3  |        |  n  |
 *       | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
 *       +-----+    +-----+    +-----+    +-----+        +-----+
 *
 *	3) when finding a best pair of neighbor nodes, we use
 *	   this chain list as loop counter.
 *
 *	4) If an entry was killed by node-joining, this chain list is
 *	   modified to remove that entry.
 *
 *	   EX) remove the entry No 2.
 *       +-----+    +-----+               +-----+        +-----+
 * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
 *       |  0  |    |  1  |               |  3  |        |  n  |
 *       | next|--->| next|-------------->| next|- - - ->| next|->NULL
 *       +-----+    +-----+               +-----+        +-----+
 *                             +-----+
 *                       NULL<-|prev |
 *                             |  2  |
 *                             | next|->NULL
 *                             +-----+
 *
 *	By this method, speed is up at the latter of finding a best pair of
 *	neighbor nodes.
 *
 *******************
 * <IMPROVEMENT 3> : Cut the frequency of division.
 *
 * At comparison between 'total' and 'tmin' in the main cycle, total is
 * divided by (2.0*fnseqs2) before comparison.  If N nodes are available, 
 * that division happen (N*(N-1))/2 order.
 *
 * We thought that the comparison relation between tmin and total/(2.0*fnseqs2)
 * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total.
 * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing
 * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead.
 *
 *******************
 * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
 *
 * We transform an equation of calculating 'total' in the main cycle.
 *
 */


void fast_nj_tree(char **tree_description, FILE *tree)
{
	register int i;
	sint l[4],nude,k;
	sint nc,mini,minj,j,ii,jj;
	double fnseqs,fnseqs2=0,sumd;
	double diq,djq,dij,d2r,dr,dio,djo,da;
	double tmin,total,dmin;
	double bi,bj,b1,b2,b3,branch[4];
	sint typei,typej;             /* 0 = node; 1 = OTU */

	/* IMPROVEMENT 1, STEP 0 : declare  variables */
	double *sum_cols, *sum_rows, *join;

	/* IMPROVEMENT 2, STEP 0 : declare  variables */
	sint loop_limit;
	typedef struct _ValidNodeID {
	    sint n;
	    struct _ValidNodeID *prev;
	    struct _ValidNodeID *next;
	} ValidNodeID;
	ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;

	/*
	 * correspondence of the loop counter variables.
	 *   i .. lpi->n,	ii .. lpii->n
	 *   j .. lpj->n,	jj .. lpjj->n
	 */

	fnseqs = (double)last_seq-first_seq+1;

/*********************** First initialisation ***************************/
	
	if(verbose)  {
		fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
		fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
		fprintf(tree," The Neighbor-joining Method:");
		fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
		fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
		fprintf(tree,"\n\n This is an UNROOTED tree\n");
		fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
	}	

	if (fnseqs == 2) {
		if (verbose) fprintf(tree,"Cycle   1     =  SEQ:   1 (%9.5f) joins  SEQ:   2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
		return;
	}

	mini = minj = 0;

	left_branch 	= (double *) ckalloc( (nseqs+2) * sizeof (double)   );
	right_branch    = (double *) ckalloc( (nseqs+2) * sizeof (double)   );
	tkill 		= (sint *) ckalloc( (nseqs+1) * sizeof (sint) );
	av   		= (double *) ckalloc( (nseqs+1) * sizeof (double)   );

	/* IMPROVEMENT 1, STEP 1 : Allocate memory */
	sum_cols	= (double *) ckalloc( (nseqs+1) * sizeof (double)   );
	sum_rows	= (double *) ckalloc( (nseqs+1) * sizeof (double)   );
	join		= (double *) ckalloc( (nseqs+1) * sizeof (double)   );

	/* IMPROVEMENT 2, STEP 1 : Allocate memory */
	tvalid	= (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
	/* tvalid[0] is special entry in array. it points a header of valid entry list */
	tvalid[0].n = 0;
	tvalid[0].prev = NULL;
	tvalid[0].next = &tvalid[1];

	/* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
	for(i=1, loop_limit = last_seq-first_seq+1,
		lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
		i<=loop_limit ;
		++i, ++lpi, ++lp_prev, ++lp_next)
		{
		tmat[i][i] = av[i] = 0.0;
		tkill[i] = 0;
		lpi->n = i;
		lpi->prev = lp_prev;
		lpi->next = lp_next;

		/* IMPROVEMENT 1, STEP 2 : Initialize arrays */
		sum_cols[i] = sum_rows[i] = join[i] = 0.0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -