⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trees.c

📁 clustalx用来做基因序列分析
💻 C
📖 第 1 页 / 共 5 页
字号:
					if(tkill[ii] != 1) {						diq = djq = 0.0;						for(i=1; i<=last_seq-first_seq+1; ++i) {							diq = diq + tmat[i][ii];							djq = djq + tmat[i][jj];						}						dij = tmat[ii][jj];						d2r = diq + djq - (2.0*dij);						dr  = sumd - dij -d2r;						fnseqs2 = fnseqs - 2.0;					        total= d2r+ fnseqs2*dij +dr*2.0;						total= total / (2.0*fnseqs2);						if(total < tmin) {							tmin = total;							mini = ii;							minj = jj;						}					}		/*.................compute branch lengths and print the results ........*/		dio = djo = 0.0;		for(i=1; i<=last_seq-first_seq+1; ++i) {			dio = dio + tmat[i][mini];			djo = djo + tmat[i][minj];		}		dmin = tmat[mini][minj];		dio = (dio - dmin) / fnseqs2;		djo = (djo - dmin) / fnseqs2;		bi = (dmin + dio - djo) * 0.5;		bj = dmin - bi;		bi = bi - av[mini];		bj = bj - av[minj];		if( av[mini] > 0.0 )			typei = 0;		else			typei = 1;		if( av[minj] > 0.0 )			typej = 0;		else			typej = 1;		if(verbose) 	 	    fprintf(tree,"\n Cycle%4d     = ",(pint)nc);/*    set negative branch lengths to zero.  Also set any tiny positive   branch lengths to zero.*/		if( fabs(bi) < 0.0001) bi = 0.0;		if( fabs(bj) < 0.0001) bj = 0.0;	    	if(verbose) {		    if(typei == 0) 			fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);		    else 			fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);		    if(typej == 0) 			fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);		    else 			fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);		    fprintf(tree,"\n");	    	}		    	left_branch[nc] = bi;	    	right_branch[nc] = bj;		for(i=1; i<=last_seq-first_seq+1; i++)			tree_description[nc][i] = 0;	     	if(typei == 0) { 			for(i=nc-1; i>=1; i--)				if(tree_description[i][mini] == 1) {					for(j=1; j<=last_seq-first_seq+1; j++)  					     if(tree_description[i][j] == 1)						    tree_description[nc][j] = 1;					break;				}		}		else			tree_description[nc][mini] = 1;		if(typej == 0) {			for(i=nc-1; i>=1; i--) 				if(tree_description[i][minj] == 1) {					for(j=1; j<=last_seq-first_seq+1; j++)  					     if(tree_description[i][j] == 1)						    tree_description[nc][j] = 1;					break;				}		}		else			tree_description[nc][minj] = 1;			/*    Here is where the -0.00005 branch lengths come from for 3 or more   identical seqs.*//*		if(dmin <= 0.0) dmin = 0.0001; */                if(dmin <= 0.0) dmin = 0.000001;		av[mini] = dmin * 0.5;/*........................Re-initialisation................................*/		fnseqs = fnseqs - 1.0;		tkill[minj] = 1;		for(j=1; j<=last_seq-first_seq+1; ++j) 			if( tkill[j] != 1 ) {				da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;				if( (mini - j) < 0 ) 					tmat[mini][j] = da;				if( (mini - j) > 0)					tmat[j][mini] = da;			}		for(j=1; j<=last_seq-first_seq+1; ++j)			tmat[minj][j] = tmat[j][minj] = 0.0;/****/	}						/**end main cycle**//******************************Last Cycle (3 Seqs. left)********************/	nude = 1;	for(i=1; i<=last_seq-first_seq+1; ++i)		if( tkill[i] != 1 ) {			l[nude] = i;			nude = nude + 1;		}	b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;	b2 =  tmat[l[1]][l[2]] - b1;	b3 =  tmat[l[1]][l[3]] - b1; 	branch[1] = b1 - av[l[1]];	branch[2] = b2 - av[l[2]];	branch[3] = b3 - av[l[3]];/* Reset tiny negative and positive branch lengths to zero */	if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;	if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;	if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;	left_branch[last_seq-first_seq+1-2] = branch[1];	left_branch[last_seq-first_seq+1-1] = branch[2];	left_branch[last_seq-first_seq+1]   = branch[3];	for(i=1; i<=last_seq-first_seq+1; i++)		tree_description[last_seq-first_seq+1-2][i] = 0;	if(verbose)		fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);	for(i=1; i<=3; ++i) {	   if( av[l[i]] > 0.0) {	      	if(verbose)	      	    fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);		for(k=last_seq-first_seq+1-3; k>=1; k--)			if(tree_description[k][l[i]] == 1) {				for(j=1; j<=last_seq-first_seq+1; j++)				 	if(tree_description[k][j] == 1)					    tree_description[last_seq-first_seq+1-2][j] = i;				break;			}	   }	   else  {	      	if(verbose)	   	    fprintf(tree,"\n\t\t  SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);		tree_description[last_seq-first_seq+1-2][l[i]] = i;	   }	   if(i < 3) {	      	if(verbose)	            fprintf(tree,"joins");	   }	}	if(verbose)		fprintf(tree,"\n");}#else /* ORIGINAL_NJ_TREE */void nj_tree(char **tree_description, FILE *tree) {	void fast_nj_tree(); 	/*fprintf(stderr, "****** call fast_nj_tree() !!!! ******\n");*/	fast_nj_tree(tree_description, tree);}/**************************************************************************** * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited. *						written by Tadashi Koike *						(takoike@genes.nig.ac.jp) ******************* * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array, *                   and use again and again. * *	In the main cycle, these are calculated again and again : *	    diq = sum of tmat[n][ii]   (n:1 to last_seq-first_seq+1), *	    djq = sum of tmat[n][jj]   (n:1 to last_seq-first_seq+1), *	    dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1), *	    djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1) *		// 'last_seq' and 'first_seq' are both constant values // *	and the result of above calculations is always same until  *	a best pair of neighbour nodes is joined. * *	So, we change the logic to calculate the sum[i] (=sum of tmat[n][i] *	(n:1 to last_seq-first_seq+1)) and store it to array, before *	beginning to find a best pair of neighbour nodes, and after that  *	we use them again and again. * *	    tmat[i][j] *	              1   2   3   4   5 *	            +---+---+---+---+---+ *	          1 |   |   |   |   |   | *	            +---+---+---+---+---+ *	          2 |   |   |   |   |   |  1) calculate sum of tmat[n][i] *	            +---+---+---+---+---+        (n: 1 to last_seq-first_seq+1) *	          3 |   |   |   |   |   |  2) store that sum value to sum[i] *	            +---+---+---+---+---+ *	          4 |   |   |   |   |   |  3) use sum[i] during finding a best *	            +---+---+---+---+---+     pair of neibour nodes. *	          5 |   |   |   |   |   | *	            +---+---+---+---+---+ *	              |   |   |   |   | *	              V   V   V   V   V  Calculate sum , and store it to sum[i] *	            +---+---+---+---+---+ *	     sum[i] |   |   |   |   |   | *	            +---+---+---+---+---+ * *	At this time, we thought that we use upper triangle of the matrix *	because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal  *	to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead  *	of sum[i] for storing the sum value. * *	    tmat[i][j] *	              1   2   3   4   5     sum_cols[i] *	            +---+---+---+---+---+     +---+ *	          1     | # | # | # | # | --> |   | ... sum of tmat[1][2..5] *	            + - +---+---+---+---+     +---+ *	          2         | # | # | # | --> |   | ... sum of tmat[2][3..5] *	            + - + - +---+---+---+     +---+ *	          3             | # | # | --> |   | ... sum of tmat[3][4..5] *	            + - + - + - +---+---+     +---+ *	          4                 | # | --> |   | ... sum of tmat[4][5] *	            + - + - + - + - +---+     +---+ *	          5                     | --> |   | ... zero *	            + - + - + - + - + - +     +---+ *	              |   |   |   |   | *	              V   V   V   V   V  Calculate sum , sotre to sum[i] *	            +---+---+---+---+---+ *	sum_rows[i] |   |   |   |   |   | *	            +---+---+---+---+---+ *	              |   |   |   |   | *	              |   |   |   |   +----- sum of tmat[1..4][5] *	              |   |   |   +--------- sum of tmat[1..3][4] *	              |   |   +------------- sum of tmat[1..2][3] *	              |   +----------------- sum of tmat[1][2] *	              +--------------------- zero * *	And we use (sum_rows[i] + sum_cols[i]) instead of sum[i]. * ******************* * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of *                   tkill[i] flag array. * *	In original logic, invalid(killed?) nodes after nodes-joining *	are managed with tkill[i] flag array (set to 1 when killed). *	By this method, it is conspicuous to try next node but skip it *	at the latter of finding a best pair of neighbor nodes. * *	So, we thought that we managed valid nodes by using a chain list  *	as below: * *	1) declare the list structure. *		struct { *		    sint n;		// entry number of node. *		    void *prev;		// pointer to previous entry. *		    void *next;		// pointer to next entry. *		} *	2) construct a valid node list. * *       +-----+    +-----+    +-----+    +-----+        +-----+ * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev | *       |  0  |    |  1  |    |  2  |    |  3  |        |  n  | *       | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL *       +-----+    +-----+    +-----+    +-----+        +-----+ * *	3) when finding a best pair of neighbor nodes, we use *	   this chain list as loop counter. * *	4) If an entry was killed by node-joining, this chain list is *	   modified to remove that entry. * *	   EX) remove the entry No 2. *       +-----+    +-----+               +-----+        +-----+ * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev | *       |  0  |    |  1  |               |  3  |        |  n  | *       | next|--->| next|-------------->| next|- - - ->| next|->NULL *       +-----+    +-----+               +-----+        +-----+ *                             +-----+ *                       NULL<-|prev | *                             |  2  | *                             | next|->NULL *                             +-----+ * *	By this method, speed is up at the latter of finding a best pair of *	neighbor nodes. * ******************* * <IMPROVEMENT 3> : Cut the frequency of division. * * At comparison between 'total' and 'tmin' in the main cycle, total is * divided by (2.0*fnseqs2) before comparison.  If N nodes are available,  * that division happen (N*(N-1))/2 order. * * We thought that the comparison relation between tmin and total/(2.0*fnseqs2) * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total. * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead. * ******************* * <IMPROVEMENT 4> : some transformation of the equation (to cut operations). * * We transform an equation of calculating 'total' in the main cycle. * */void fast_nj_tree(char **tree_description, FILE *tree){	register int i;	sint l[4],nude,k;	sint nc,mini,minj,j,ii,jj;	double fnseqs,fnseqs2=0,sumd;	double diq,djq,dij,d2r,dr,dio,djo,da;	double tmin,total,dmin;	double bi,bj,b1,b2,b3,branch[4];	sint typei,typej;             /* 0 = node; 1 = OTU */	/* IMPROVEMENT 1, STEP 0 : declare  variables */	double *sum_cols, *sum_rows, *join;	/* IMPROVEMENT 2, STEP 0 : declare  variables */	sint loop_limit;	typedef struct _ValidNodeID {	    sint n;	    struct _ValidNodeID *prev;	    struct _ValidNodeID *next;	} ValidNodeID;	ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;	/*	 * correspondence of the loop counter variables.	 *   i .. lpi->n,	ii .. lpii->n	 *   j .. lpj->n,	jj .. lpjj->n	 */	fnseqs = (double)last_seq-first_seq+1;/*********************** First initialisation ***************************/		if(verbose)  {		fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");		fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");		fprintf(tree," The Neighbor-joining Method:");		fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");		fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");		fprintf(tree,"\n\n This is an UNROOTED tree\n");		fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");	}		if (fnseqs == 2) {		if (verbose) fprintf(tree,"Cycle   1     =  SEQ:   1 (%9.5f) joins  SEQ:   2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);		return;	}	mini = minj = 0;	left_branch 	= (double *) ckalloc( (nseqs+2) * sizeof (double)   );	right_branch    = (double *) ckalloc( (nseqs+2) * sizeof (double)   );	tkill 		= (sint *) ckalloc( (nseqs+1) * sizeof (sint) );	av   		= (double *) ckalloc( (nseqs+1) * sizeof (double)   );	/* IMPROVEMENT 1, STEP 1 : Allocate memory */	sum_cols	= (double *) ckalloc( (nseqs+1) * sizeof (double)   );	sum_rows	= (double *) ckalloc( (nseqs+1) * sizeof (double)   );	join		= (double *) ckalloc( (nseqs+1) * sizeof (double)   );	/* IMPROVEMENT 2, STEP 1 : Allocate memory */	tvalid	= (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );	/* tvalid[0] is special entry in array. it points a header of valid entry list */	tvalid[0].n = 0;	tvalid[0].prev = NULL;	tvalid[0].next = &tvalid[1];	/* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */	for(i=1, loop_limit = last_seq-first_seq+1,		lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;		i<=loop_limit ;		++i, ++lpi, ++lp_prev, ++lp_next)		{		tmat[i][i] = av[i] = 0.0;		tkill[i] = 0;		lpi->n = i;		lpi->prev = lp_prev;		lpi->next = lp_next;		/* IMPROVEMENT 1, STEP 2 : Initialize arrays */		sum_cols[i] = sum_rows[i] = join[i] = 0.0;		}	tvalid[loop_limit].next = NULL;	/*	 * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that 	 * is sequence[i] to others.	 */	sumd = 0.0;	for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {		double tmp_sum = 0.0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -