📄 trees.c
字号:
if(tkill[ii] != 1) { diq = djq = 0.0; for(i=1; i<=last_seq-first_seq+1; ++i) { diq = diq + tmat[i][ii]; djq = djq + tmat[i][jj]; } dij = tmat[ii][jj]; d2r = diq + djq - (2.0*dij); dr = sumd - dij -d2r; fnseqs2 = fnseqs - 2.0; total= d2r+ fnseqs2*dij +dr*2.0; total= total / (2.0*fnseqs2); if(total < tmin) { tmin = total; mini = ii; minj = jj; } } /*.................compute branch lengths and print the results ........*/ dio = djo = 0.0; for(i=1; i<=last_seq-first_seq+1; ++i) { dio = dio + tmat[i][mini]; djo = djo + tmat[i][minj]; } dmin = tmat[mini][minj]; dio = (dio - dmin) / fnseqs2; djo = (djo - dmin) / fnseqs2; bi = (dmin + dio - djo) * 0.5; bj = dmin - bi; bi = bi - av[mini]; bj = bj - av[minj]; if( av[mini] > 0.0 ) typei = 0; else typei = 1; if( av[minj] > 0.0 ) typej = 0; else typej = 1; if(verbose) fprintf(tree,"\n Cycle%4d = ",(pint)nc);/* set negative branch lengths to zero. Also set any tiny positive branch lengths to zero.*/ if( fabs(bi) < 0.0001) bi = 0.0; if( fabs(bj) < 0.0001) bj = 0.0; if(verbose) { if(typei == 0) fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi); else fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi); if(typej == 0) fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj); else fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj); fprintf(tree,"\n"); } left_branch[nc] = bi; right_branch[nc] = bj; for(i=1; i<=last_seq-first_seq+1; i++) tree_description[nc][i] = 0; if(typei == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][mini] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][mini] = 1; if(typej == 0) { for(i=nc-1; i>=1; i--) if(tree_description[i][minj] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[i][j] == 1) tree_description[nc][j] = 1; break; } } else tree_description[nc][minj] = 1; /* Here is where the -0.00005 branch lengths come from for 3 or more identical seqs.*//* if(dmin <= 0.0) dmin = 0.0001; */ if(dmin <= 0.0) dmin = 0.000001; av[mini] = dmin * 0.5;/*........................Re-initialisation................................*/ fnseqs = fnseqs - 1.0; tkill[minj] = 1; for(j=1; j<=last_seq-first_seq+1; ++j) if( tkill[j] != 1 ) { da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5; if( (mini - j) < 0 ) tmat[mini][j] = da; if( (mini - j) > 0) tmat[j][mini] = da; } for(j=1; j<=last_seq-first_seq+1; ++j) tmat[minj][j] = tmat[j][minj] = 0.0;/****/ } /**end main cycle**//******************************Last Cycle (3 Seqs. left)********************/ nude = 1; for(i=1; i<=last_seq-first_seq+1; ++i) if( tkill[i] != 1 ) { l[nude] = i; nude = nude + 1; } b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5; b2 = tmat[l[1]][l[2]] - b1; b3 = tmat[l[1]][l[3]] - b1; branch[1] = b1 - av[l[1]]; branch[2] = b2 - av[l[2]]; branch[3] = b3 - av[l[3]];/* Reset tiny negative and positive branch lengths to zero */ if( fabs(branch[1]) < 0.0001) branch[1] = 0.0; if( fabs(branch[2]) < 0.0001) branch[2] = 0.0; if( fabs(branch[3]) < 0.0001) branch[3] = 0.0; left_branch[last_seq-first_seq+1-2] = branch[1]; left_branch[last_seq-first_seq+1-1] = branch[2]; left_branch[last_seq-first_seq+1] = branch[3]; for(i=1; i<=last_seq-first_seq+1; i++) tree_description[last_seq-first_seq+1-2][i] = 0; if(verbose) fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc); for(i=1; i<=3; ++i) { if( av[l[i]] > 0.0) { if(verbose) fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]); for(k=last_seq-first_seq+1-3; k>=1; k--) if(tree_description[k][l[i]] == 1) { for(j=1; j<=last_seq-first_seq+1; j++) if(tree_description[k][j] == 1) tree_description[last_seq-first_seq+1-2][j] = i; break; } } else { if(verbose) fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]); tree_description[last_seq-first_seq+1-2][l[i]] = i; } if(i < 3) { if(verbose) fprintf(tree,"joins"); } } if(verbose) fprintf(tree,"\n");}#else /* ORIGINAL_NJ_TREE */void nj_tree(char **tree_description, FILE *tree) { void fast_nj_tree(); /*fprintf(stderr, "****** call fast_nj_tree() !!!! ******\n");*/ fast_nj_tree(tree_description, tree);}/**************************************************************************** * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited. * written by Tadashi Koike * (takoike@genes.nig.ac.jp) ******************* * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array, * and use again and again. * * In the main cycle, these are calculated again and again : * diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1), * djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1), * dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1), * djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1) * // 'last_seq' and 'first_seq' are both constant values // * and the result of above calculations is always same until * a best pair of neighbour nodes is joined. * * So, we change the logic to calculate the sum[i] (=sum of tmat[n][i] * (n:1 to last_seq-first_seq+1)) and store it to array, before * beginning to find a best pair of neighbour nodes, and after that * we use them again and again. * * tmat[i][j] * 1 2 3 4 5 * +---+---+---+---+---+ * 1 | | | | | | * +---+---+---+---+---+ * 2 | | | | | | 1) calculate sum of tmat[n][i] * +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1) * 3 | | | | | | 2) store that sum value to sum[i] * +---+---+---+---+---+ * 4 | | | | | | 3) use sum[i] during finding a best * +---+---+---+---+---+ pair of neibour nodes. * 5 | | | | | | * +---+---+---+---+---+ * | | | | | * V V V V V Calculate sum , and store it to sum[i] * +---+---+---+---+---+ * sum[i] | | | | | | * +---+---+---+---+---+ * * At this time, we thought that we use upper triangle of the matrix * because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead * of sum[i] for storing the sum value. * * tmat[i][j] * 1 2 3 4 5 sum_cols[i] * +---+---+---+---+---+ +---+ * 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5] * + - +---+---+---+---+ +---+ * 2 | # | # | # | --> | | ... sum of tmat[2][3..5] * + - + - +---+---+---+ +---+ * 3 | # | # | --> | | ... sum of tmat[3][4..5] * + - + - + - +---+---+ +---+ * 4 | # | --> | | ... sum of tmat[4][5] * + - + - + - + - +---+ +---+ * 5 | --> | | ... zero * + - + - + - + - + - + +---+ * | | | | | * V V V V V Calculate sum , sotre to sum[i] * +---+---+---+---+---+ * sum_rows[i] | | | | | | * +---+---+---+---+---+ * | | | | | * | | | | +----- sum of tmat[1..4][5] * | | | +--------- sum of tmat[1..3][4] * | | +------------- sum of tmat[1..2][3] * | +----------------- sum of tmat[1][2] * +--------------------- zero * * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i]. * ******************* * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of * tkill[i] flag array. * * In original logic, invalid(killed?) nodes after nodes-joining * are managed with tkill[i] flag array (set to 1 when killed). * By this method, it is conspicuous to try next node but skip it * at the latter of finding a best pair of neighbor nodes. * * So, we thought that we managed valid nodes by using a chain list * as below: * * 1) declare the list structure. * struct { * sint n; // entry number of node. * void *prev; // pointer to previous entry. * void *next; // pointer to next entry. * } * 2) construct a valid node list. * * +-----+ +-----+ +-----+ +-----+ +-----+ * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev | * | 0 | | 1 | | 2 | | 3 | | n | * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL * +-----+ +-----+ +-----+ +-----+ +-----+ * * 3) when finding a best pair of neighbor nodes, we use * this chain list as loop counter. * * 4) If an entry was killed by node-joining, this chain list is * modified to remove that entry. * * EX) remove the entry No 2. * +-----+ +-----+ +-----+ +-----+ * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev | * | 0 | | 1 | | 3 | | n | * | next|--->| next|-------------->| next|- - - ->| next|->NULL * +-----+ +-----+ +-----+ +-----+ * +-----+ * NULL<-|prev | * | 2 | * | next|->NULL * +-----+ * * By this method, speed is up at the latter of finding a best pair of * neighbor nodes. * ******************* * <IMPROVEMENT 3> : Cut the frequency of division. * * At comparison between 'total' and 'tmin' in the main cycle, total is * divided by (2.0*fnseqs2) before comparison. If N nodes are available, * that division happen (N*(N-1))/2 order. * * We thought that the comparison relation between tmin and total/(2.0*fnseqs2) * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total. * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead. * ******************* * <IMPROVEMENT 4> : some transformation of the equation (to cut operations). * * We transform an equation of calculating 'total' in the main cycle. * */void fast_nj_tree(char **tree_description, FILE *tree){ register int i; sint l[4],nude,k; sint nc,mini,minj,j,ii,jj; double fnseqs,fnseqs2=0,sumd; double diq,djq,dij,d2r,dr,dio,djo,da; double tmin,total,dmin; double bi,bj,b1,b2,b3,branch[4]; sint typei,typej; /* 0 = node; 1 = OTU */ /* IMPROVEMENT 1, STEP 0 : declare variables */ double *sum_cols, *sum_rows, *join; /* IMPROVEMENT 2, STEP 0 : declare variables */ sint loop_limit; typedef struct _ValidNodeID { sint n; struct _ValidNodeID *prev; struct _ValidNodeID *next; } ValidNodeID; ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next; /* * correspondence of the loop counter variables. * i .. lpi->n, ii .. lpii->n * j .. lpj->n, jj .. lpjj->n */ fnseqs = (double)last_seq-first_seq+1;/*********************** First initialisation ***************************/ if(verbose) { fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n"); fprintf(tree,"\n Saitou, N. and Nei, M. (1987)"); fprintf(tree," The Neighbor-joining Method:"); fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees."); fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n"); fprintf(tree,"\n\n This is an UNROOTED tree\n"); fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n"); } if (fnseqs == 2) { if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]); return; } mini = minj = 0; left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) ); av = (double *) ckalloc( (nseqs+1) * sizeof (double) ); /* IMPROVEMENT 1, STEP 1 : Allocate memory */ sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) ); sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) ); join = (double *) ckalloc( (nseqs+1) * sizeof (double) ); /* IMPROVEMENT 2, STEP 1 : Allocate memory */ tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) ); /* tvalid[0] is special entry in array. it points a header of valid entry list */ tvalid[0].n = 0; tvalid[0].prev = NULL; tvalid[0].next = &tvalid[1]; /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */ for(i=1, loop_limit = last_seq-first_seq+1, lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ; i<=loop_limit ; ++i, ++lpi, ++lp_prev, ++lp_next) { tmat[i][i] = av[i] = 0.0; tkill[i] = 0; lpi->n = i; lpi->prev = lp_prev; lpi->next = lp_next; /* IMPROVEMENT 1, STEP 2 : Initialize arrays */ sum_cols[i] = sum_rows[i] = join[i] = 0.0; } tvalid[loop_limit].next = NULL; /* * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that * is sequence[i] to others. */ sumd = 0.0; for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) { double tmp_sum = 0.0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -