📄 trees.c
字号:
/*.................compute SMATij values and find the smallest one ........*/
for(jj=2; jj<=last_seq-first_seq+1; ++jj)
if(tkill[jj] != 1)
for(ii=1; ii<jj; ++ii)
if(tkill[ii] != 1) {
diq = djq = 0.0;
for(i=1; i<=last_seq-first_seq+1; ++i) {
diq = diq + tmat[i][ii];
djq = djq + tmat[i][jj];
}
dij = tmat[ii][jj];
d2r = diq + djq - (2.0*dij);
dr = sumd - dij -d2r;
fnseqs2 = fnseqs - 2.0;
total= d2r+ fnseqs2*dij +dr*2.0;
total= total / (2.0*fnseqs2);
if(total < tmin) {
tmin = total;
mini = ii;
minj = jj;
}
}
/*.................compute branch lengths and print the results ........*/
dio = djo = 0.0;
for(i=1; i<=last_seq-first_seq+1; ++i) {
dio = dio + tmat[i][mini];
djo = djo + tmat[i][minj];
}
dmin = tmat[mini][minj];
dio = (dio - dmin) / fnseqs2;
djo = (djo - dmin) / fnseqs2;
bi = (dmin + dio - djo) * 0.5;
bj = dmin - bi;
bi = bi - av[mini];
bj = bj - av[minj];
if( av[mini] > 0.0 )
typei = 0;
else
typei = 1;
if( av[minj] > 0.0 )
typej = 0;
else
typej = 1;
if(verbose)
fprintf(tree,"\n Cycle%4d = ",(pint)nc);
/*
set negative branch lengths to zero. Also set any tiny positive
branch lengths to zero.
*/ if( fabs(bi) < 0.0001) bi = 0.0;
if( fabs(bj) < 0.0001) bj = 0.0;
if(verbose) {
if(typei == 0)
fprintf(tree,"Node:%4d (%9.5f) joins ",(pint)mini,bi);
else
fprintf(tree," SEQ:%4d (%9.5f) joins ",(pint)mini,bi);
if(typej == 0)
fprintf(tree,"Node:%4d (%9.5f)",(pint)minj,bj);
else
fprintf(tree," SEQ:%4d (%9.5f)",(pint)minj,bj);
fprintf(tree,"\n");
}
left_branch[nc] = bi;
right_branch[nc] = bj;
for(i=1; i<=last_seq-first_seq+1; i++)
tree_description[nc][i] = 0;
if(typei == 0) {
for(i=nc-1; i>=1; i--)
if(tree_description[i][mini] == 1) {
for(j=1; j<=last_seq-first_seq+1; j++)
if(tree_description[i][j] == 1)
tree_description[nc][j] = 1;
break;
}
}
else
tree_description[nc][mini] = 1;
if(typej == 0) {
for(i=nc-1; i>=1; i--)
if(tree_description[i][minj] == 1) {
for(j=1; j<=last_seq-first_seq+1; j++)
if(tree_description[i][j] == 1)
tree_description[nc][j] = 1;
break;
}
}
else
tree_description[nc][minj] = 1;
/*
Here is where the -0.00005 branch lengths come from for 3 or more
identical seqs.
*/
/* if(dmin <= 0.0) dmin = 0.0001; */
if(dmin <= 0.0) dmin = 0.000001;
av[mini] = dmin * 0.5;
/*........................Re-initialisation................................*/
fnseqs = fnseqs - 1.0;
tkill[minj] = 1;
for(j=1; j<=last_seq-first_seq+1; ++j)
if( tkill[j] != 1 ) {
da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;
if( (mini - j) < 0 )
tmat[mini][j] = da;
if( (mini - j) > 0)
tmat[j][mini] = da;
}
for(j=1; j<=last_seq-first_seq+1; ++j)
tmat[minj][j] = tmat[j][minj] = 0.0;
/****/ } /**end main cycle**/
/******************************Last Cycle (3 Seqs. left)********************/
nude = 1;
for(i=1; i<=last_seq-first_seq+1; ++i)
if( tkill[i] != 1 ) {
l[nude] = i;
nude = nude + 1;
}
b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
b2 = tmat[l[1]][l[2]] - b1;
b3 = tmat[l[1]][l[3]] - b1;
branch[1] = b1 - av[l[1]];
branch[2] = b2 - av[l[2]];
branch[3] = b3 - av[l[3]];
/* Reset tiny negative and positive branch lengths to zero */
if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
left_branch[last_seq-first_seq+1-2] = branch[1];
left_branch[last_seq-first_seq+1-1] = branch[2];
left_branch[last_seq-first_seq+1] = branch[3];
for(i=1; i<=last_seq-first_seq+1; i++)
tree_description[last_seq-first_seq+1-2][i] = 0;
if(verbose)
fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",(pint)nc);
for(i=1; i<=3; ++i) {
if( av[l[i]] > 0.0) {
if(verbose)
fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",(pint)l[i],branch[i]);
for(k=last_seq-first_seq+1-3; k>=1; k--)
if(tree_description[k][l[i]] == 1) {
for(j=1; j<=last_seq-first_seq+1; j++)
if(tree_description[k][j] == 1)
tree_description[last_seq-first_seq+1-2][j] = i;
break;
}
}
else {
if(verbose)
fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",(pint)l[i],branch[i]);
tree_description[last_seq-first_seq+1-2][l[i]] = i;
}
if(i < 3) {
if(verbose)
fprintf(tree,"joins");
}
}
if(verbose)
fprintf(tree,"\n");
}
#else /* ORIGINAL_NJ_TREE */
void nj_tree(char **tree_description, FILE *tree) {
void fast_nj_tree();
/*fprintf(stderr, "****** call fast_nj_tree() !!!! ******\n");*/
fast_nj_tree(tree_description, tree);
}
/****************************************************************************
* [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
* written by Tadashi Koike
* (takoike@genes.nig.ac.jp)
*******************
* <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
* and use again and again.
*
* In the main cycle, these are calculated again and again :
* diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1),
* djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1),
* dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
* djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
* // 'last_seq' and 'first_seq' are both constant values //
* and the result of above calculations is always same until
* a best pair of neighbour nodes is joined.
*
* So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
* (n:1 to last_seq-first_seq+1)) and store it to array, before
* beginning to find a best pair of neighbour nodes, and after that
* we use them again and again.
*
* tmat[i][j]
* 1 2 3 4 5
* +---+---+---+---+---+
* 1 | | | | | |
* +---+---+---+---+---+
* 2 | | | | | | 1) calculate sum of tmat[n][i]
* +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1)
* 3 | | | | | | 2) store that sum value to sum[i]
* +---+---+---+---+---+
* 4 | | | | | | 3) use sum[i] during finding a best
* +---+---+---+---+---+ pair of neibour nodes.
* 5 | | | | | |
* +---+---+---+---+---+
* | | | | |
* V V V V V Calculate sum , and store it to sum[i]
* +---+---+---+---+---+
* sum[i] | | | | | |
* +---+---+---+---+---+
*
* At this time, we thought that we use upper triangle of the matrix
* because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal
* to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
* of sum[i] for storing the sum value.
*
* tmat[i][j]
* 1 2 3 4 5 sum_cols[i]
* +---+---+---+---+---+ +---+
* 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5]
* + - +---+---+---+---+ +---+
* 2 | # | # | # | --> | | ... sum of tmat[2][3..5]
* + - + - +---+---+---+ +---+
* 3 | # | # | --> | | ... sum of tmat[3][4..5]
* + - + - + - +---+---+ +---+
* 4 | # | --> | | ... sum of tmat[4][5]
* + - + - + - + - +---+ +---+
* 5 | --> | | ... zero
* + - + - + - + - + - + +---+
* | | | | |
* V V V V V Calculate sum , sotre to sum[i]
* +---+---+---+---+---+
* sum_rows[i] | | | | | |
* +---+---+---+---+---+
* | | | | |
* | | | | +----- sum of tmat[1..4][5]
* | | | +--------- sum of tmat[1..3][4]
* | | +------------- sum of tmat[1..2][3]
* | +----------------- sum of tmat[1][2]
* +--------------------- zero
*
* And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
*
*******************
* <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
* tkill[i] flag array.
*
* In original logic, invalid(killed?) nodes after nodes-joining
* are managed with tkill[i] flag array (set to 1 when killed).
* By this method, it is conspicuous to try next node but skip it
* at the latter of finding a best pair of neighbor nodes.
*
* So, we thought that we managed valid nodes by using a chain list
* as below:
*
* 1) declare the list structure.
* struct {
* sint n; // entry number of node.
* void *prev; // pointer to previous entry.
* void *next; // pointer to next entry.
* }
* 2) construct a valid node list.
*
* +-----+ +-----+ +-----+ +-----+ +-----+
* NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
* | 0 | | 1 | | 2 | | 3 | | n |
* | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
* +-----+ +-----+ +-----+ +-----+ +-----+
*
* 3) when finding a best pair of neighbor nodes, we use
* this chain list as loop counter.
*
* 4) If an entry was killed by node-joining, this chain list is
* modified to remove that entry.
*
* EX) remove the entry No 2.
* +-----+ +-----+ +-----+ +-----+
* NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
* | 0 | | 1 | | 3 | | n |
* | next|--->| next|-------------->| next|- - - ->| next|->NULL
* +-----+ +-----+ +-----+ +-----+
* +-----+
* NULL<-|prev |
* | 2 |
* | next|->NULL
* +-----+
*
* By this method, speed is up at the latter of finding a best pair of
* neighbor nodes.
*
*******************
* <IMPROVEMENT 3> : Cut the frequency of division.
*
* At comparison between 'total' and 'tmin' in the main cycle, total is
* divided by (2.0*fnseqs2) before comparison. If N nodes are available,
* that division happen (N*(N-1))/2 order.
*
* We thought that the comparison relation between tmin and total/(2.0*fnseqs2)
* is equal to the comparison relation between (tmin*2.0*fnseqs2) and total.
* Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing
* a total value and multiply tmin and (tmin*2.0*fnseqs2) instead.
*
*******************
* <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
*
* We transform an equation of calculating 'total' in the main cycle.
*
*/
void fast_nj_tree(char **tree_description, FILE *tree)
{
register int i;
sint l[4],nude,k;
sint nc,mini,minj,j,ii,jj;
double fnseqs,fnseqs2=0,sumd;
double diq,djq,dij,d2r,dr,dio,djo,da;
double tmin,total,dmin;
double bi,bj,b1,b2,b3,branch[4];
sint typei,typej; /* 0 = node; 1 = OTU */
/* IMPROVEMENT 1, STEP 0 : declare variables */
double *sum_cols, *sum_rows, *join;
/* IMPROVEMENT 2, STEP 0 : declare variables */
sint loop_limit;
typedef struct _ValidNodeID {
sint n;
struct _ValidNodeID *prev;
struct _ValidNodeID *next;
} ValidNodeID;
ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
/*
* correspondence of the loop counter variables.
* i .. lpi->n, ii .. lpii->n
* j .. lpj->n, jj .. lpjj->n
*/
fnseqs = (double)last_seq-first_seq+1;
/*********************** First initialisation ***************************/
if(verbose) {
fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
fprintf(tree," The Neighbor-joining Method:");
fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
fprintf(tree,"\n\n This is an UNROOTED tree\n");
fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
}
if (fnseqs == 2) {
if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
return;
}
mini = minj = 0;
left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) );
av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
/* IMPROVEMENT 1, STEP 1 : Allocate memory */
sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) );
sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) );
join = (double *) ckalloc( (nseqs+1) * sizeof (double) );
/* IMPROVEMENT 2, STEP 1 : Allocate memory */
tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
/* tvalid[0] is special entry in array. it points a header of valid entry list */
tvalid[0].n = 0;
tvalid[0].prev = NULL;
tvalid[0].next = &tvalid[1];
/* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
for(i=1, loop_limit = last_seq-first_seq+1,
lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
i<=loop_limit ;
++i, ++lpi, ++lp_prev, ++lp_next)
{
tmat[i][i] = av[i] = 0.0;
tkill[i] = 0;
lpi->n = i;
lpi->prev = lp_prev;
lpi->next = lp_next;
/* IMPROVEMENT 1, STEP 2 : Initialize arrays */
sum_cols[i] = sum_rows[i] = join[i] = 0.0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -