📄 trees.c
字号:
/* Phyle of filogenetic tree calculating functions for CLUSTAL W */
/* DES was here FEB. 1994 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include "clustalw.h"
#include "dayhoff.h" /* set correction for amino acid distances >= 75% */
/*
* Prototypes
*/
Boolean transition(sint base1, sint base2);
void tree_gap_delete(void);
void distance_matrix_output(FILE *ofile);
void nj_tree(char **tree_description, FILE *tree);
void compare_tree(char **tree1, char **tree2, sint *hits, sint n);
void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap);
void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap);
sint two_way_split(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);
sint two_way_split_nexus(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);
void print_tree(char **tree_description, FILE *tree, sint *totals);
static Boolean is_ambiguity(char c);
static void overspill_message(sint overspill,sint total_dists);
/*
* Global variables
*/
extern sint max_names;
extern double **tmat; /* general nxn array of reals; allocated from main */
/* this is used as a distance matrix */
extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */
extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/
extern Boolean kimura; /* Use correction for multiple substitutions */
extern Boolean output_tree_clustal; /* clustal text output for trees */
extern Boolean output_tree_phylip; /* phylip nested parentheses format */
extern Boolean output_tree_distances; /* phylip distance matrix */
extern Boolean output_tree_nexus; /* nexus format tree */
extern Boolean output_pim; /* perc identity matrix output Ramu */
extern sint bootstrap_format; /* bootstrap file format */
extern Boolean empty; /* any sequences in memory? */
extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */
extern sint nseqs;
extern sint max_aln_length;
extern sint *seqlen_array; /* the lengths of the sequences */
extern char **seq_array; /* the sequences */
extern char **names; /* the seq. names */
extern char seqname[]; /* name of input file */
extern sint gap_pos1,gap_pos2;
extern Boolean use_ambiguities;
extern char *amino_acid_codes;
static double *av;
static double *left_branch, *right_branch;
static double *save_left_branch, *save_right_branch;
static sint *boot_totals;
static sint *tkill;
/*
The next line is a fossil from the days of using the cc ran()
static int ran_factor;
*/
static sint *boot_positions;
static FILE *phylip_phy_tree_file;
static FILE *clustal_phy_tree_file;
static FILE *distances_phy_tree_file;
static FILE *nexus_phy_tree_file;
static FILE *pim_file; /* Ramu */
static Boolean verbose;
static char *tree_gaps;
static sint first_seq, last_seq;
/* array of weights; 1 for use this posn.; 0 don't */
extern sint boot_ntrials; /* number of bootstrap trials */
extern unsigned sint boot_ran_seed; /* random number generator seed */
void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name)
/*
Calculate a tree using the distances in the nseqs*nseqs array tmat.
This is the routine for getting the REAL trees after alignment.
*/
{ char path[FILENAMELEN+1];
sint i, j;
sint overspill = 0;
sint total_dists;
static char **standard_tree;
static char **save_tree;
char lin2[10];
if(empty) {
error("You must load an alignment first");
return;
}
if(nseqs<2) {
error("Alignment has only %d sequences",nseqs);
return;
}
first_seq=1;
last_seq=nseqs;
get_path(seqname,path);
if(output_tree_clustal) {
if (clustal_name[0]!=EOS) {
if((clustal_phy_tree_file = open_explicit_file(
clustal_name))==NULL) return;
}
else {
if((clustal_phy_tree_file = open_output_file(
"\nEnter name for CLUSTAL tree output file ",path,
clustal_name,"nj")) == NULL) return;
}
}
if(output_tree_phylip) {
if (phylip_name[0]!=EOS) {
if((phylip_phy_tree_file = open_explicit_file(
phylip_name))==NULL) return;
}
else {
if((phylip_phy_tree_file = open_output_file(
"\nEnter name for PHYLIP tree output file ",path,
phylip_name,"ph")) == NULL) return;
}
}
if(output_tree_distances)
{
if (dist_name[0]!=EOS) {
if((distances_phy_tree_file = open_explicit_file(
dist_name))==NULL) return;
}
else {
if((distances_phy_tree_file = open_output_file(
"\nEnter name for distance matrix output file ",path,
dist_name,"dst")) == NULL) return;
}
}
if(output_tree_nexus)
{
if (nexus_name[0]!=EOS) {
if((nexus_phy_tree_file = open_explicit_file(
nexus_name))==NULL) return;
}
else {
if((nexus_phy_tree_file = open_output_file(
"\nEnter name for NEXUS tree output file ",path,
nexus_name,"tre")) == NULL) return;
}
}
if(output_pim)
{
if (pim_name[0]!=EOS) {
if((pim_file = open_explicit_file(
pim_name))==NULL) return;
}
else {
if((pim_file = open_output_file(
"\nEnter name for % Identity matrix output file ",path,
pim_name,"pim")) == NULL) return;
}
}
boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) );
for(j=1; j<=seqlen_array[first_seq]; ++j)
boot_positions[j] = j;
if(output_tree_clustal) {
verbose = TRUE; /* Turn on file output */
if(dnaflag)
overspill = dna_distance_matrix(clustal_phy_tree_file);
else
overspill = prot_distance_matrix(clustal_phy_tree_file);
}
if(output_tree_phylip) {
verbose = FALSE; /* Turn off file output */
if(dnaflag)
overspill = dna_distance_matrix(phylip_phy_tree_file);
else
overspill = prot_distance_matrix(phylip_phy_tree_file);
}
if(output_tree_nexus) {
verbose = FALSE; /* Turn off file output */
if(dnaflag)
overspill = dna_distance_matrix(nexus_phy_tree_file);
else
overspill = prot_distance_matrix(nexus_phy_tree_file);
}
if(output_pim) { /* Ramu */
verbose = FALSE; /* Turn off file output */
if(dnaflag)
calc_percidentity(pim_file);
else
calc_percidentity(pim_file);
}
if(output_tree_distances) {
verbose = FALSE; /* Turn off file output */
if(dnaflag)
overspill = dna_distance_matrix(distances_phy_tree_file);
else
overspill = prot_distance_matrix(distances_phy_tree_file);
distance_matrix_output(distances_phy_tree_file);
}
/* check if any distances overflowed the distance corrections */
if ( overspill > 0 ) {
total_dists = (nseqs*(nseqs-1))/2;
overspill_message(overspill,total_dists);
}
if(output_tree_clustal) verbose = TRUE; /* Turn on file output */
standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
for(i=0; i<nseqs+1; i++)
standard_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
save_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) );
for(i=0; i<nseqs+1; i++)
save_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) );
if(output_tree_clustal || output_tree_phylip || output_tree_nexus)
nj_tree(standard_tree,clustal_phy_tree_file);
for(i=1; i<nseqs+1; i++)
for(j=1; j<nseqs+1; j++)
save_tree[i][j] = standard_tree[i][j];
if(output_tree_phylip)
print_phylip_tree(standard_tree,phylip_phy_tree_file,0);
for(i=1; i<nseqs+1; i++)
for(j=1; j<nseqs+1; j++)
standard_tree[i][j] = save_tree[i][j];
if(output_tree_nexus)
print_nexus_tree(standard_tree,nexus_phy_tree_file,0);
/*
print_tree(standard_tree,phy_tree_file);
*/
tree_gaps=ckfree((void *)tree_gaps);
boot_positions=ckfree((void *)boot_positions);
if (left_branch != NULL) left_branch=ckfree((void *)left_branch);
if (right_branch != NULL) right_branch=ckfree((void *)right_branch);
if (tkill != NULL) tkill=ckfree((void *)tkill);
if (av != NULL) av=ckfree((void *)av);
for (i=0;i<nseqs+1;i++)
standard_tree[i]=ckfree((void *)standard_tree[i]);
standard_tree=ckfree((void *)standard_tree);
for (i=0;i<nseqs+1;i++)
save_tree[i]=ckfree((void *)save_tree[i]);
save_tree=ckfree((void *)save_tree);
if(output_tree_clustal) {
fclose(clustal_phy_tree_file);
info("Phylogenetic tree file created: [%s]",clustal_name);
}
if(output_tree_phylip) {
fclose(phylip_phy_tree_file);
info("Phylogenetic tree file created: [%s]",phylip_name);
}
if(output_tree_distances) {
fclose(distances_phy_tree_file);
info("Distance matrix file created: [%s]",dist_name);
}
if(output_tree_nexus) {
fclose(nexus_phy_tree_file);
info("Nexus tree file created: [%s]",nexus_name);
}
if(output_pim) {
fclose(pim_file);
info(" perc identity matrix file created: [%s]",pim_name);
}
}
static void overspill_message(sint overspill,sint total_dists)
{
char err_mess[1024]="";
sprintf(err_mess,"%d of the distances out of a total of %d",
(pint)overspill,(pint)total_dists);
strcat(err_mess,"\n were out of range for the distance correction.");
strcat(err_mess,"\n");
strcat(err_mess,"\n SUGGESTIONS: 1) remove the most distant sequences");
strcat(err_mess,"\n or 2) use the PHYLIP package");
strcat(err_mess,"\n or 3) turn off the correction.");
strcat(err_mess,"\n Note: Use option 3 with caution! With this degree");
strcat(err_mess,"\n of divergence you will have great difficulty");
strcat(err_mess,"\n getting robust and reliable trees.");
strcat(err_mess,"\n\n");
warning(err_mess);
}
Boolean transition(sint base1, sint base2) /* TRUE if transition; else FALSE */
/*
assumes that the bases of DNA sequences have been translated as
a,A = 0; c,C = 1; g,G = 2; t,T,u,U = 3; N = 4;
a,A = 0; c,C = 2; g,G = 6; t,T,u,U =17;
A <--> G and T <--> C are transitions; all others are transversions.
*/
{
if( ((base1 == 0) && (base2 == 6)) || ((base1 == 6) && (base2 == 0)) )
return TRUE; /* A <--> G */
if( ((base1 ==17) && (base2 == 2)) || ((base1 == 2) && (base2 ==17)) )
return TRUE; /* T <--> C */
return FALSE;
}
void tree_gap_delete(void) /* flag all positions in alignment that have a gap */
{ /* in ANY sequence */
sint seqn;
sint posn;
tree_gaps = (char *)ckalloc( (max_aln_length+1) * sizeof (char) );
for(posn=1; posn<=seqlen_array[first_seq]; ++posn) {
tree_gaps[posn] = 0;
for(seqn=1; seqn<=last_seq-first_seq+1; ++seqn) {
if((seq_array[seqn+first_seq-1][posn] == gap_pos1) ||
(seq_array[seqn+first_seq-1][posn] == gap_pos2)) {
tree_gaps[posn] = 1;
break;
}
}
}
}
void distance_matrix_output(FILE *ofile)
{
sint i,j;
fprintf(ofile,"%6d",(pint)last_seq-first_seq+1);
for(i=1;i<=last_seq-first_seq+1;i++) {
fprintf(ofile,"\n%-*s ",max_names,names[i]);
for(j=1;j<=last_seq-first_seq+1;j++) {
fprintf(ofile,"%6.3f ",tmat[i][j]);
if(j % 8 == 0) {
if(j!=last_seq-first_seq+1) fprintf(ofile,"\n");
if(j != last_seq-first_seq+1 ) fprintf(ofile," ");
}
}
}
}
#ifdef ORIGINAL_NJ_TREE
void nj_tree(char **tree_description, FILE *tree)
{
register int i;
sint l[4],nude,k;
sint nc,mini,minj,j,ii,jj;
double fnseqs,fnseqs2=0,sumd;
double diq,djq,dij,d2r,dr,dio,djo,da;
double tmin,total,dmin;
double bi,bj,b1,b2,b3,branch[4];
sint typei,typej; /* 0 = node; 1 = OTU */
fnseqs = (double)last_seq-first_seq+1;
/*********************** First initialisation ***************************/
if(verbose) {
fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n");
fprintf(tree,"\n Saitou, N. and Nei, M. (1987)");
fprintf(tree," The Neighbor-joining Method:");
fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees.");
fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n");
fprintf(tree,"\n\n This is an UNROOTED tree\n");
fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n");
}
if (fnseqs == 2) {
if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]);
return;
}
mini = minj = 0;
left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) );
av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
for(i=1;i<=last_seq-first_seq+1;++i)
{
tmat[i][i] = av[i] = 0.0;
tkill[i] = 0;
}
/*********************** Enter The Main Cycle ***************************/
/* for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { */ /**start main cycle**/
for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) {
sumd = 0.0;
for(j=2; j<=last_seq-first_seq+1; ++j)
for(i=1; i<j; ++i) {
tmat[j][i] = tmat[i][j];
sumd = sumd + tmat[i][j];
}
tmin = 99999.0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -