📄 trees.c
字号:
/* Phyle of filogenetic tree calculating functions for CLUSTAL W *//* DES was here FEB. 1994 */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <math.h>#include "clustalw.h"#include "dayhoff.h" /* set correction for amino acid distances >= 75% *//* * Prototypes */Boolean transition(sint base1, sint base2);void tree_gap_delete(void);void distance_matrix_output(FILE *ofile);void nj_tree(char **tree_description, FILE *tree);void compare_tree(char **tree1, char **tree2, sint *hits, sint n);void print_phylip_tree(char **tree_description, FILE *tree, sint bootstrap);void print_nexus_tree(char **tree_description, FILE *tree, sint bootstrap);sint two_way_split(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);sint two_way_split_nexus(char **tree_description, FILE *tree, sint start_row, sint flag, sint bootstrap);void print_tree(char **tree_description, FILE *tree, sint *totals);static Boolean is_ambiguity(char c);static void overspill_message(sint overspill,sint total_dists);/* * Global variables */extern sint max_names;extern double **tmat; /* general nxn array of reals; allocated from main */ /* this is used as a distance matrix */extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/extern Boolean kimura; /* Use correction for multiple substitutions */extern Boolean output_tree_clustal; /* clustal text output for trees */extern Boolean output_tree_phylip; /* phylip nested parentheses format */extern Boolean output_tree_distances; /* phylip distance matrix */extern Boolean output_tree_nexus; /* nexus format tree */extern Boolean output_pim; /* perc identity matrix output Ramu */extern sint bootstrap_format; /* bootstrap file format */extern Boolean empty; /* any sequences in memory? */extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */extern sint nseqs;extern sint max_aln_length;extern sint *seqlen_array; /* the lengths of the sequences */extern char **seq_array; /* the sequences */extern char **names; /* the seq. names */extern char seqname[]; /* name of input file */extern sint gap_pos1,gap_pos2;extern Boolean use_ambiguities;extern char *amino_acid_codes;static double *av;static double *left_branch, *right_branch;static double *save_left_branch, *save_right_branch;static sint *boot_totals;static sint *tkill;/* The next line is a fossil from the days of using the cc ran()static int ran_factor;*/static sint *boot_positions;static FILE *phylip_phy_tree_file;static FILE *clustal_phy_tree_file;static FILE *distances_phy_tree_file;static FILE *nexus_phy_tree_file;static FILE *pim_file; /* Ramu */static Boolean verbose;static char *tree_gaps;static sint first_seq, last_seq; /* array of weights; 1 for use this posn.; 0 don't */extern sint boot_ntrials; /* number of bootstrap trials */extern unsigned sint boot_ran_seed; /* random number generator seed */void phylogenetic_tree(char *phylip_name,char *clustal_name,char *dist_name, char *nexus_name, char *pim_name)/* Calculate a tree using the distances in the nseqs*nseqs array tmat. This is the routine for getting the REAL trees after alignment.*/{ char path[FILENAMELEN+1]; sint i, j; sint overspill = 0; sint total_dists; static char **standard_tree; static char **save_tree; char lin2[10]; if(empty) { error("You must load an alignment first"); return; } if(nseqs<2) { error("Alignment has only %d sequences",nseqs); return; } first_seq=1; last_seq=nseqs; get_path(seqname,path); if(output_tree_clustal) { if (clustal_name[0]!=EOS) { if((clustal_phy_tree_file = open_explicit_file( clustal_name))==NULL) return; } else { if((clustal_phy_tree_file = open_output_file( "\nEnter name for CLUSTAL tree output file ",path, clustal_name,"nj")) == NULL) return; }}if(output_tree_phylip) { if (phylip_name[0]!=EOS) { if((phylip_phy_tree_file = open_explicit_file( phylip_name))==NULL) return; } else { if((phylip_phy_tree_file = open_output_file( "\nEnter name for PHYLIP tree output file ",path, phylip_name,"ph")) == NULL) return; }}if(output_tree_distances){ if (dist_name[0]!=EOS) { if((distances_phy_tree_file = open_explicit_file( dist_name))==NULL) return; } else { if((distances_phy_tree_file = open_output_file( "\nEnter name for distance matrix output file ",path, dist_name,"dst")) == NULL) return; }}if(output_tree_nexus){ if (nexus_name[0]!=EOS) { if((nexus_phy_tree_file = open_explicit_file( nexus_name))==NULL) return; } else { if((nexus_phy_tree_file = open_output_file( "\nEnter name for NEXUS tree output file ",path, nexus_name,"tre")) == NULL) return; }}if(output_pim){ if (pim_name[0]!=EOS) { if((pim_file = open_explicit_file( pim_name))==NULL) return; } else { if((pim_file = open_output_file( "\nEnter name for % Identity matrix output file ",path, pim_name,"pim")) == NULL) return; }} boot_positions = (sint *)ckalloc( (seqlen_array[first_seq]+2) * sizeof (sint) ); for(j=1; j<=seqlen_array[first_seq]; ++j) boot_positions[j] = j; if(output_tree_clustal) { verbose = TRUE; /* Turn on file output */ if(dnaflag) overspill = dna_distance_matrix(clustal_phy_tree_file); else overspill = prot_distance_matrix(clustal_phy_tree_file); } if(output_tree_phylip) { verbose = FALSE; /* Turn off file output */ if(dnaflag) overspill = dna_distance_matrix(phylip_phy_tree_file); else overspill = prot_distance_matrix(phylip_phy_tree_file); } if(output_tree_nexus) { verbose = FALSE; /* Turn off file output */ if(dnaflag) overspill = dna_distance_matrix(nexus_phy_tree_file); else overspill = prot_distance_matrix(nexus_phy_tree_file); } if(output_pim) { /* Ramu */ verbose = FALSE; /* Turn off file output */ if(dnaflag) calc_percidentity(pim_file); else calc_percidentity(pim_file); } if(output_tree_distances) { verbose = FALSE; /* Turn off file output */ if(dnaflag) overspill = dna_distance_matrix(distances_phy_tree_file); else overspill = prot_distance_matrix(distances_phy_tree_file); distance_matrix_output(distances_phy_tree_file); }/* check if any distances overflowed the distance corrections */ if ( overspill > 0 ) { total_dists = (nseqs*(nseqs-1))/2; overspill_message(overspill,total_dists); } if(output_tree_clustal) verbose = TRUE; /* Turn on file output */ standard_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) ); for(i=0; i<nseqs+1; i++) standard_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) ); save_tree = (char **) ckalloc( (nseqs+1) * sizeof (char *) ); for(i=0; i<nseqs+1; i++) save_tree[i] = (char *) ckalloc( (nseqs+1) * sizeof(char) ); if(output_tree_clustal || output_tree_phylip || output_tree_nexus) nj_tree(standard_tree,clustal_phy_tree_file); for(i=1; i<nseqs+1; i++) for(j=1; j<nseqs+1; j++) save_tree[i][j] = standard_tree[i][j]; if(output_tree_phylip) print_phylip_tree(standard_tree,phylip_phy_tree_file,0); for(i=1; i<nseqs+1; i++) for(j=1; j<nseqs+1; j++) standard_tree[i][j] = save_tree[i][j]; if(output_tree_nexus) print_nexus_tree(standard_tree,nexus_phy_tree_file,0);/* print_tree(standard_tree,phy_tree_file);*/ tree_gaps=ckfree((void *)tree_gaps); boot_positions=ckfree((void *)boot_positions); if (left_branch != NULL) left_branch=ckfree((void *)left_branch); if (right_branch != NULL) right_branch=ckfree((void *)right_branch); if (tkill != NULL) tkill=ckfree((void *)tkill); if (av != NULL) av=ckfree((void *)av); for (i=0;i<nseqs+1;i++) standard_tree[i]=ckfree((void *)standard_tree[i]); standard_tree=ckfree((void *)standard_tree); for (i=0;i<nseqs+1;i++) save_tree[i]=ckfree((void *)save_tree[i]); save_tree=ckfree((void *)save_tree);if(output_tree_clustal) { fclose(clustal_phy_tree_file); info("Phylogenetic tree file created: [%s]",clustal_name);}if(output_tree_phylip) { fclose(phylip_phy_tree_file); info("Phylogenetic tree file created: [%s]",phylip_name);}if(output_tree_distances) { fclose(distances_phy_tree_file); info("Distance matrix file created: [%s]",dist_name);}if(output_tree_nexus) { fclose(nexus_phy_tree_file); info("Nexus tree file created: [%s]",nexus_name);}if(output_pim) { fclose(pim_file); info(" perc identity matrix file created: [%s]",pim_name);}}static void overspill_message(sint overspill,sint total_dists){ char err_mess[1024]=""; sprintf(err_mess,"%d of the distances out of a total of %d", (pint)overspill,(pint)total_dists); strcat(err_mess,"\n were out of range for the distance correction."); strcat(err_mess,"\n"); strcat(err_mess,"\n SUGGESTIONS: 1) remove the most distant sequences"); strcat(err_mess,"\n or 2) use the PHYLIP package"); strcat(err_mess,"\n or 3) turn off the correction."); strcat(err_mess,"\n Note: Use option 3 with caution! With this degree"); strcat(err_mess,"\n of divergence you will have great difficulty"); strcat(err_mess,"\n getting robust and reliable trees."); strcat(err_mess,"\n\n"); warning(err_mess);}Boolean transition(sint base1, sint base2) /* TRUE if transition; else FALSE *//* assumes that the bases of DNA sequences have been translated as a,A = 0; c,C = 1; g,G = 2; t,T,u,U = 3; N = 4; a,A = 0; c,C = 2; g,G = 6; t,T,u,U =17; A <--> G and T <--> C are transitions; all others are transversions.*/{ if( ((base1 == 0) && (base2 == 6)) || ((base1 == 6) && (base2 == 0)) ) return TRUE; /* A <--> G */ if( ((base1 ==17) && (base2 == 2)) || ((base1 == 2) && (base2 ==17)) ) return TRUE; /* T <--> C */ return FALSE;}void tree_gap_delete(void) /* flag all positions in alignment that have a gap */{ /* in ANY sequence */ sint seqn; sint posn; tree_gaps = (char *)ckalloc( (max_aln_length+1) * sizeof (char) ); for(posn=1; posn<=seqlen_array[first_seq]; ++posn) { tree_gaps[posn] = 0; for(seqn=1; seqn<=last_seq-first_seq+1; ++seqn) { if((seq_array[seqn+first_seq-1][posn] == gap_pos1) || (seq_array[seqn+first_seq-1][posn] == gap_pos2)) { tree_gaps[posn] = 1; break; } } }}void distance_matrix_output(FILE *ofile){ sint i,j; fprintf(ofile,"%6d",(pint)last_seq-first_seq+1); for(i=1;i<=last_seq-first_seq+1;i++) { fprintf(ofile,"\n%-*s ",max_names,names[i]); for(j=1;j<=last_seq-first_seq+1;j++) { fprintf(ofile,"%6.3f ",tmat[i][j]); if(j % 8 == 0) { if(j!=last_seq-first_seq+1) fprintf(ofile,"\n"); if(j != last_seq-first_seq+1 ) fprintf(ofile," "); } } }}#ifdef ORIGINAL_NJ_TREEvoid nj_tree(char **tree_description, FILE *tree){ register int i; sint l[4],nude,k; sint nc,mini,minj,j,ii,jj; double fnseqs,fnseqs2=0,sumd; double diq,djq,dij,d2r,dr,dio,djo,da; double tmin,total,dmin; double bi,bj,b1,b2,b3,branch[4]; sint typei,typej; /* 0 = node; 1 = OTU */ fnseqs = (double)last_seq-first_seq+1;/*********************** First initialisation ***************************/ if(verbose) { fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n"); fprintf(tree,"\n Saitou, N. and Nei, M. (1987)"); fprintf(tree," The Neighbor-joining Method:"); fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees."); fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n"); fprintf(tree,"\n\n This is an UNROOTED tree\n"); fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n"); } if (fnseqs == 2) { if (verbose) fprintf(tree,"Cycle 1 = SEQ: 1 (%9.5f) joins SEQ: 2 (%9.5f)",tmat[first_seq][first_seq+1],tmat[first_seq][first_seq+1]); return; } mini = minj = 0; left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) ); tkill = (sint *) ckalloc( (nseqs+1) * sizeof (sint) ); av = (double *) ckalloc( (nseqs+1) * sizeof (double) ); for(i=1;i<=last_seq-first_seq+1;++i) { tmat[i][i] = av[i] = 0.0; tkill[i] = 0; }/*********************** Enter The Main Cycle ***************************/ /* for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { */ /**start main cycle**/ for(nc=1; nc<=(last_seq-first_seq+1-3); ++nc) { sumd = 0.0; for(j=2; j<=last_seq-first_seq+1; ++j) for(i=1; i<j; ++i) { tmat[j][i] = tmat[i][j]; sumd = sumd + tmat[i][j]; } tmin = 99999.0;/*.................compute SMATij values and find the smallest one ........*/ for(jj=2; jj<=last_seq-first_seq+1; ++jj) if(tkill[jj] != 1) for(ii=1; ii<jj; ++ii)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -