📄 treesub.c
字号:
/* TREESUB.c
subroutines that operates on trees, inserted into other programs
such as baseml, basemlg, codeml, and pamp.
*/
extern char BASEs[], nBASEs[], *EquateNUC[], AAs[], BINs[];
extern int noisy;
#ifdef BASEML
#define EIGEN
#define REALSEQUENCE
#define NODESTRUCTURE
#define TREESEARCH
#define LSDISTANCE
#define LFUNCTIONS
#define RECONSTRUCTION
#endif
#ifdef CODEML
#define EIGEN
#define REALSEQUENCE
#define NODESTRUCTURE
#define TREESEARCH
#define LSDISTANCE
#define LFUNCTIONS
#define RECONSTRUCTION
#endif
#ifdef BASEMLG
#define REALSEQUENCE
#define NODESTRUCTURE
#define LSDISTANCE
#endif
#ifdef RECONSTRUCTION
#define PARSIMONY
#endif
#ifdef EVOLVE
#define BIRTHDEATH
#endif
#define EqPartition(p1,p2,ns) (p1==p2||p1+p2+1==(1<<ns))
#ifdef REALSEQUENCE
int PopEmptyLines (FILE* fseq, int lline, char line[])
{
/* pop out empty lines in the sequence data file.
returns -1 if EOF.
*/
char *eqdel=".-?", *p;
int i;
for (i=0; ;i++) {
p=fgets (line, lline, fseq);
if (p==NULL) return(-1);
while (*p)
if (*p==eqdel[0] || *p==eqdel[1] || *p==eqdel[2] || isalpha(*p))
return(0);
else p++;
}
}
int hasbase (char *str)
{
char *p=str, *eqdel=".-?";
while (*p)
if (*p==eqdel[0] || *p==eqdel[1] || *p==eqdel[2] || isalpha(*p++))
return(1);
return(0);
}
int blankline (char *str)
{
char *p=str;
while (*p) if (isalnum(*p++)) return(0);
return(1);
}
int GetSeqFileType(FILE *fseq, int *paupseq);
int IdenticalSeqs(void);
int GetSeqFileType(FILE *fseq, int *paupseq)
{
/* paupstart="begin data" and paupend="matrix" identify paup seq files.
Modify if necessary.
*/
int lline=1000;
char line[1000], *paupstart="begin data",*paupend="matrix", *p;
char *ntax="ntax",*nchar="nchar";
if(fscanf(fseq,"%d%d",&com.ns,&com.ls)==2) { *paupseq=0; return(0); }
*paupseq=1;
puts("\nseq file is not paml format. Trying to read it as a paup file.");
for ( ; ; ) {
if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");
strcase(line,0);
if(strstr(line,paupstart)) break;
}
for ( ; ; ) {
if(fgets(line,lline,fseq)==NULL) error2("seq err2: EOF");
strcase(line,0);
if((p=strstr(line,ntax))!=NULL) {
while (*p != '=') { if(*p==0) error2("seq err"); p++; }
sscanf(p+1,"%d", &com.ns);
if((p=strstr(line,nchar))==NULL) error2("expect nchar");
while (*p != '=') { if(*p==0) error2("expect ="); p++; }
sscanf(p+1,"%d", &com.ls);
break;
}
}
/* printf("\nns: %d\tls: %d\n", com.ns, com.ls); */
for ( ; ; ) {
if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");
strcase(line,0);
if (strstr(line,paupend)) break;
}
return(0);
}
/* whether to keep site-to-pattern matching information in com.pose */
static unsigned char *posec=NULL; /* needed if (com.ngene>1) */
static int keeppose=0, *posei=NULL; /* needed if (com.ngene>254) */
int ReadSeq (FILE *fout, FILE *fseq)
{
/* read in sequence, translate into protein (CODON2AAseq), and
This counts ngene but does not initialize lgene[].
It also codes (transforms) the sequences.
com.seqtype: 0=nucleotides; 1=codons; 2:AAs; 3:CODON2AAs; 4:BINs
posec[] (instead of com.pose[]) is used to store gene marks.
ls/3 gene marks for codon sequences.
char opt_c[]="AKGI";
A:alpha given. K:kappa given
G:many genes, I:interlaved format
lcode: # of characters in the final sequence
*/
char *p,*p1, eq='.', *line;
int i,j,k, ch, noptline=0, lspname=LSPNAME, miss=0, paupseq;
int lline=10000,lt[NS], igroup, lcode, Sequential=1,basecoding=0;
int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
int gap=(n31==3?3:10), nchar=(com.seqtype==AAseq?20:4);
int h,b[3]={0};
char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq)?BASEs:(com.seqtype==2?AAs:BINs));
char str[4]=" ";
str[0]=0; h=-1; b[0]=-1; /* avoid warning */
keeppose=com.print;
#ifdef CODEML
if(com.seqtype==1 && com.NSsites) keeppose=1;
#endif
if(posec) free(posec); if(posei) free(posei);
if (com.seqtype==4) error2("seqtype==BINs, check with author");
if (noisy>=9 && (com.seqtype<=CODONseq||com.seqtype==CODON2AAseq)) {
puts("\n\nAmbiguity character definition table:\n");
FOR (i,(int)strlen(BASEs)) {
printf("%c (%d): ", BASEs[i],nBASEs[i]);
FOR(j,nBASEs[i]) printf("%c ", EquateNUC[i][j]);
FPN(F0);
}
}
GetSeqFileType(fseq, &paupseq);
if (com.ns>NS) error2("too many sequences.. raise NS?");
if (com.ls%n31!=0) {
printf ("\n%d nucleotides, not a multiple of 3!", com.ls); exit(-1);
}
lcode=com.ls/n31;
if (noisy) printf ("\nns = %d \tls = %d\n", com.ns, com.ls);
FOR(j,com.ns) {
if (com.spname[j]) free(com.spname[j]);
if (com.z[j]) free(com.z[j]);
com.spname[j]=(char*)malloc((LSPNAME+1)*sizeof(char));
if((com.z[j]=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom z");
FOR(i,lspname) com.spname[j][i]=0;
}
com.rgene[0]=1; com.ngene=1;
lline=max2(lline, com.ls/n31*(n31+1)+lspname+50);
if((line=(char*)malloc(lline*sizeof(char)))==NULL) error2("oom line");
if(paupseq) goto readseq;
/* first line */
if (!fgets(line,lline,fseq)) error2("ReadSeq: first line");
for (j=0; j<lline && line[j] && line[j]!='\n'; j++) {
if (!isalnum(line[j])) continue;
line[j]=(char)toupper(line[j]);
switch (line[j]) {
case 'G': noptline++; break;
case 'C': basecoding=1; break;
case 'S': Sequential=1; break;
case 'I': Sequential=0; break;
default :
printf ("\nBad option '%c' in first line of seqfile\n", line[j]);
exit (-1);
}
}
if (strchr(line,'C')) { /* protein-coding DNA sequences */
if(com.seqtype==2) error2("option C?");
else if (com.seqtype==0) {
if (com.ls%3!=0 || noptline<1) error2("option C?");
com.ngene=3; FOR(i,3) com.lgene[i]=com.ls/3;
#if BASEML
com.coding=1;
if((posec=(unsigned char*)malloc(com.ls*sizeof(char)))==NULL)
error2("oom posec");
for (i=0;i<com.ls;i++) posec[i]=(char)(i%3);
#endif
}
noptline--;
}
/* option lines */
FOR (j, noptline) {
line[0]=0;
while (!isalnum(line[0])) line[0]=(char)fgetc(fseq);
line[0]=(char)toupper(line[0]);
switch (line[0]) {
case ('G') :
if(basecoding) error2("incorrect option format, use GC?\n");
if (fscanf(fseq,"%d",&com.ngene)!=1) error2("expecting #gene here..");
if (com.ngene>NGENE) error2("raise NGENE?");
if(com.ngene>254) {
if((posei=(int*)malloc(com.ls*sizeof(int)))==NULL)
error2("oom posei");
}
else if(com.ngene>1)
if((posec=(unsigned char*)malloc(com.ls*sizeof(char)))==NULL)
error2("oom posec");
if (com.fix_rgene) { /* specified in the main program? */
puts ("reading rates for genes from the seq. file, correct?");
FOR (k, com.ngene) if (fscanf (fseq, "%lf", &com.rgene[k]) != 1)
error2("fix_rgene?");
}
fgets(line,lline,fseq);
if (!blankline(line)) { /* #sites in each gene on the 2nd line */
for (i=0,p=line; i<com.ngene; i++) {
while (*p && !isalnum(*p)) p++;
if (sscanf(p,"%d",&com.lgene[i])!=1) break;
while (*p && isalnum(*p)) p++;
}
for (; i<com.ngene; i++)
if (fscanf(fseq,"%d", &com.lgene[i])!=1) error2("EOF at lgene");
if(!fgets(line,lline,fseq)) error2("sequence file, gene lengths");
for(i=0,k=0; i<com.ngene; k+=com.lgene[i],i++)
FOR(j,com.lgene[i])
if(com.ngene>254) posei[k+j]=i;
else posec[k+j]=(unsigned char)i;
for (i=0,k=0; i<com.ngene; i++) k+=com.lgene[i];
if (k!=lcode) {
matIout(F0, com.lgene, 1, com.ngene);
printf("\n%6d != %d", lcode, k);
error2("total length over genes is not equal to sequence length");
}
}
else { /* site marks on later line(s) */
for(k=0; k<lcode;) {
if (com.ngene>9) fscanf(fseq,"%d", &ch);
else {
do ch=fgetc(fseq); while (!isdigit(ch));
ch=ch-(int)'1'+1; /* assumes 1,2,...,9 are consecutive */
}
if (ch<1 || ch>com.ngene)
{ printf("\ngene mark %d at %d?\n", ch, k+1); exit (-1); }
if(com.ngene>254) posei[k++]=ch-1;
else posec[k++]=(unsigned char)(ch-1);
}
if(!fgets(line,lline,fseq)) error2("sequence file, gene marks");
}
break;
default :
printf ("..Bad option '%c' in option lines in seqfile\n", line[0]);
exit (-1);
}
}
readseq:
/* read sequence */
if (Sequential) { /* sequential */
if (noisy) printf ("Sequential format..\n");
FOR (j,com.ns) {
lspname=LSPNAME;
FOR (i, 2*lspname) line[i]='\0';
if (!fgets (line, lline, fseq)) error2("EOF?");
if (blankline(line)) {
if (PopEmptyLines (fseq, lline, line))
{ printf("err: empty line (seq %d)\n",j+1); exit(-1); }
}
p=line+(line[0]=='=' || line[0]=='>') ;
while(isspace(*p)) p++;
if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;
strncpy (com.spname[j], p, lspname);
k=strlen(com.spname[j]);
p+=(k<lspname?k:lspname);
for (; k>0; k--) /* trim spaces */
if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;
else break;
if (noisy>=2) printf ("Reading seq #%2d: %s\n", j+1, com.spname[j]);
for (k=0; k<com.ls; p++) {
while (*p=='\n' || *p=='\0') {
p=fgets(line, lline, fseq);
if(p==NULL)
{ printf("\nEOF at site %d, seq %d\n", k+1,j+1); exit(-1); }
}
*p=(char)toupper(*p);
if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
*p='T';
p1=strchr(pch,*p);
if (p1 && p1-pch>=nchar)
miss=1;
if (*p==eq) {
if (j==0) error2(". in 1st seq.?");
com.z[j][k]=com.z[0][k]; k++;
}
else if (p1)
com.z[j][k++]=*p;
else if (isalpha(*p))
{ printf("\nerr: %c at %d seq %d.",*p,k+1,j+1); exit(0); }
else if (*p==(char)EOF) error2("EOF?");
} /* for(k) */
if(strchr(p,'\n')==NULL) /* pop up line return */
while((ch=fgetc(fseq))!='\n' && ch!=EOF) ;
} /* for (j,com.ns) */
}
else { /* interlaved */
if (noisy) printf ("Interlaved format..\n");
FOR (j, com.ns) lt[j]=0; /* temporary seq length */
for (igroup=0; ; igroup++) {
/*
printf ("\nreading block %d ", igroup+1); matIout(F0,lt,1,com.ns);*/
FOR (j, com.ns) if (lt[j]<com.ls) break;
if (j==com.ns) break;
FOR (j,com.ns) {
if (!fgets(line,lline,fseq)) {
printf("\nerr reading site %d, seq %d group %d",
lt[j]+1,j+1,igroup+1);
error2("EOF?");
}
if (!hasbase(line)) {
if (j) {
printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);
error2("empty line.");
}
else
if (PopEmptyLines(fseq,lline,line)==-1) {
printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);
error2("EOF?");
}
}
p=line;
if (igroup==0) {
lspname=LSPNAME;
while(isspace(*p)) p++;
if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;
strncpy (com.spname[j], p, lspname);
k=strlen(com.spname[j]);
p+=(k<lspname?k:lspname);
for (; k>0; k--) /* trim spaces */
if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;
else break;
if(noisy>=2) printf("Reading seq #%2d: %s\r",j+1,com.spname[j]);
}
for (; *p && *p!='\n'; p++) {
if (lt[j]==com.ls) break;
*p=(char)toupper(*p);
if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
*p='T';
p1=strchr(pch,*p);
if (p1 && p1-pch>=nchar)
miss=1;
if (*p==eq) {
if (j==0) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -