📄 treesub.c

📁 一个神经网络工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
                     printf("err: . in 1st seq, group %d.\n",igroup);
                     exit (-1);
                  }
                  com.z[j][lt[j]] = com.z[0][lt[j]];
                  lt[j]++;
               }
               else if (p1)
                  com.z[j][lt[j]++]=*p;
               else if (isalpha(*p)) {
                  printf("\nerr:%c at %d seq %d block %d.",
                          *p,lt[j]+1,j+1,igroup+1);
                  exit(-1);
               }
               else if (*p==(char)EOF) error2("EOF");
            }         /* for (*p) */
         }            /* for (j,com.ns) */
      }               /* for (igroup) */
   }
   free(line);


/*
{
char *za[4], *sp[4];
FOR(i,4) za[0]=NULL;
FOR(i,4) switch(toupper(com.spname[i][0])) {
   case 'H': za[0]=com.z[i]; sp[0]=com.spname[i]; break;
   case 'C': za[1]=com.z[i]; sp[1]=com.spname[i]; break;
   case 'G': za[2]=com.z[i]; sp[2]=com.spname[i]; break;
   case 'O': za[3]=com.z[i]; sp[3]=com.spname[i]; break;
   default: error2("wrong");
   }
fprintf(frst1," 4  %6d \n",com.ls);
printsma(frst1,sp,za, 4, com.ls,com.ls,gap,com.seqtype,0,0,NULL);
}
*/

   if(!miss) com.cleandata=1;
   else if (com.cleandata) {  /* remove ambiguity characters */
      if(noisy>2)  puts("\nSites with gaps or missing data are removed.");
      if(fout) {
         fprintf(fout,"\nBefore deleting alignment gaps. %d sites\n",com.ls);
         printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);
      }
      RemoveIndel ();
      lcode=com.ls/n31;
      if(fout) fprintf(fout,"\nAfter deleting gaps. %d sites\n",com.ls);
   }

   if(fout) 
      printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);
   if(n31==3) com.ls=lcode;

   /* IdenticalSeqs(); */

#ifdef CODEML
/*
   if(com.seqtype==1) Get4foldSites();
*/
   if(com.seqtype==CODON2AAseq) {
      if (noisy>2) puts("\nTranslating into AA sequences\n");
      FOR(j,com.ns) {
         if (noisy>2) printf("Translating sequence %d\n",j+1);
         DNA2protein(com.z[j], com.z[j], com.ls,com.icode);
      }
      com.seqtype=AAseq;
      if(fout) {
         fputs("\nTranslated AA Sequences\n",fout);
         fprintf(fout,"%4d  %6d",com.ns,com.ls);
         printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,10,com.seqtype,0,0,NULL);
      }
   }
#endif

   if(com.cleandata) {  /* transforming data */
      FOR(j,com.ns) 
         if(transform(com.z[j],com.ls*(com.seqtype==1?3:1),1,com.seqtype))
            error2("strange?");
#ifdef CODEML
      if(com.seqtype==1) {
         FOR(j,com.ns) 
            FOR(h,com.ls) {
               b[0]=com.z[j][h*3]; b[1]=com.z[j][h*3+1]; b[2]=com.z[j][h*3+2];
               if(FROM64[k=b[0]*16+b[1]*4+b[2]]==-1) {
                  printf("\ncodon %s in seq. %d is stop (icode=%d)\n", 
                     getcodon(str,k),j+1,com.icode);
                  exit(-1);
               }
               com.z[j][h]=(char)FROM64[k];
         }
         if(com.ls>10000) com.z[j]=(char*)realloc(com.z[j],com.ls);
      }
#endif
   }

   if(noisy>=2) printf ("\nSequences read..\n");
   if(com.ls==0) 
      error2("no sites. Got nothing to do");
   return (0);
}


int IdenticalSeqs(void)
{
/* This checks for identical sequences and create a data set of unique 
   sequences.  The file name is <SeqDataFile>.tmp.  This is casually 
   written and need more testing.
   The routine is called right after the sequence data are read.
   For codon sequences, com.ls has the number of codons, which are NOT
   coded.
*/
   char tmpf[96], keep[NS];
   FILE *ftmp;
   int is,js,h, same,nkept=com.ns;
   int ls1=com.ls*(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);

   puts("\nIdenticalSeqs: not tested\a");
   FOR(is,com.ns) keep[is]=1;
   FOR(is,com.ns)  { 
      if(!keep[is]) continue;
      FOR(js,is) {
         for(h=0,same=1; h<ls1; h++)
            if(com.z[is][h]!=com.z[js][h]) break;
         if(h==ls1) {
            printf("Seqs. %3d & %3d (%s & %s) are identical!\n",
               js+1,is+1,com.spname[js],com.spname[is]);
            keep[is]=0;
         }
      }
   }
   FOR(is,com.ns) if(!keep[is]) nkept--;
   if(nkept<com.ns) {
      strcpy(tmpf,com.seqf);  strcat(tmpf,".tmp");
      if((ftmp=fopen(tmpf,"w"))==NULL) error2("IdenticalSeqs: file error");
      printSeqs(ftmp,NULL,keep,1);
      fclose(ftmp);
      printf("\nUnique sequences collected in %s.\n",tmpf);
   }
   return(0);
}



int PatternWeight (FILE *fout)
{
/* This collaps sites into patterns, for nucleotide or amino acid sequences, 
   or transformed codon sequences.  
   posec & zt[] (space) is temporary and is copied back to com.z[]
   com.pose[i] takes value from 0 to npatt and maps sites to patterns
           (i=0,...,ls)
*/
   int *fpatti, lst=com.ls,h,ht,j,k=-1,newpatt,ig;
   int gap=(com.seqtype==CODONseq?3:10);
   int nb=(com.seqtype==1&&!com.cleandata ? 3 : 1);
   char *zt[NS], b1[NS],b3[NS][3], miss[]="-?"; /* b[][] data at site h */
   double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata;

   if(noisy) puts("Counting patterns..");
   if(com.fpatt) free(com.fpatt);
   if((com.seqtype==1 && com.ns<6) || (com.seqtype!=1 && com.ns<8))
      lst = (int)(pow(nc, com.ns*1.)+0.5);
   lst=min2(lst*com.ngene,com.ls);
   if((fpatti=(int*)malloc(lst*sizeof(int)))==NULL) error2("oom fpatti");
   if(keeppose && (com.pose=(int*)malloc(com.ls*sizeof(int)))==NULL) 
      error2("oom com.pose");
   for(j=0;j<com.ns;j++) 
      if((zt[j]=(char*)malloc(lst*nb*sizeof(char)))==NULL) error2("oom zt");

   FOR (h,lst) fpatti[h]=0;
   FOR (ig, com.ngene) com.lgene[ig]=0;

   for(ig=0,com.npatt=0; ig<com.ngene; ig++) {
      com.posG[ig]=com.npatt;
      for(h=0; h<com.ls; h++) {
         if(com.ngene>245) { if(posei[h]!=ig) continue; }
         else if (com.ngene>1 && posec[h]!=ig) continue;
         if(nb==3) {
            FOR(j,com.ns) FOR(k,nb) b3[j][k]=com.z[j][h*nb+k];
            FOR(j,com.ns) FOR(k,nb) 
               if(b3[j][k]!=miss[0] || b3[j][k]!=miss[1]) goto NotAllMiss3;
            NotAllMiss3:
            if(noisy>=9 && j==com.ns && k==nb) 
               puts("Some sites have no data (no harm).");
            for(ht=com.posG[ig],newpatt=1; ht<com.npatt; ht++) {
               FOR(j,com.ns) FOR(k,nb)
                  if(b3[j][k]!=zt[j][ht*nb+k]) goto CheckNextSite3;
               if(keeppose) com.pose[h]=ht;
               newpatt=0; break;    /* h has same data as ht */
               CheckNextSite3: ;
            }
         }
         else {
            FOR(j,com.ns) b1[j]=com.z[j][h];
            FOR(j,com.ns)
               if(b1[j]!=miss[0] || b1[j]!=miss[1]) break;
            if(noisy>=9 && j==com.ns) 
               puts("Some sites have no data (no harm).");

            for(ht=com.posG[ig],newpatt=1; ht<com.npatt; ht++) {
               FOR(j,com.ns) 
                  if(b1[j]!=zt[j][ht]) goto CheckNextSite1;
               if(keeppose) com.pose[h]=ht; 
               newpatt=0; break;    /* h has same data as ht */
               CheckNextSite1: ;
            }
         }
         com.lgene[ig]++;
         if(newpatt) {   /* h has a new pattern */
            if(nb==3) FOR(j,com.ns) FOR(k,nb) zt[j][com.npatt*nb+k]=b3[j][k];
            else      FOR(j,com.ns) zt[j][com.npatt]=b1[j];
            ht=com.npatt++;
            if(keeppose) com.pose[h]=ht;
         }
         if(com.npatt>lst) {
            printf("npatt = %d > lst = %d.. Contact author.", com.npatt,lst);
            exit(-1);
         }
         fpatti[ht]++;
         if(noisy && (h+1)%100000==0)
            printf("\r%d patterns at %d sites", com.npatt,h+1);
      }     /* for (h)  */
   }        /* for (ig) */
   if(noisy && com.ls>10000) FPN(F0);
 
   if(com.seqtype==CODONseq && com.ngene==3 &&com.lgene[0]==com.ls/3) {
      puts("\nCheck the G option in data file? (Enter)\n");
      getchar();
   }
   com.posG[com.ngene]=com.npatt;
   for(j=1; j<com.ngene; j++) com.lgene[j]+=com.lgene[j-1];

   if(com.lgene[com.ngene-1]!=com.ls) { puts("\algene vs. ls, missing data"); }

   if(com.ngene>254)    free(posei);
   else if(com.ngene>1) free(posec);
   com.fpatt=(double*)malloc(com.npatt*sizeof(double));
   for(h=0;h<com.npatt;h++) com.fpatt[h]=fpatti[h];
   free(fpatti);
   FOR(j,com.ns) {
      free(com.z[j]);
      com.z[j]=(char*)realloc(zt[j],com.npatt*nb*sizeof(char));
   }

   if (fout) {
      fprintf (fout, "\nns = %d  \tls = %d", com.ns,com.ls);
      if (com.ngene>1) {
         fprintf (fout,"\nngene =%3d, lengths =", com.ngene);
         FOR(j,com.ngene)
            fprintf (fout,"%7d", (j?com.lgene[j]-com.lgene[j-1]:com.lgene[j]));
         fprintf(fout,"\n  Starting at pattern");
         FOR(j,com.ngene) fprintf(fout,"%7d", com.posG[j]+1);
      }
      if(noisy) printf("# site patterns = %d\n", com.npatt);
      fprintf(fout,"\n# site patterns = %d\n", com.npatt);
      FOR (h,com.npatt) {
         fprintf(fout," %4.0f", com.fpatt[h]);
         if((h+1)%15==0) FPN(fout);
      }
      FPN(fout);
      if(com.seqtype==1 && com.cleandata) {
#ifdef CODEML
         printsmaCodon (fout,com.z,com.ns,com.npatt,com.npatt,1);
#endif
      }
      else {
         newpatt=com.npatt*nb;
         printsma(fout,com.spname,com.z,com.ns,newpatt,newpatt,gap,
            com.seqtype,com.cleandata,1,NULL);
      }
      FPN(fout);
   }
   return (0);
}



void AddFreqSeqGene(int js,int ig,double pi0[],double pi[],int *missing);

int Initialize (FILE *fout)
{
/* Count site patterns (com.fpatt) and calculate base or amino acid frequencies
   in genes and species.  This works on raw (uncoded) data.  
   Ambiguity characters in sequences are resolved by iteration. 
   For frequencies in each species, they are resolved within that sequence.
   For average base frequencies among species, they are resolved over all 
   species.

   This routine is called by baseml and aaml.  codonml uses another
   routine InitializeCodon()
*/
   char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), indel[]="-?";
   int wname=30, h,js,k, ig, nconstp, nc=com.ncode;
   int irf,nrf=20, miss=0;
   double pi0[20], t,lmax=0;

   PatternWeight(fout);

   if(noisy) puts("Counting frequencies..");
   if(fout)  fprintf(fout,"\nFrequencies..");
   for(h=0,nconstp=0; h<com.npatt; h++) {
      for (js=1;js<com.ns;js++)  if(com.z[js][h]!=com.z[0][h]) break;
      if (js==com.ns && com.z[0][h]!=indel[0] && com.z[0][h]!=indel[1])  
         nconstp+=(int)com.fpatt[h];
   }
   for (ig=0,zero(com.pi,nc); ig<com.ngene; ig++) {
      if (com.ngene>1)  fprintf (fout,"\n\nGene # %2d (len %4d)",
                             ig+1, com.lgene[ig]-(ig==0?0:com.lgene[ig-1]));
      fprintf(fout,"\n%*s",wname, ""); FOR(k,nc) fprintf(fout,"%7c",pch[k]);

      /* The following block calculates freqs in each species for each gene.  
         Ambiguities are resolved in each species.  com.pi and com.piG are 
         used for output only, and are not be used later with missing data.
      */
      zero(com.piG[ig],nc);
      for(js=0; js<com.ns; js++) {
         FOR(k,nc) pi0[k]=1./nc;
         for(irf=0; irf<nrf; irf++) {
            zero(com.pi,nc);
            AddFreqSeqGene(js, ig, pi0, com.pi, &miss);
            t=sum(com.pi,nc);
            if(t<1e-10) error2("empty sequences?");
            abyx(1/t, com.pi, nc);
            t=distance(com.pi,pi0,nc);
            if((t=distance(com.pi,pi0,nc))<1e-8) 
               break;
            xtoy(com.pi,pi0,nc);
         }   /* for(irf) */
         fprintf(fout,"\n%-*s",wname,com.spname[js]);
         FOR(k,nc) fprintf(fout,"%7.4f",com.pi[k]);
         FOR(k,nc) com.piG[ig][k]+=com.pi[k]/com.ns;
      }    /* for(js,ns) */
      if(com.ngene>1) {
         fprintf(fout,"\n\n%-*s",wname,"Mean");
         FOR(k,nc) fprintf(fout,"%7.4f",com.piG[ig][k]);
      }
   }

   /* If there are missing data, the following block calculates freqs 
      in each gene for com.piG[], as well com.pi[] for the entire sequence.  
      Ambiguities are resolved over entire data sets across species (within 
      each gene for com.piG[]).  These are used in ML calculation later.
   */
   if(!miss) {
      for (ig=0,zero(com.pi,nc); ig<com.ngene; ig++) {
         t=(ig==0?com.lgene[0]:com.lgene[ig]-com.lgene[ig-1])/(double)com.ls;
         FOR(k,nc)  com.pi[k]+=com.piG[ig][k]*t;
      }
   }
   else {
      for (ig=0; ig<com.ngene; ig++) { 
         xtoy(com.piG[ig],pi0,nc);
         for(irf=0; irf<nrf; irf++) {  /* com.piG[] */
            zero(com.piG[ig],nc);
            FOR(js,com.ns)
               AddFreqSeqGene(js, ig, pi0, com.piG[ig], &miss);
            t=sum(com.piG[ig],nc);
            if(t<1e-10) error2("empty sequences?");
            abyx(1/t, com.piG[ig], nc);
            if(distance(com.piG[ig],pi0,nc)<1e-8) break;
            xtoy(com.piG[ig],pi0,nc);
         }         /* for(irf) */
      }            /* for(ig) */
      zero(pi0,nc);
      FOR(k,nc) FOR(ig,com.ngene) pi0[k]+=com.piG[ig][k]/com.ngene;
      for(irf=0; irf<nrf; irf++) {  /* com.pi[] */
         zero(com.pi,nc);
         FOR(ig, com.ngene)  FOR(js,com.ns)
            AddFreqSeqGene(js, ig, pi0, com.pi, &miss);
         abyx(1/sum(com.pi,nc), com.pi, nc);
         if(distance(com.pi,pi0,nc)<1e-8) break;
         xtoy(com.pi,pi0,nc);
      }            /* for(ig) */
   }
   fprintf (fout, "\n\n%-*s", wname, "Average");
   FOR (k,nc) fprintf(fout,"%7.4f", com.pi[k]);
   if(miss) fputs("\n(Ambiguity characters are used to calculate freqs.)\n",fout);

   fprintf (fout,"\n\n# constant sites: %6d (%.2f%%)",
            nconstp, (double)nconstp*100./com.ls);

   if (com.model==0 || (com.seqtype==BASEseq && com.model==1)) {
      fillxc(com.pi, 1./nc, nc);
      FOR(ig,com.ngene) xtoy (com.pi, com.piG[ig], nc);
   }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -