📄 treesub.c

📁 一个神经网络工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
   if (com.seqtype==BASEseq && com.model==5) { /* T92 model */
      com.pi[0]=com.pi[2]=(com.pi[0]+com.pi[2])/2;
      com.pi[1]=com.pi[3]=(com.pi[1]+com.pi[3])/2;
      FOR(ig,com.ngene) {
         com.piG[ig][0]=com.piG[ig][2]=(com.piG[ig][0]+com.piG[ig][2])/2;
         com.piG[ig][1]=com.piG[ig][3]=(com.piG[ig][1]+com.piG[ig][3])/2;
      }
   }

   /* this is used only for REV & REVu in baseml and model==3 in aaml */
   getpi_sqrt (com.pi, com.pi_sqrt, com.ncode, &com.npi0);
   if(com.seqtype==AAseq) {
      for (k=0,t=0; k<nc; k++) t+=(com.pi[k]>0);
      if (t<=4)  puts("\n\a\t\tAre these a.a. sequences?");
   }
   if(!miss && com.ngene==1) {
      for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)
         if(com.fpatt[h]>1) lmax+=com.fpatt[h]*log((double)com.fpatt[h]);
   }
   if(fout) {
      if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);
      fflush(fout);
   }

   return(0);
}


void AddFreqSeqGene(int js, int ig, double pi0[], double pi[], int *missing)
{
/* This adds the character counts in sequence js in gene ig to pi, 
   using pi0, by resolving ambiguities
   The data are coded if com.cleandata==1.
*/
   char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
   int k, h, b, nc=com.ncode, nb,ib[4];
   double t;

   if(com.cleandata) {
      for(h=com.posG[ig]; h<com.posG[ig+1]; h++) 
         { b=com.z[js][h]; pi[b]+=com.fpatt[h]; }
   }
   else {
      for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
         b=strchr(pch,com.z[js][h])-pch;
         if(b<nc)
            pi[b]+=com.fpatt[h];
         else {
            *missing=1;
            if(com.seqtype==BASEseq) {
               NucListall(com.z[js][h], &nb, ib);
               for(k=0,t=0; k<nb; k++) t+=pi0[ib[k]];
               FOR(k,nb) pi[ib[k]]+=pi0[ib[k]]/t*com.fpatt[h];
            }
            else if(com.seqtype==AAseq)
               for(k=0;k<20;k++) pi[k]+=pi0[k]*com.fpatt[h];
         }
      }
   }
}




int RemoveIndel(void)
{
/* Remove ambiguity characters and indels in the untranformed sequences, 
   Changing com.ls and posec[] (site marks for multiple genes).
   For codonml, com.ls is still 3*#codons
   Called at the end of ReadSeq, when posec[] are still site marks.
   All characters in com.z[][] not found in the character string pch are
   considered ambiguity characters and are removed.
*/
   int  h,k, j,js,lnew,nindel, n31,nchar;
   char b, *pch, *miss;  /* miss[h]=1 if site (codon) h is missing, 0 otherwise */

   if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq)
      { n31=3; nchar=4; pch=BASEs; }
   else {
      n31=1;
      if(com.seqtype==AAseq)        { nchar=20; pch=AAs; }
      else if(com.seqtype==BASEseq) { nchar=4; pch=BASEs; }
      else                          { nchar=2; pch=BINs; }
    }

   if (com.ls%n31) error2("ls in RemoveIndel.");
   if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)
      error2("oom miss");
   FOR (h,com.ls/n31) miss[h]=0;
   for (js=0; js<com.ns; js++) {
      for (h=0,nindel=0; h<com.ls/n31; h++) {
         for (k=0; k<n31; k++) {
            b=(char)toupper(com.z[js][h*n31+k]);
            FOR(j,nchar) if(b==pch[j]) break;
            if(j==nchar) { miss[h]=1; nindel++; }
         }
      }
      if (noisy>2 && nindel) 
         printf("\n%6d ambiguity characters in seq. %d", nindel,js+1);
   }
   if(noisy>2) {
      puts("\nThe following sites are removed: ");
      for (h=0; h<com.ls/n31; h++)  if(miss[h]) printf(" %2d", h+1);
   }

   for (h=0,lnew=0; h<com.ls/n31; h++)  {
      if(miss[h]) continue;
      for (js=0; js<com.ns; js++) {
         for (k=0; k<n31; k++)
            com.z[js][lnew*n31+k]=com.z[js][h*n31+k];
      }
      if(com.ngene>1) posec[lnew]=posec[h];
      lnew++;
   }
   com.ls=lnew*n31;
   free(miss);
   return (0);
}



int MPInformSites (void)
{
/* Outputs parsimony informative and noninformative sites into 
   two files named MPinf.seq and MPninf.seq
   Uses transformed sequences.  
   Not used for a long time.  Does not work if com.pose is NULL.  
   So choose RateAncestor=1 to force allocation of com.pose to use this routine.
*/
   char *imark, *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
   int h, i, markb[NS], inf, lsinf;
   FILE *finf, *fninf;

puts("\nMPInformSites: missing data not dealt with yet?\n");

   finf=fopen("MPinf.seq","w");
   fninf=fopen("MPninf.seq","w");
   if (finf==NULL || fninf==NULL) error2("MPInformSites: file creation error");

   puts ("\nSorting parsimony-informative sites: MPinf.seq & MPninf.seq");
   if ((imark=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom imark");
   for (h=0,lsinf=0; h<com.ls; h++) {
      for (i=0; i<com.ns; i++) markb[i]=0;
      for (i=0; i<com.ns; i++) markb[(int)com.z[i][com.pose[h]]]++;

      for (i=0,inf=0; i<com.ncode; i++)  if (markb[i]>=2)  inf++;
      if (inf>=2) { imark[h]=1; lsinf++; }
      else imark[h]=0;
   }
   fprintf (finf, "%6d%6d\n", com.ns, lsinf);
   fprintf (fninf, "%6d%6d\n", com.ns, com.ls-lsinf);
   for (i=0; i<com.ns; i++) {
      fprintf (finf, "\n%s\n", com.spname[i]);
      fprintf (fninf, "\n%s\n", com.spname[i]);
      for (h=0; h<com.ls; h++)
         fprintf ((imark[h]?finf:fninf), "%c", pch[com.z[i][com.pose[h]]]);
      FPN (finf); FPN(fninf);
   }
   free (imark);
   fclose(finf);  fclose(fninf);
   return (0);
}


int PatternJC69like (FILE *fout)
{
/* further collaps of site patterns for JC69-like models, called after
   PatternWeight().  Used for nucleotide and amino acid models.
   This is now made to work with unclean data as well. (August 2002)
*/
   char zh[NS],b, whole[4]={'-','?','N'};
   int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;
   char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:NULL));

   if(com.seqtype==2) whole[2]='\0';
   for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {
      if (ig<com.ngene-1 && h==com.posG[ig+1]) { com.posG[++ig]=com.npatt; }
      /* copy data to zh, recode if possible */
      if(com.cleandata) { /* always recode */
         zh[0]=b=0; b++;
         for (j=1; j<com.ns; j++) {
            FOR(k,j) if (com.z[j][h]==com.z[k][h]) break;
            zh[j]= (char)(k<j?zh[k]:b++);
         }
      }
      else { /* not recoded if ambiguity characters exist */
         FOR(j,com.ns) zh[j]=com.z[j][h];
         for (j=0,recode=1; j<com.ns; j++) {
            if(strchr(whole, zh[j])) 
               { zh[j]=whole[0]; continue; } /* OK */
            k=strchr(pch,zh[j])-pch;
            if(k<0)
               printf("strange character %c in seq.", zh[j]);
            else if(k<com.ncode) continue;      /* OK */
            recode=0; break;
         }
         if(recode) {
            b=0;
            if(zh[0]!=whole[0]) 
               zh[0]=pch[b++];
            for (j=1; j<com.ns; j++) {
               if(zh[j]==whole[0]) continue;
               FOR(k,j) if (com.z[j][h]==com.z[k][h]) break;
               if(k<j) zh[j]=zh[k];
               else    zh[j]=pch[b++];
            }
         }
      }

      for (ht=com.posG[ig],same=0; ht<com.npatt; ht++) {
         for (j=0,same=1; j<com.ns; j++)
            if (zh[j]!=com.z[j][ht]) { same=0; break; }
         if (same) break; 
      }
      if (same)  com.fpatt[ht]+=com.fpatt[h];
      else {
         FOR (j, com.ns) com.z[j][com.npatt]=zh[j];
         com.fpatt[com.npatt++]=com.fpatt[h];
      }
      if(keeppose) 
         FOR (k, com.ls) if (com.pose[k]==h) com.pose[k]=ht;
   }     /* for (h)   */
   com.posG[com.ngene]=com.npatt;
   if (noisy>=2) printf ("\nnew no. site patterns:%7d\n", com.npatt);

   if (fout) {
      fprintf(fout,"\nnew number of site patterns = %d\n", com.npatt);
      FOR (h,com.npatt) {
         fprintf(fout," %4.0f", com.fpatt[h]);
         if((h+1)%15==0) FPN(fout);
      }
/*    for baseml only, to print out the patterns.  comment out. */

      fprintf (fout, "\nno. site patterns:%7d\n", com.npatt);
      printsma(fout,com.spname,com.z,com.ns,com.npatt,com.npatt,10,com.seqtype, com.cleandata, 0,NULL);

#if 0
{ int n[5]={0,0,0,0,0};
   FOR (h,com.npatt) {
      if(com.z[0][h]==com.z[1][h] && com.z[0][h]==com.z[2][h])       n[0]=(int)com.fpatt[h];
      else if(com.z[0][h]==com.z[1][h] && com.z[0][h]!=com.z[2][h])  n[1]=(int)com.fpatt[h];
      else if(com.z[0][h]!=com.z[1][h] && com.z[1][h]==com.z[2][h])  n[2]=(int)com.fpatt[h];
      else if(com.z[0][h]!=com.z[1][h] && com.z[0][h]==com.z[2][h])  n[3]=(int)com.fpatt[h];
      else                                                           n[4]=(int)com.fpatt[h];
   }
   fprintf(frst1, "%-15s %6d%6d%6d%6d%6d%15d\n", com.spname[0]+1, n[0],n[1],n[2],n[3],n[4], n[0]+n[1]+n[2]+n[3]+n[4]);
}
#endif

   }

  return (0);
}

int Site2Pattern (FILE *fout)
{
   int h;
   fprintf(fout,"\n\nMapping site to pattern (i.e. site %d has pattern %d):\n",
      com.ls-1, com.pose[com.ls-2]+1);
   FOR (h, com.ls) {
      fprintf (fout, "%6d", com.pose[h]+1);
      if ((h+1)%10==0) FPN (fout);
   }
   FPN (fout);
   return (0);
}

#endif


int print1seq (FILE*fout, char *z, int ls, int encoded, int pose[])
{
/* This prints out one sequence, which may be coded.  Codon seqs are coded
   as 0,1,2,...60 if called from codeml.c or 0,1,2,...,63 otherwise.
   This may be risking.  Check when use.
   z[] contains patterns if (pose!=NULL)
   This uses com.seqtype.
*/
   int i, h,hp, gap=10;
   char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), str[4]="";
   int nb=(com.seqtype==CODONseq?3:1);


   if(!encoded) {  /* raw data, not coded */
      FOR(h,ls) {
         hp=(pose?pose[h]:h);
         FOR(i,nb) 
            fputc(z[hp*nb+i],fout);
/*
         FOR(i,nb) fputc(z[(pose?pose[h]:h) *nb+i],fout);
*/
         if(com.seqtype==CODONseq || (h+1)%gap==0) fputc(' ',fout);
      }
   }
   else {    /* cleandata, coded */
      FOR(h,ls) {
         hp=(pose?pose[h]:h);
         if(com.seqtype!=CODONseq) 
            fputc(pch[z[hp]],fout);
         else {
#ifdef CODEML
            fprintf(fout,"%s",getcodon(str,FROM61[z[hp]])); /* 0,1,...,60 */
#else
            fprintf(fout,"%s",getcodon(str,z[hp]));         /* 0,1,...,63 */
#endif
         }
         if(com.seqtype==CODONseq || (h+1)%gap==0) fputc(' ',fout);
      }
   }
   return(0);
}

void printSeqs(FILE *fout, int *pose, char keep[], int format)
{
/* Print sequences into fout, using paml (format=0) or paup (format=1) formats.
   Use pose=NULL if called before site patterns are collapsed.  
   Use NULL for keep if all sequences are to be printed.
   Sequences may (com.cleandata==1) and may not (com.cleandata=0) be coded.
   com.z[] has site patterns if pose!=NULL.
   This uses com.seqtype, and com.ls is the number of codons for codon seqs.
   See notes in print1seq()
   
   format=0:PAML & PHYLIP format
          1:NEXUS, with help from John Huelsenbeck.  Thanks to John.
*/
   int j,ls1=(com.seqtype==CODONseq?3*com.ls:com.ls), nskept=com.ns, wname=30;
   char *dt=(com.seqtype==AAseq?"protein":"dna");

   if(keep) FOR(j,com.ns) nskept -= !keep[j];
   if (format==1) {  /* NEXUS format */
      fprintf(fout,"\nbegin data;\n");
      fprintf(fout,"   dimensions ntax=%d nchar=%d;\n", nskept,ls1);
      fprintf(fout,"   format datatype=%s missing=? gap=-;\n   matrix\n",dt);
   }
   else    
      fprintf(fout,"%8d %8d\n",nskept,ls1);

   for(j=0; j<com.ns; j++,FPN(fout)) {
      if(keep && !keep[j]) continue;
      fprintf(fout,"%s%-*s  ", (format?"      ":""),wname,com.spname[j]);
      print1seq(fout,com.z[j],com.ls,com.cleandata, pose);
   }
   if (format==1) fprintf(fout, "   ;\nend;");
   FPN(fout);
   fflush(fout);
}

#define gammap(x,alpha) (alpha*(1-pow(x,-1/alpha)))
/* DistanceREV () used to be here, moved to pamp. 
*/

#if (NCODE==4) 

double SeqDivergence (double x[], int model, double alpha, double *kappa)
{
/* HKY85 model, following Tamura (1993, MBE, ..)
   alpha=0 if no gamma 
   return -1 if in error.
   Check DistanceF84() if variances are wanted.
*/
   int i,j;
   double p[4], Y,R, a1,a2,b, P1,P2,Q,fd,tc,ag, GC;
   double small=1e-6,largek=999;

   if (testXMat(x))
      error2("X err..");
   for (i=0,fd=1,zero(p,4); i<4; i++) {
      fd -= x[i*4+i];
      FOR (j,4) { p[i]+=x[i*4+j]/2;  p[j]+=x[i*4+j]/2; }
   }
   P1=x[0*4+1]+x[1*4+0];
   P2=x[2*4+3]+x[3*4+2];
   Q = fd-P1-P2;
   if(fd<small/com.ls) fd=0;
   if(P1<small/com.ls) P1=0; 
   if(P2<small/com.ls) P2=0; 
   if(Q<small/com.ls) Q=0;
   Y=p[0]+p[1];    R=p[2]+p[3];  tc=p[0]*p[1]; ag=p[2]*p[3];

   switch (model) {
   case (JC69):
      FOR (i,4) p[i]=.25;
   case (F81):
      for (i=0,b=0; i<4; i++)  b += p[i]*(1-p[i]);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -