📄 treesub.c
字号:
if (com.seqtype==BASEseq && com.model==5) { /* T92 model */
com.pi[0]=com.pi[2]=(com.pi[0]+com.pi[2])/2;
com.pi[1]=com.pi[3]=(com.pi[1]+com.pi[3])/2;
FOR(ig,com.ngene) {
com.piG[ig][0]=com.piG[ig][2]=(com.piG[ig][0]+com.piG[ig][2])/2;
com.piG[ig][1]=com.piG[ig][3]=(com.piG[ig][1]+com.piG[ig][3])/2;
}
}
/* this is used only for REV & REVu in baseml and model==3 in aaml */
getpi_sqrt (com.pi, com.pi_sqrt, com.ncode, &com.npi0);
if(com.seqtype==AAseq) {
for (k=0,t=0; k<nc; k++) t+=(com.pi[k]>0);
if (t<=4) puts("\n\a\t\tAre these a.a. sequences?");
}
if(!miss && com.ngene==1) {
for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)
if(com.fpatt[h]>1) lmax+=com.fpatt[h]*log((double)com.fpatt[h]);
}
if(fout) {
if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);
fflush(fout);
}
return(0);
}
void AddFreqSeqGene(int js, int ig, double pi0[], double pi[], int *missing)
{
/* This adds the character counts in sequence js in gene ig to pi,
using pi0, by resolving ambiguities
The data are coded if com.cleandata==1.
*/
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
int k, h, b, nc=com.ncode, nb,ib[4];
double t;
if(com.cleandata) {
for(h=com.posG[ig]; h<com.posG[ig+1]; h++)
{ b=com.z[js][h]; pi[b]+=com.fpatt[h]; }
}
else {
for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
b=strchr(pch,com.z[js][h])-pch;
if(b<nc)
pi[b]+=com.fpatt[h];
else {
*missing=1;
if(com.seqtype==BASEseq) {
NucListall(com.z[js][h], &nb, ib);
for(k=0,t=0; k<nb; k++) t+=pi0[ib[k]];
FOR(k,nb) pi[ib[k]]+=pi0[ib[k]]/t*com.fpatt[h];
}
else if(com.seqtype==AAseq)
for(k=0;k<20;k++) pi[k]+=pi0[k]*com.fpatt[h];
}
}
}
}
int RemoveIndel(void)
{
/* Remove ambiguity characters and indels in the untranformed sequences,
Changing com.ls and posec[] (site marks for multiple genes).
For codonml, com.ls is still 3*#codons
Called at the end of ReadSeq, when posec[] are still site marks.
All characters in com.z[][] not found in the character string pch are
considered ambiguity characters and are removed.
*/
int h,k, j,js,lnew,nindel, n31,nchar;
char b, *pch, *miss; /* miss[h]=1 if site (codon) h is missing, 0 otherwise */
if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq)
{ n31=3; nchar=4; pch=BASEs; }
else {
n31=1;
if(com.seqtype==AAseq) { nchar=20; pch=AAs; }
else if(com.seqtype==BASEseq) { nchar=4; pch=BASEs; }
else { nchar=2; pch=BINs; }
}
if (com.ls%n31) error2("ls in RemoveIndel.");
if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)
error2("oom miss");
FOR (h,com.ls/n31) miss[h]=0;
for (js=0; js<com.ns; js++) {
for (h=0,nindel=0; h<com.ls/n31; h++) {
for (k=0; k<n31; k++) {
b=(char)toupper(com.z[js][h*n31+k]);
FOR(j,nchar) if(b==pch[j]) break;
if(j==nchar) { miss[h]=1; nindel++; }
}
}
if (noisy>2 && nindel)
printf("\n%6d ambiguity characters in seq. %d", nindel,js+1);
}
if(noisy>2) {
puts("\nThe following sites are removed: ");
for (h=0; h<com.ls/n31; h++) if(miss[h]) printf(" %2d", h+1);
}
for (h=0,lnew=0; h<com.ls/n31; h++) {
if(miss[h]) continue;
for (js=0; js<com.ns; js++) {
for (k=0; k<n31; k++)
com.z[js][lnew*n31+k]=com.z[js][h*n31+k];
}
if(com.ngene>1) posec[lnew]=posec[h];
lnew++;
}
com.ls=lnew*n31;
free(miss);
return (0);
}
int MPInformSites (void)
{
/* Outputs parsimony informative and noninformative sites into
two files named MPinf.seq and MPninf.seq
Uses transformed sequences.
Not used for a long time. Does not work if com.pose is NULL.
So choose RateAncestor=1 to force allocation of com.pose to use this routine.
*/
char *imark, *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs));
int h, i, markb[NS], inf, lsinf;
FILE *finf, *fninf;
puts("\nMPInformSites: missing data not dealt with yet?\n");
finf=fopen("MPinf.seq","w");
fninf=fopen("MPninf.seq","w");
if (finf==NULL || fninf==NULL) error2("MPInformSites: file creation error");
puts ("\nSorting parsimony-informative sites: MPinf.seq & MPninf.seq");
if ((imark=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom imark");
for (h=0,lsinf=0; h<com.ls; h++) {
for (i=0; i<com.ns; i++) markb[i]=0;
for (i=0; i<com.ns; i++) markb[(int)com.z[i][com.pose[h]]]++;
for (i=0,inf=0; i<com.ncode; i++) if (markb[i]>=2) inf++;
if (inf>=2) { imark[h]=1; lsinf++; }
else imark[h]=0;
}
fprintf (finf, "%6d%6d\n", com.ns, lsinf);
fprintf (fninf, "%6d%6d\n", com.ns, com.ls-lsinf);
for (i=0; i<com.ns; i++) {
fprintf (finf, "\n%s\n", com.spname[i]);
fprintf (fninf, "\n%s\n", com.spname[i]);
for (h=0; h<com.ls; h++)
fprintf ((imark[h]?finf:fninf), "%c", pch[com.z[i][com.pose[h]]]);
FPN (finf); FPN(fninf);
}
free (imark);
fclose(finf); fclose(fninf);
return (0);
}
int PatternJC69like (FILE *fout)
{
/* further collaps of site patterns for JC69-like models, called after
PatternWeight(). Used for nucleotide and amino acid models.
This is now made to work with unclean data as well. (August 2002)
*/
char zh[NS],b, whole[4]={'-','?','N'};
int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:NULL));
if(com.seqtype==2) whole[2]='\0';
for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {
if (ig<com.ngene-1 && h==com.posG[ig+1]) { com.posG[++ig]=com.npatt; }
/* copy data to zh, recode if possible */
if(com.cleandata) { /* always recode */
zh[0]=b=0; b++;
for (j=1; j<com.ns; j++) {
FOR(k,j) if (com.z[j][h]==com.z[k][h]) break;
zh[j]= (char)(k<j?zh[k]:b++);
}
}
else { /* not recoded if ambiguity characters exist */
FOR(j,com.ns) zh[j]=com.z[j][h];
for (j=0,recode=1; j<com.ns; j++) {
if(strchr(whole, zh[j]))
{ zh[j]=whole[0]; continue; } /* OK */
k=strchr(pch,zh[j])-pch;
if(k<0)
printf("strange character %c in seq.", zh[j]);
else if(k<com.ncode) continue; /* OK */
recode=0; break;
}
if(recode) {
b=0;
if(zh[0]!=whole[0])
zh[0]=pch[b++];
for (j=1; j<com.ns; j++) {
if(zh[j]==whole[0]) continue;
FOR(k,j) if (com.z[j][h]==com.z[k][h]) break;
if(k<j) zh[j]=zh[k];
else zh[j]=pch[b++];
}
}
}
for (ht=com.posG[ig],same=0; ht<com.npatt; ht++) {
for (j=0,same=1; j<com.ns; j++)
if (zh[j]!=com.z[j][ht]) { same=0; break; }
if (same) break;
}
if (same) com.fpatt[ht]+=com.fpatt[h];
else {
FOR (j, com.ns) com.z[j][com.npatt]=zh[j];
com.fpatt[com.npatt++]=com.fpatt[h];
}
if(keeppose)
FOR (k, com.ls) if (com.pose[k]==h) com.pose[k]=ht;
} /* for (h) */
com.posG[com.ngene]=com.npatt;
if (noisy>=2) printf ("\nnew no. site patterns:%7d\n", com.npatt);
if (fout) {
fprintf(fout,"\nnew number of site patterns = %d\n", com.npatt);
FOR (h,com.npatt) {
fprintf(fout," %4.0f", com.fpatt[h]);
if((h+1)%15==0) FPN(fout);
}
/* for baseml only, to print out the patterns. comment out. */
fprintf (fout, "\nno. site patterns:%7d\n", com.npatt);
printsma(fout,com.spname,com.z,com.ns,com.npatt,com.npatt,10,com.seqtype, com.cleandata, 0,NULL);
#if 0
{ int n[5]={0,0,0,0,0};
FOR (h,com.npatt) {
if(com.z[0][h]==com.z[1][h] && com.z[0][h]==com.z[2][h]) n[0]=(int)com.fpatt[h];
else if(com.z[0][h]==com.z[1][h] && com.z[0][h]!=com.z[2][h]) n[1]=(int)com.fpatt[h];
else if(com.z[0][h]!=com.z[1][h] && com.z[1][h]==com.z[2][h]) n[2]=(int)com.fpatt[h];
else if(com.z[0][h]!=com.z[1][h] && com.z[0][h]==com.z[2][h]) n[3]=(int)com.fpatt[h];
else n[4]=(int)com.fpatt[h];
}
fprintf(frst1, "%-15s %6d%6d%6d%6d%6d%15d\n", com.spname[0]+1, n[0],n[1],n[2],n[3],n[4], n[0]+n[1]+n[2]+n[3]+n[4]);
}
#endif
}
return (0);
}
int Site2Pattern (FILE *fout)
{
int h;
fprintf(fout,"\n\nMapping site to pattern (i.e. site %d has pattern %d):\n",
com.ls-1, com.pose[com.ls-2]+1);
FOR (h, com.ls) {
fprintf (fout, "%6d", com.pose[h]+1);
if ((h+1)%10==0) FPN (fout);
}
FPN (fout);
return (0);
}
#endif
int print1seq (FILE*fout, char *z, int ls, int encoded, int pose[])
{
/* This prints out one sequence, which may be coded. Codon seqs are coded
as 0,1,2,...60 if called from codeml.c or 0,1,2,...,63 otherwise.
This may be risking. Check when use.
z[] contains patterns if (pose!=NULL)
This uses com.seqtype.
*/
int i, h,hp, gap=10;
char *pch=(com.seqtype==0?BASEs:(com.seqtype==2?AAs:BINs)), str[4]="";
int nb=(com.seqtype==CODONseq?3:1);
if(!encoded) { /* raw data, not coded */
FOR(h,ls) {
hp=(pose?pose[h]:h);
FOR(i,nb)
fputc(z[hp*nb+i],fout);
/*
FOR(i,nb) fputc(z[(pose?pose[h]:h) *nb+i],fout);
*/
if(com.seqtype==CODONseq || (h+1)%gap==0) fputc(' ',fout);
}
}
else { /* cleandata, coded */
FOR(h,ls) {
hp=(pose?pose[h]:h);
if(com.seqtype!=CODONseq)
fputc(pch[z[hp]],fout);
else {
#ifdef CODEML
fprintf(fout,"%s",getcodon(str,FROM61[z[hp]])); /* 0,1,...,60 */
#else
fprintf(fout,"%s",getcodon(str,z[hp])); /* 0,1,...,63 */
#endif
}
if(com.seqtype==CODONseq || (h+1)%gap==0) fputc(' ',fout);
}
}
return(0);
}
void printSeqs(FILE *fout, int *pose, char keep[], int format)
{
/* Print sequences into fout, using paml (format=0) or paup (format=1) formats.
Use pose=NULL if called before site patterns are collapsed.
Use NULL for keep if all sequences are to be printed.
Sequences may (com.cleandata==1) and may not (com.cleandata=0) be coded.
com.z[] has site patterns if pose!=NULL.
This uses com.seqtype, and com.ls is the number of codons for codon seqs.
See notes in print1seq()
format=0:PAML & PHYLIP format
1:NEXUS, with help from John Huelsenbeck. Thanks to John.
*/
int j,ls1=(com.seqtype==CODONseq?3*com.ls:com.ls), nskept=com.ns, wname=30;
char *dt=(com.seqtype==AAseq?"protein":"dna");
if(keep) FOR(j,com.ns) nskept -= !keep[j];
if (format==1) { /* NEXUS format */
fprintf(fout,"\nbegin data;\n");
fprintf(fout," dimensions ntax=%d nchar=%d;\n", nskept,ls1);
fprintf(fout," format datatype=%s missing=? gap=-;\n matrix\n",dt);
}
else
fprintf(fout,"%8d %8d\n",nskept,ls1);
for(j=0; j<com.ns; j++,FPN(fout)) {
if(keep && !keep[j]) continue;
fprintf(fout,"%s%-*s ", (format?" ":""),wname,com.spname[j]);
print1seq(fout,com.z[j],com.ls,com.cleandata, pose);
}
if (format==1) fprintf(fout, " ;\nend;");
FPN(fout);
fflush(fout);
}
#define gammap(x,alpha) (alpha*(1-pow(x,-1/alpha)))
/* DistanceREV () used to be here, moved to pamp.
*/
#if (NCODE==4)
double SeqDivergence (double x[], int model, double alpha, double *kappa)
{
/* HKY85 model, following Tamura (1993, MBE, ..)
alpha=0 if no gamma
return -1 if in error.
Check DistanceF84() if variances are wanted.
*/
int i,j;
double p[4], Y,R, a1,a2,b, P1,P2,Q,fd,tc,ag, GC;
double small=1e-6,largek=999;
if (testXMat(x))
error2("X err..");
for (i=0,fd=1,zero(p,4); i<4; i++) {
fd -= x[i*4+i];
FOR (j,4) { p[i]+=x[i*4+j]/2; p[j]+=x[i*4+j]/2; }
}
P1=x[0*4+1]+x[1*4+0];
P2=x[2*4+3]+x[3*4+2];
Q = fd-P1-P2;
if(fd<small/com.ls) fd=0;
if(P1<small/com.ls) P1=0;
if(P2<small/com.ls) P2=0;
if(Q<small/com.ls) Q=0;
Y=p[0]+p[1]; R=p[2]+p[3]; tc=p[0]*p[1]; ag=p[2]*p[3];
switch (model) {
case (JC69):
FOR (i,4) p[i]=.25;
case (F81):
for (i=0,b=0; i<4; i++) b += p[i]*(1-p[i]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -