📄 msa.c
字号:
void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline){ switch (outfmt) { case MSAFILE_A2M: WriteA2M(fp, msa); break; case MSAFILE_CLUSTAL: WriteClustal(fp, msa); break; case MSAFILE_MSF: WriteMSF(fp, msa); break; case MSAFILE_PHYLIP: WritePhylip(fp, msa); break; case MSAFILE_SELEX: WriteSELEX(fp, msa); break; case MSAFILE_STOCKHOLM: if (do_oneline) WriteStockholmOneBlock(fp, msa); else WriteStockholm(fp, msa); break; default: Die("can't write. no such alignment format %d\n", outfmt); }}/* Function: MSAGetSeqidx() * Date: SRE, Wed May 19 15:08:25 1999 [St. Louis] * * Purpose: From a sequence name, return seqidx appropriate * for an MSA structure. * * 1) try to guess the index. (pass -1 if you can't guess) * 2) Look up name in msa's hashtable. * 3) If it's a new name, store in msa's hashtable; * expand allocs as needed; * save sqname. * * Args: msa - alignment object * name - a sequence name * guess - a guess at the right index, or -1 if no guess. * * Returns: seqidx */intMSAGetSeqidx(MSA *msa, char *name, int guess){ int seqidx; /* can we guess? */ if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0) return guess; /* else, a lookup in the index */ if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0) return seqidx; /* else, it's a new name */ seqidx = GKIStoreKey(msa->index, name); if (seqidx >= msa->nseqalloc) MSAExpand(msa); msa->sqname[seqidx] = sre_strdup(name, -1); msa->nseq++; return seqidx;}/* Function: MSAFromAINFO() * Date: SRE, Mon Jun 14 11:22:24 1999 [St. Louis] * * Purpose: Convert the old aseq/ainfo alignment structure * to new MSA structure. Enables more rapid conversion * of codebase to the new world order. * * Args: aseq - [0..nseq-1][0..alen-1] alignment * ainfo - old-style optional info * * Returns: MSA * */MSA *MSAFromAINFO(char **aseq, AINFO *ainfo){ MSA *msa; int i, j; msa = MSAAlloc(ainfo->nseq, ainfo->alen); for (i = 0; i < ainfo->nseq; i++) { strcpy(msa->aseq[i], aseq[i]); msa->wgt[i] = ainfo->wgt[i]; msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1); msa->sqlen[i] = msa->alen; GKIStoreKey(msa->index, msa->sqname[i]); if (ainfo->sqinfo[i].flags & SQINFO_ACC) MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc); if (ainfo->sqinfo[i].flags & SQINFO_DESC) MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc); if (ainfo->sqinfo[i].flags & SQINFO_SS) { if (msa->ss == NULL) { msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc); msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc); for (j = 0; j < msa->nseqalloc; j++) { msa->ss[j] = NULL; msa->sslen[j] = 0; } } MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i])); msa->sslen[i] = msa->alen; } if (ainfo->sqinfo[i].flags & SQINFO_SA) { if (msa->sa == NULL) { msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc); msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc); for (j = 0; j < msa->nseqalloc; j++) { msa->sa[j] = NULL; msa->salen[j] = 0; } } MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i])); msa->salen[i] = msa->alen; } } /* note that sre_strdup() returns NULL when passed NULL */ msa->name = sre_strdup(ainfo->name, -1); msa->desc = sre_strdup(ainfo->desc, -1); msa->acc = sre_strdup(ainfo->acc, -1); msa->au = sre_strdup(ainfo->au, -1); msa->ss_cons = sre_strdup(ainfo->cs, -1); msa->rf = sre_strdup(ainfo->rf, -1); if (ainfo->flags & AINFO_TC) { msa->cutoff[MSA_CUTOFF_TC1] = ainfo->tc1; msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE; msa->cutoff[MSA_CUTOFF_TC2] = ainfo->tc2; msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE; } if (ainfo->flags & AINFO_NC) { msa->cutoff[MSA_CUTOFF_NC1] = ainfo->nc1; msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE; msa->cutoff[MSA_CUTOFF_NC2] = ainfo->nc2; msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE; } if (ainfo->flags & AINFO_GA) { msa->cutoff[MSA_CUTOFF_GA1] = ainfo->ga1; msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE; msa->cutoff[MSA_CUTOFF_GA2] = ainfo->ga2; msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE; } msa->nseq = ainfo->nseq; msa->alen = ainfo->alen; return msa;}/* Function: MSAFileFormat() * Date: SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre] * * Purpose: (Attempt to) determine the format of an alignment file. * Since it rewinds the file pointer when it's done, * cannot be used on a pipe or gzip'ed file. Works by * calling SeqfileFormat() from sqio.c, then making sure * that the format is indeed an alignment. If the format * comes back as FASTA, it assumes that the format as A2M * (e.g. aligned FASTA). * * Args: fname - file to evaluate * * Returns: format code; e.g. MSAFILE_STOCKHOLM */intMSAFileFormat(MSAFILE *afp){ int fmt; fmt = SeqfileFormat(afp->f); if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M; if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt)) Die("File %s does not appear to be an alignment file;\n\rather, it appears to be an unaligned file in %s format.\n\I'm expecting an alignment file in this context.\n", afp->fname, SeqfileFormat2String(fmt)); return fmt;}/* Function: MSAMingap() * Date: SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. Louis Civil Court] * * Purpose: Remove all-gap columns from a multiple sequence alignment * and its associated per-residue data. * * Args: msa - the alignment * * Returns: (void) */voidMSAMingap(MSA *msa){ int *useme; /* array of TRUE/FALSE flags for which columns to keep */ int apos; /* position in original alignment */ int idx; /* sequence index */ useme = MallocOrDie(sizeof(int) * msa->alen); for (apos = 0; apos < msa->alen; apos++) { for (idx = 0; idx < msa->nseq; idx++) if (! isgap(msa->aseq[idx][apos])) break; if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE; } MSAShorterAlignment(msa, useme); free(useme); return;}/* Function: MSANogap() * Date: SRE, Wed Nov 17 09:59:51 1999 [St. Louis] * * Purpose: Remove all columns from a multiple sequence alignment that * contain any gaps -- used for filtering before phylogenetic * analysis. * * Args: msa - the alignment * * Returns: (void). The alignment is modified, so if you want to keep * the original for something, make a copy. */voidMSANogap(MSA *msa){ int *useme; /* array of TRUE/FALSE flags for which columns to keep */ int apos; /* position in original alignment */ int idx; /* sequence index */ useme = MallocOrDie(sizeof(int) * msa->alen); for (apos = 0; apos < msa->alen; apos++) { for (idx = 0; idx < msa->nseq; idx++) if (isgap(msa->aseq[idx][apos])) break; if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE; } MSAShorterAlignment(msa, useme); free(useme); return;}/* Function: MSAShorterAlignment() * Date: SRE, Wed Nov 17 09:49:32 1999 [St. Louis] * * Purpose: Given an array "useme" (0..alen-1) of TRUE/FALSE flags, * where TRUE means "keep this column in the new alignment": * Remove all columns annotated as "FALSE" in the useme * array. * * Args: msa - the alignment. The alignment is changed, so * if you don't want the original screwed up, make * a copy of it first. * useme - TRUE/FALSE flags for columns to keep: 0..alen-1 * * Returns: (void) */voidMSAShorterAlignment(MSA *msa, int *useme){ int apos; /* position in original alignment */ int mpos; /* position in new alignment */ int idx; /* sequence index */ int i; /* markup index */ /* Since we're minimizing, we can overwrite, using already allocated * memory. */ for (apos = 0, mpos = 0; apos < msa->alen; apos++) { if (useme[apos] == FALSE) continue; /* shift alignment and associated per-column+per-residue markup */ if (mpos != apos) { for (idx = 0; idx < msa->nseq; idx++) { msa->aseq[idx][mpos] = msa->aseq[idx][apos]; if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos]; if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos]; for (i = 0; i < msa->ngr; i++) if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos]; } if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos]; if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos]; if (msa->rf != NULL) msa->rf[mpos] = msa->rf[apos]; for (i = 0; i < msa->ngc; i++) msa->gc[i][mpos] = msa->gc[i][apos]; } mpos++; } msa->alen = mpos; /* set new length */ /* null terminate everything */ for (idx = 0; idx < msa->nseq; idx++) { msa->aseq[idx][mpos] = '\0'; if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0'; if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0'; for (i = 0; i < msa->ngr; i++) if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0'; } if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0'; if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0'; if (msa->rf != NULL) msa->rf[mpos] = '\0'; for (i = 0; i < msa->ngc; i++) msa->gc[i][mpos] = '\0'; return;}/* Function: MSASmallerAlignment() * Date: SRE, Wed Jun 30 09:56:08 1999 [St. Louis] * * Purpose: Given an array "useme" of TRUE/FALSE flags for * each sequence in an alignment, construct * and return a new alignment containing only * those sequences that are flagged useme=TRUE. * * Used by routines such as MSAFilterAlignment() * and MSASampleAlignment(). * * Limitations: * Does not copy unparsed Stockholm markup. * * Does not make assumptions about meaning of wgt; * if you want the new wgt vector renormalized, do * it yourself with FNorm(new->wgt, new->nseq). * * Args: msa -- the original (larger) alignment * useme -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include * this seq in new alignment * ret_new -- RETURN: new alignment * * Returns: void * ret_new is allocated here; free with MSAFree() */voidMSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new){ MSA *new; /* RETURN: new alignment */ int nnew; /* number of seqs in new msa (e.g. # of TRUEs) */ int oidx, nidx; /* old, new indices */ int i; nnew = 0; for (oidx = 0; oidx < msa->nseq; oidx++) if (useme[oidx]) nnew++; if (nnew == 0) { *ret_new = NULL; return; } new = MSAAlloc(nnew, 0); nidx = 0; for (oidx = 0; oidx < msa->nseq; oidx++) if (useme[oidx]) { new->aseq[nidx] = sre_strdup(msa->aseq[oidx], msa->alen); new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen); GKIStoreKey(new->index, msa->sqname[oidx]); new->wgt[nidx] = msa->wgt[oidx]; if (msa->sqacc != NULL) MSASetSeqAccession(new, nidx, msa->sqacc[oidx]); if (msa->sqdesc != NULL) MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]); if (msa->ss != NULL && msa->ss[oidx] != NULL) { if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq); new->ss[nidx] = sre_strdup(msa->ss[oidx], -1); } if (msa->sa != NULL && msa->sa[oidx] != NULL) { if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq); new->sa[nidx] = sre_strdup(msa->sa[oidx], -1); } nidx++; } new->nseq = nnew; new->alen = msa->alen; new->flags = msa->flags; new->type = msa->type; new->name = sre_strdup(msa->name, -1); new->desc = sre_strdup(msa->desc, -1); new->acc = sre_strdup(msa->acc, -1); new->au = sre_strdup(msa->au, -1); new->ss_cons = sre_strdup(msa->ss_cons, -1); new->sa_cons = sre_strdup(msa->sa_cons, -1); new->rf = sre_strdup(msa->rf, -1); for (i = 0; i < MSA_MAXCUTOFFS; i++) { new->cutoff[i] = msa->cutoff[i]; new->cutoff_is_set[i] = msa->cutoff_is_set[i]; } free(new->sqlen); MSAMingap(new); *ret_new = new; return;}/***************************************************************** * Retrieval routines * * Access to MSA structure data is possible through these routines. * I'm not doing this because of object oriented design, though * it might work in my favor someday. * I'm doing this because lots of MSA data is optional, and * checking through the chain of possible NULLs is a pain. *****************************************************************/char *MSAGetSeqAccession(MSA *msa, int idx){ if (msa->sqacc != NULL && msa->sqacc[idx] != NULL) return msa->sqacc[idx]; else return NULL;}char *MSAGetSeqDescription(MSA *msa, int idx){ if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL) return msa->sqdesc[idx]; else return NULL;}char *MSAGetSeqSS(MSA *msa, int idx){ if (msa->ss != NULL && msa->ss[idx] != NULL) return msa->ss[idx]; else return NULL;}char *MSAGetSeqSA(MSA *msa, int idx){ if (msa->sa != NULL && msa->sa[idx] != NULL) return msa->sa[idx]; else return NULL;}/***************************************************************** * Information routines * * Access information about the MSA. *****************************************************************//* Function: MSAAverageSequenceLength() * Date: SRE, Sat Apr 6 09:41:34 2002 [St. Louis] * * Purpose: Return the average length of the (unaligned) sequences * in the MSA. * * Args: msa - the alignment * * Returns: average length */floatMSAAverageSequenceLength(MSA *msa){ int i; float avg; avg = 0.; for (i = 0; i < msa->nseq; i++) avg += (float) DealignedLength(msa->aseq[i]); if (msa->nseq == 0) return 0.; else return (avg / msa->nseq);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -