📄 selex.c
字号:
} /* Side chain % surface accessibility code */ else if (strcmp(nptr, "#=SA") == 0) { if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in #=SA line in ReadSELEX()"); } /* Aligned sequence; avoid unparsed machine comments */ else if (strncmp(nptr, "#=", 2) != 0) { if (! copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) Die("Parse error in alignment line in ReadSELEX()"); seqidx++; } /* get next line */ for (;;) { nptr = NULL; if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */ strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */ if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ if (strchr(commentsyms, *nptr) == NULL) break; /* data */ } } /* end of a block */ currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1; /* get line 1 of next block */ for (;;) { if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */ strcpy(bufcpy, buffer); if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */ if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */ } } /* end of the file */ /* Lengths in sqinfo are for raw sequence (ungapped), * and SS, SA are 0..rlen-1 not 0..alen-1. * Only the seqs with structures come out of here with lengths set. */ for (seqidx = 0; seqidx < num; seqidx++) { int apos, rpos; /* secondary structures */ if (ainfo->sqinfo[seqidx].flags & SQINFO_SS) { for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseqs[seqidx][apos])) { ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos]; rpos++; } ainfo->sqinfo[seqidx].ss[rpos] = '\0'; } /* Surface accessibility */ if (ainfo->sqinfo[seqidx].flags & SQINFO_SA) { for (apos = rpos = 0; apos < alen; apos++) if (! isgap(aseqs[seqidx][apos])) { ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos]; rpos++; } ainfo->sqinfo[seqidx].sa[rpos] = '\0'; } } /* NULL-terminate all the strings */ if (ainfo->rf != NULL) ainfo->rf[alen] = '\0'; if (ainfo->cs != NULL) ainfo->cs[alen] = '\0'; for (seqidx = 0; seqidx < num; seqidx++) aseqs[seqidx][alen] = '\0'; /* find raw sequence lengths for sqinfo */ for (seqidx = 0; seqidx < num; seqidx++) { count = 0; for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++) if (!isgap(*sptr)) count++; ainfo->sqinfo[seqidx].len = count; ainfo->sqinfo[seqidx].flags |= SQINFO_LEN; } /*************************************************** * Garbage collection and return ***************************************************/ free(blocks); if (warn_names) Warn("sequences may be in different orders in blocks of %s?", afp->fname); /* Convert back to MSA structure. (Wasteful kludge.) */ msa = MSAFromAINFO(aseqs, ainfo); MSAVerifyParse(msa); FreeAlignment(aseqs, ainfo); return msa;}/* Function: WriteSELEX() * Date: SRE, Mon Jun 14 13:13:14 1999 [St. Louis] * * Purpose: Write a SELEX file in multiblock format. * * Args: fp - file that's open for writing * msa - multiple sequence alignment object * * Returns: (void) */voidWriteSELEX(FILE *fp, MSA *msa){ actually_write_selex(fp, msa, 50); /* 50 char per block */}/* Function: WriteSELEXOneBlock() * Date: SRE, Mon Jun 14 13:14:56 1999 [St. Louis] * * Purpose: Write a SELEX alignment file in Pfam's single-block * format style. A wrapper for actually_write_selex(). * * Args: fp - file that's open for writing * msa- alignment to write * * Returns: (void) */voidWriteSELEXOneBlock(FILE *fp, MSA *msa){ actually_write_selex(fp, msa, msa->alen); /* one big block */}/* Function: actually_write_selex() * Date: SRE, Mon Jun 14 12:54:46 1999 [St. Louis] * * Purpose: Write an alignment in SELEX format to an open * file. This is the function that actually does * the work. The API's WriteSELEX() and * WriteSELEXOneBlock() are wrappers. * * Args: fp - file that's open for writing * msa - alignment to write * cpl - characters to write per line in alignment block * * Returns: (void) */static voidactually_write_selex(FILE *fp, MSA *msa, int cpl){ int i; int len = 0; int namewidth; char *buf; int currpos; buf = malloc(sizeof(char) * (cpl+101)); /* 100 chars allowed for name, etc. */ /* Figure out how much space we need for name + markup * to keep the alignment in register, for easier human viewing -- * even though Stockholm format doesn't care about visual * alignment. */ namewidth = 0; for (i = 0; i < msa->nseq; i++) if ((len = strlen(msa->sqname[i])) > namewidth) namewidth = len; if (namewidth < 6) namewidth = 6; /* minimum space for markup tags */ /* Free text comments */ for (i = 0; i < msa->ncomment; i++) fprintf(fp, "# %s\n", msa->comment[i]); if (msa->ncomment > 0) fprintf(fp, "\n"); /* Per-file annotation */ if (msa->name != NULL) fprintf(fp, "#=ID %s\n", msa->name); if (msa->acc != NULL) fprintf(fp, "#=AC %s\n", msa->acc); if (msa->desc != NULL) fprintf(fp, "#=DE %s\n", msa->desc); if (msa->au != NULL) fprintf(fp, "#=AU %s\n", msa->au); if (msa->flags & MSA_SET_GA) fprintf(fp, "#=GA %.1f %.1f\n", msa->ga1, msa->ga2); if (msa->flags & MSA_SET_NC) fprintf(fp, "#=NC %.1f %.1f\n", msa->nc1, msa->nc2); if (msa->flags & MSA_SET_TC) fprintf(fp, "#=TC %.1f %.1f\n", msa->tc1, msa->tc2); /* Per-sequence annotation */ for (i = 0; i < msa->nseq; i++) fprintf(fp, "#=SQ %-*.*s %6.4f %s %s %d..%d::%d %s\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i], "-", /* MSA has no ID field */ (msa->sqacc != NULL && msa->sqacc[i] != NULL) ? msa->sqacc[i] : "-", 0, 0, 0, /* MSA has no start, stop, olen field */ (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) ? msa->sqdesc[i] : "-"); fprintf(fp, "\n"); /* Alignment section: */ for (currpos = 0; currpos < msa->alen; currpos += cpl) { if (currpos > 0) fprintf(fp, "\n"); if (msa->ss_cons != NULL) { strncpy(buf, msa->ss_cons + currpos, cpl); buf[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=CS", buf); } if (msa->rf != NULL) { strncpy(buf, msa->rf + currpos, cpl); buf[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=RF", buf); } for (i = 0; i < msa->nseq; i++) { strncpy(buf, msa->aseq[i] + currpos, cpl); buf[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[i], buf); if (msa->ss != NULL && msa->ss[i] != NULL) { strncpy(buf, msa->ss[i] + currpos, cpl); buf[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SS", buf); } if (msa->sa != NULL && msa->sa[i] != NULL) { strncpy(buf, msa->sa[i] + currpos, cpl); buf[cpl] = '\0'; fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SA", buf); } } } free(buf);}/* Function: copy_alignment_line() * * Purpose: Given a line from an alignment file, and bounds lcol,rcol * on what part of it may be sequence, save the alignment into * aseq starting at position apos. * * name_rcol is set to the rightmost column this aseqs's name * occupies; if name_rcol >= lcol, we have a special case in * which the name intrudes into the sequence zone. */static intcopy_alignment_line(char *aseq, int apos, int name_rcol, char *buffer, int lcol, int rcol, char gapsym){ char *s1, *s2; int i; s1 = aseq + apos; s2 = buffer; /* be careful that buffer doesn't end before lcol! */ for (i = 0; i < lcol; i++) if (*s2) s2++; for (i = lcol; i <= rcol; i++) { if (*s2 == '\t') { Warn("TAB characters will corrupt a SELEX alignment! Please remove them first."); return 0; } if (name_rcol >= i) /* name intrusion special case: pad left w/ gaps */ *s1 = gapsym; /* short buffer special case: pad right w/ gaps */ else if (*s2 == '\0' || *s2 == '\n') *s1 = gapsym; else if (*s2 == ' ') /* new: disallow spaces as gap symbols */ *s1 = gapsym; else /* normal case: copy buffer into aseq */ *s1 = *s2; s1++; if (*s2) s2++; } return 1;} /* Function: DealignAseqs() * * Given an array of (num) aligned sequences aseqs, * strip the gaps. Store the raw sequences in a new allocated array. * * Caller is responsible for free'ing the memory allocated to * rseqs. * * Returns 1 on success. Returns 0 and sets squid_errno on * failure. */intDealignAseqs(char **aseqs, int num, char ***ret_rseqs){ char **rseqs; /* de-aligned sequence array */ int idx; /* counter for sequences */ int depos; /* position counter for dealigned seq*/ int apos; /* position counter for aligned seq */ int seqlen; /* length of aligned seq */ /* alloc space */ rseqs = (char **) MallocOrDie (num * sizeof(char *)); /* main loop */ for (idx = 0; idx < num; idx++) { seqlen = strlen(aseqs[idx]); /* alloc space */ rseqs[idx] = (char *) MallocOrDie ((seqlen + 1) * sizeof(char)); /* strip gaps */ depos = 0; for (apos = 0; aseqs[idx][apos] != '\0'; apos++) if (!isgap(aseqs[idx][apos])) { rseqs[idx][depos] = aseqs[idx][apos]; depos++; } rseqs[idx][depos] = '\0'; } *ret_rseqs = rseqs; return 1;}/* Function: IsSELEXFormat() * * Return TRUE if filename may be in SELEX format. * * Accuracy is sacrificed for speed; a TRUE return does * *not* guarantee that the file will pass the stricter * error-checking of ReadSELEX(). All it checks is that * the first 500 non-comment lines of a file are * blank, or if there's a second "word" on the line * it looks like sequence (i.e., it's not kOtherSeq). * * Returns TRUE or FALSE. */intIsSELEXFormat(char *filename){ FILE *fp; /* ptr to open sequence file */ char buffer[LINEBUFLEN]; char *sptr; /* ptr to first word */ int linenum; if ((fp = fopen(filename, "r")) == NULL) { squid_errno = SQERR_NOFILE; return 0; } linenum = 0; while (linenum < 500 && fgets(buffer, LINEBUFLEN, fp) != NULL) { linenum++; /* dead giveaways for extended SELEX */ if (strncmp(buffer, "#=AU", 4) == 0) goto DONE; else if (strncmp(buffer, "#=ID", 4) == 0) goto DONE; else if (strncmp(buffer, "#=AC", 4) == 0) goto DONE; else if (strncmp(buffer, "#=DE", 4) == 0) goto DONE; else if (strncmp(buffer, "#=GA", 4) == 0) goto DONE; else if (strncmp(buffer, "#=TC", 4) == 0) goto DONE; else if (strncmp(buffer, "#=NC", 4) == 0) goto DONE; else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE; else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE; else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE; else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE; /* a comment? */ if (strchr(commentsyms, *buffer) != NULL) continue; /* a blank line? */ if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue; /* a one-word line (name only) is possible, though rare */ if ((sptr = strtok(NULL, "\n")) == NULL) continue; if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;} } DONE: fclose(fp); return 1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -