📄 sqio.c
字号:
} /* Unambiguous decisions: */ if (nother) return kOtherSeq; if (namino == nseq) return kAmino; if (ndna == nseq) return kDNA; if (nrna == nseq) return kRNA; /* Ambiguous decisions: */ if (namino == 0) return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */ return kAmino; /* some amino acid seen; others probably short seqs, some of which may be entirely ACGT (ala,cys,gly,thr). We could be a little more sophisticated: U would be a giveaway that we're not in protein seqs */}/* Function: WriteSimpleFASTA() * Date: SRE, Tue Nov 16 18:06:00 1999 [St. Louis] * * Purpose: Just write a FASTA format sequence to a file; * minimal interface, mostly for quick and dirty programs. * * Args: fp - open file handle (stdout, possibly) * seq - sequence to output * name - name for the sequence * desc - optional description line, or NULL. * * Returns: void */voidWriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc){ char buf[61]; int len; int pos; len = strlen(seq); buf[60] = '\0'; fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : ""); for (pos = 0; pos < len; pos += 60) { strncpy(buf, seq+pos, 60); fprintf(fp, "%s\n", buf); }}intWriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo){ int numline = 0; int lines = 0, spacer = 0, width = 50, tab = 0; int i, j, l, l1, ibase; char endstr[10]; char s[100]; /* buffer for sequence */ char ss[100]; /* buffer for structure */ int checksum = 0; int seqlen; int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */ int dostruc; /* TRUE to print structure lines*/ which_case = 0; dostruc = FALSE; seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq); if (IsAlignmentFormat(outform)) Die("Tried to write an aligned format with WriteSeq() -- bad, bad."); strcpy( endstr,""); l1 = 0; checksum = GCGchecksum(seq, seqlen); switch (outform) { case SQFILE_UNKNOWN: /* no header, just sequence */ strcpy(endstr,"\n"); /* end w/ extra blank line */ break; case SQFILE_GENBANK: fprintf(outf,"LOCUS %s %d bp\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name, seqlen); fprintf(outf,"DEFINITION %s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); fprintf(outf,"ACCESSION %s\n", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); fprintf(outf,"ORIGIN \n"); spacer = 11; numline = 1; strcpy(endstr, "\n//"); break; case SQFILE_GCGDATA: fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen); fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); break; case SQFILE_PIR: fprintf(outf, "ENTRY %s\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); fprintf(outf, "TITLE %s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); fprintf(outf, "ACCESSION %s\n", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); fprintf(outf, "SUMMARY #Length %d #Checksum %d\n", sqinfo->len, checksum); fprintf(outf, "SEQUENCE\n"); fprintf(outf, " 5 10 15 20 25 30\n"); spacer = 2; /* spaces after every residue */ numline = 1; /* number lines w/ coords */ width = 30; /* 30 aa per line */ strcpy(endstr, "\n///"); break; case SQFILE_SQUID: fprintf(outf, "NAM %s\n", sqinfo->name); if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN)) fprintf(outf, "SRC %s %s %d..%d::%d\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-", (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0, (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0, (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0); if (sqinfo->flags & SQINFO_DESC) fprintf(outf, "DES %s\n", sqinfo->desc); if (sqinfo->flags & SQINFO_SS) { fprintf(outf, "SEQ +SS\n"); dostruc = TRUE; /* print structure lines too */ } else fprintf(outf, "SEQ\n"); numline = 1; /* number seq lines w/ coords */ strcpy(endstr, "\n++"); break; case SQFILE_EMBL: fprintf(outf,"ID %s\n", (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); fprintf(outf,"AC %s\n", (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); fprintf(outf,"DE %s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); fprintf(outf,"SQ %d BP\n", seqlen); strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ tab = 5; /** added 31jan91 */ spacer = 11; /** added 31jan91 */ break; case SQFILE_GCG: fprintf(outf,"%s\n", sqinfo->name); if (sqinfo->flags & SQINFO_ACC) fprintf(outf,"ACCESSION %s\n", sqinfo->acc); if (sqinfo->flags & SQINFO_DESC) fprintf(outf,"DEFINITION %s\n", sqinfo->desc); fprintf(outf," %s Length: %d (today) Check: %d ..\n", sqinfo->name, seqlen, checksum); spacer = 11; numline = 1; strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ break; case SQFILE_STRIDER: /* ?? map ?*/ fprintf(outf,"; ### from DNA Strider ;-)\n"); fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n", sqinfo->name, seqlen, checksum); strcpy(endstr, "\n//"); break; /* SRE: Don had Zuker default to Pearson, which is not intuitive or helpful, since Zuker's MFOLD can't read Pearson format. More useful to use kIG */ case SQFILE_ZUKER: which_case = 1; /* MFOLD requires upper case. */ /*FALLTHRU*/ case SQFILE_IG: fprintf(outf,";%s %s\n", sqinfo->name, (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); fprintf(outf,"%s\n", sqinfo->name); strcpy(endstr,"1"); /* == linear dna */ break; case SQFILE_RAW: /* Raw: no header at all. */ break; default : case SQFILE_FASTA: fprintf(outf,">%s %s\n", sqinfo->name, (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); break; } if (which_case == 1) s2upper(seq); if (which_case == 2) s2lower(seq); width = MIN(width,100); for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) { if (l1 < 0) l1 = 0; else if (l1 == 0) { if (numline) fprintf(outf,"%8d ",ibase); for (j=0; j<tab; j++) fputc(' ',outf); } if ((spacer != 0) && ((l+1) % spacer == 1)) { s[l] = ' '; ss[l] = ' '; l++; } s[l] = seq[i]; ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.'; l++; i++; l1++; /* don't count spaces for width*/ if (l1 == width || i == seqlen) { s[l] = ss[l] = '\0'; l = 0; l1 = 0; if (dostruc) { fprintf(outf, "%s\n", s); if (numline) fprintf(outf," "); for (j=0; j<tab; j++) fputc(' ',outf); if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr); else fprintf(outf,"%s\n",ss); } else { if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr); else fprintf(outf,"%s\n",s); } lines++; ibase = i+1; } } return lines;} /* Function: ReadMultipleRseqs() * * Purpose: Open a data file and * parse it into an array of rseqs (raw, unaligned * sequences). * * Caller is responsible for free'ing memory allocated * to ret_rseqs, ret_weights, and ret_names. * * Weights are currently only supported for MSF format. * Sequences read from all other formats will be assigned * weights of 1.0. If the caller isn't interested in * weights, it passes NULL as ret_weights. * * Returns 1 on success. Returns 0 on failure and sets * squid_errno to indicate the cause. */intReadMultipleRseqs(char *seqfile, int fformat, char ***ret_rseqs, SQINFO **ret_sqinfo, int *ret_num){ SQINFO *sqinfo; /* array of sequence optional info */ SQFILE *dbfp; /* open ptr for sequential access of file */ char **rseqs; /* sequence array */ int numalloced; /* num of seqs currently alloced for */ int num; num = 0; numalloced = 16; rseqs = (char **) MallocOrDie (numalloced * sizeof(char *)); sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO)); if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0; while (ReadSeq(dbfp, dbfp->format, &rseqs[num], &(sqinfo[num]))) { num++; if (num == numalloced) /* more seqs coming, alloc more room */ { numalloced += 16; rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *)); sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO)); } } SeqfileClose(dbfp); *ret_rseqs = rseqs; *ret_sqinfo = sqinfo; *ret_num = num; return 1;}/* Function: String2SeqfileFormat() * Date: SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield] * * Purpose: Convert a string (e.g. from command line option arg) * to a format code. Case insensitive. Return * MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad. * Uses codes defined in squid.h (unaligned formats) and * msa.h (aligned formats). * * Args: s - string to convert; e.g. "stockholm" * * Returns: format code; e.g. MSAFILE_STOCKHOLM */intString2SeqfileFormat(char *s){ char *s2; int code = SQFILE_UNKNOWN; if (s == NULL) return SQFILE_UNKNOWN; s2 = sre_strdup(s, -1); s2upper(s2); if (strcmp(s2, "FASTA") == 0) code = SQFILE_FASTA; else if (strcmp(s2, "GENBANK") == 0) code = SQFILE_GENBANK; else if (strcmp(s2, "EMBL") == 0) code = SQFILE_EMBL; else if (strcmp(s2, "GCG") == 0) code = SQFILE_GCG; else if (strcmp(s2, "GCGDATA") == 0) code = SQFILE_GCGDATA; else if (strcmp(s2, "RAW") == 0) code = SQFILE_RAW; else if (strcmp(s2, "IG") == 0) code = SQFILE_IG; else if (strcmp(s2, "STRIDER") == 0) code = SQFILE_STRIDER; else if (strcmp(s2, "IDRAW") == 0) code = SQFILE_IDRAW; else if (strcmp(s2, "ZUKER") == 0) code = SQFILE_ZUKER; else if (strcmp(s2, "PIR") == 0) code = SQFILE_PIR; else if (strcmp(s2, "SQUID") == 0) code = SQFILE_SQUID; else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM; else if (strcmp(s2, "SELEX") == 0) code = MSAFILE_SELEX; else if (strcmp(s2, "MSF") == 0) code = MSAFILE_MSF; else if (strcmp(s2, "CLUSTAL") == 0) code = MSAFILE_CLUSTAL; else if (strcmp(s2, "A2M") == 0) code = MSAFILE_A2M; else if (strcmp(s2, "PHYLIP") == 0) code = MSAFILE_PHYLIP; else if (strcmp(s2, "EPS") == 0) code = MSAFILE_EPS; free(s2); return code;}char *SeqfileFormat2String(int code){ switch (code) { case SQFILE_UNKNOWN: return "unknown"; case SQFILE_FASTA: return "FASTA"; case SQFILE_GENBANK: return "Genbank"; case SQFILE_EMBL: return "EMBL"; case SQFILE_GCG: return "GCG"; case SQFILE_GCGDATA: return "GCG data library"; case SQFILE_RAW: return "raw"; case SQFILE_IG: return "Intelligenetics"; case SQFILE_STRIDER: return "MacStrider"; case SQFILE_IDRAW: return "Idraw Postscript"; case SQFILE_ZUKER: return "Zuker"; case SQFILE_PIR: return "PIR"; case SQFILE_SQUID: return "SQUID"; case MSAFILE_STOCKHOLM: return "Stockholm"; case MSAFILE_SELEX: return "SELEX"; case MSAFILE_MSF: return "MSF"; case MSAFILE_CLUSTAL: return "Clustal"; case MSAFILE_A2M: return "a2m"; case MSAFILE_PHYLIP: return "Phylip"; case MSAFILE_EPS: return "EPS"; default: Die("Bad code passed to MSAFormat2String()"); } /*NOTREACHED*/ return NULL;}/* Function: MSAToSqinfo() * Date: SRE, Tue Jul 20 14:36:56 1999 [St. Louis] * * Purpose: Take an MSA and generate a SQINFO array suitable * for use in annotating the unaligned sequences. * Return the array. * * Permanent temporary code. sqinfo was poorly designed. * it must eventually be replaced, but the odds * of this happening soon are nil, so I have to deal. * * Args: msa - the alignment * * Returns: ptr to allocated sqinfo array. * Freeing is ghastly: free in each individual sqinfo[i] * with FreeSequence(NULL, &(sqinfo[i])), then * free(sqinfo). */SQINFO *MSAToSqinfo(MSA *msa){ int idx; SQINFO *sqinfo; sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq); for (idx = 0; idx < msa->nseq; idx++) { sqinfo[idx].flags = 0; SetSeqinfoString(&(sqinfo[idx]), msa->sqname[idx], SQINFO_NAME); SetSeqinfoString(&(sqinfo[idx]), MSAGetSeqAccession(msa, idx), SQINFO_ACC); SetSeqinfoString(&(sqinfo[idx]), MSAGetSeqDescription(msa, idx), SQINFO_DESC); if (msa->ss != NULL && msa->ss[idx] != NULL) { MakeDealignedString(msa->aseq[idx], msa->alen, msa->ss[idx], &(sqinfo[idx].ss)); sqinfo[idx].flags |= SQINFO_SS; } if (msa->sa != NULL && msa->sa[idx] != NULL) { MakeDealignedString(msa->aseq[idx], msa->alen, msa->sa[idx], &(sqinfo[idx].sa)); sqinfo[idx].flags |= SQINFO_SA; } sqinfo[idx].len = DealignedLength(msa->aseq[idx]); sqinfo[idx].flags |= SQINFO_LEN; } return sqinfo;}/* cc -o sqio_test -DA_QUIET_DAY -L. sqio.c -lsquid */#ifdef A_QUIET_DAY#include "ssi.h"intmain(int argc, char **argv){ FILE *fp; char *filename; char *buf; int len; int mode = 3; SSIOFFSET off; filename = argv[1]; if (mode == 1) { buf = malloc(sizeof(char) * 256); if ((fp = fopen(filename, "r")) == NULL) Die("open of %s failed", filename); while (fgets(buf, 255, fp) != NULL) ; fclose(fp); free(buf); } else if (mode == 2) { if ((fp = fopen(filename, "r")) == NULL) Die("open of %s failed", filename); buf = NULL; len = 0; while (sre_fgets(&buf, &len, fp) != NULL) SSIGetFilePosition(fp, SSI_OFFSET_I32, &off); fclose(fp); free(buf); } else if (mode == 3) { SQFILE *dbfp; SQINFO info; if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL) Die("open of %s failed", filename); while (ReadSeq(dbfp, dbfp->format, &buf, &info)) { SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off); FreeSequence(buf, &info); } SeqfileClose(dbfp); } } #endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -