ureadseq.c
来自「EM算法的改进」· C语言 代码 · 共 1,911 行 · 第 1/4 页
C
1,911 行
check %= 10000; *checktotal += check; *checktotal %= 10000; return check;}/* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */const unsigned long crctab[] = { 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, 0x2d02ef8dL};unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)/*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */{ register unsigned long c = 0xffffffffUL; register long n = seqlen; /* tbailey; 11-11-96: move seq++ out of macro! */ while (n--) { c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8); seq++; } c= c ^ 0xffffffffUL; *checktotal += c; return c;}short getseqtype( const char *seq, const long seqlen){ /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */ char c; short i, maxtest; short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0; maxtest = min(300, seqlen); for (i = 0; i < maxtest; i++) { c = to_upper(seq[i]); if (strchr(protonly, c)) po++; else if (strchr(primenuc,c)) { na++; if (c == 'T') nt++; else if (c == 'U') nu++; } else if (strchr(aminos,c)) aa++; else if (strchr(seqsymbols,c)) ns++; else if (isalpha((int)c)) no++; } if ((no > 0) || (po+aa+na == 0)) return kOtherSeq; /* ?? test for probability of kOtherSeq ?, e.g., else if (po+aa+na / maxtest < 0.70) return kOtherSeq; */ else if (po > 0) return kAmino; else if (aa == 0) { if (nu > nt) return kRNA; else return kDNA; } else if (na > aa) return kNucleic; else return kAmino;} /* getseqtype */char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen){ register char *a, *b; register long i; char *newseq; *newlen= 0; if (!seq) return NULL; newseq = (char*) malloc(seqlen+1); if (!newseq) return NULL; for (a= (char*)seq, b=newseq, i=0; *a!=0; a++) if (*a != gapc) { *b++= *a; i++; } *b= '\0'; newseq = (char*) realloc(newseq, i+1); *newlen= i; return newseq;}/***char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \{\\fonttbl\ {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\ {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\ {\\f5\\froman Times;}{\\f6\\froman Palatino;}\ {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\{\\stylesheet\ {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\ {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\ {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";char *rtftail = "}";****/short writeSeq(FILE *outf, const char *seq, const long seqlen, const short outform, const char *seqid)/* dump sequence to standard output */{ const short kSpaceAll = -9;#define kMaxseqwidth 250 boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */ short numline = 0; /* only true if we are writing seq number line (for interleave) */ boolean numright = false, numleft = false; boolean nameright = false, nameleft = false; short namewidth = 8, numwidth = 8; short spacer = 0, width = 50, tab = 0; /* new parameters: width, spacer, those above... */ short linesout = 0, seqtype = kNucleic; long i, j, l, l1, ibase; char idword[31], endstr[10]; char seqnamestore[MAXLINE], *seqname = seqnamestore; char s[kMaxseqwidth], *cp=NULL; char nameform[10], numform[10], nocountsymbols[10]; unsigned long checksum = 0, checktotal = 0; gPretty.atseq++; skipwhitespace(seqid); /*l = min(128, strlen(seqid)); */ strlcpy(seqnamestore, seqid, MAXLINE); /*seqname[l] = 0; */ sscanf( seqname, "%30s", idword); sprintf(numform, "%ld", seqlen); numwidth= strlen(numform)+1; nameform[0]= '\0'; if (strstr(seqname,"checksum") != NULL) { cp = strstr(seqname,"bases"); if (cp!=NULL) { for ( ; (cp!=seqname) && (*cp!=','); cp--) ; if (cp!=seqname) *cp=0; } } strcpy( endstr,""); l1 = 0; if (outform == kGCG || outform == kMSF) checksum = GCGchecksum(seq, seqlen, &checktotal); else checksum = seqchecksum(seq, seqlen, &checktotal); switch (outform) { case kPlain: case kUnknown: /* no header, just sequence */ strcpy(endstr,"\n"); /* end w/ extra blank line */ break; case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */ case kGenBank: fprintf(outf,"LOCUS %s %ld bp\n", idword, seqlen); fprintf(outf,"DEFINITION %s, %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); /* fprintf(outf,"ACCESSION %s\n", accnum); */ fprintf(outf,"ORIGIN \n"); spacer = 11; numleft = true; numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */ strcpy(endstr, "\n//"); linesout += 4; break; case kPIR: /* somewhat like genbank... \\\*/ /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */ fprintf(outf,"ENTRY %s \n", idword); fprintf(outf,"TITLE %s, %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); /* fprintf(outf,"ACCESSION %s\n", accnum); */ fprintf(outf,"SEQUENCE \n"); numwidth = 7; width= 30; spacer = kSpaceAll; numleft = true; strcpy(endstr, "\n///"); /* run a top number line for PIR */ for (j=0; j<numwidth; j++) fputc(' ',outf); for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j); fputc('\n',outf); linesout += 5; break; case kNBRF: if (getseqtype(seq, seqlen) == kAmino) fprintf(outf,">P1;%s\n", idword); else fprintf(outf,">DL;%s\n", idword); fprintf(outf,"%s, %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); spacer = 11; strcpy(endstr,"*\n"); linesout += 3; break; case kEMBL: fprintf(outf,"ID %s\n", idword); /* fprintf(outf,"AC %s\n", accnum); */ fprintf(outf,"DE %s, %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); fprintf(outf,"SQ %ld BP\n", seqlen); strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ tab = 4; /** added 31jan91 */ spacer = 11; /** added 31jan91 */ width = 60; linesout += 4; break; case kGCG: fprintf(outf,"%s\n", seqname); /* fprintf(outf,"ACCESSION %s\n", accnum); */ fprintf(outf," %s Length: %ld (today) Check: %d ..\n", idword, seqlen, (int)checksum); spacer = 11; numleft = true; strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ linesout += 3; break; case kStrider: /* ?? map ?*/ fprintf(outf,"; ### from DNA Strider ;-)\n"); fprintf(outf,"; DNA sequence %s, %ld bases, %X checksum.\n;\n", seqname, seqlen, (int)checksum); strcpy(endstr, "\n//"); linesout += 3; break; case kFitch: fprintf(outf,"%s, %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); spacer = 4; width = 60; linesout += 1; break; case kPhylip2: case kPhylip4: /* this is version 3.2/3.4 -- simplest way to write version 3.3 is to write as version 3.2, then re-read file and interleave the species lines */ if (strlen(idword)>10) idword[10] = 0; fprintf(outf,"%-10s ",idword); l1 = -1; tab = 12; spacer = 11; break; case kASN1: seqtype= getseqtype(seq, seqlen); switch (seqtype) { case kDNA : cp= "dna"; break; case kRNA : cp= "rna"; break; case kNucleic : cp= "na"; break; case kAmino : cp= "aa"; break; case kOtherSeq: cp= "not-set"; break; } fprintf(outf," seq {\n"); fprintf(outf," id { local id %d },\n", gPretty.atseq); fprintf(outf," descr { title \"%s\" },\n", seqid); fprintf(outf," inst {\n"); fprintf(outf," repr raw, mol %s, length %ld,\ntopology linear,\n", cp, seqlen); fprintf(outf," seq-data\n"); if (seqtype == kAmino) fprintf(outf," iupacaa \""); else fprintf(outf," iupacna \""); l1 = 17; spacer = 0; width = 78; tab = 0; strcpy(endstr,"\"\n } } ,"); linesout += 7; break; case kPAUP: nameleft= true; namewidth = 9; spacer = 21; width = 100; tab = 0; /* 1; */ /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */ /* do a header comment line for paup */ fprintf(outf,"[Name: %-16s Len:%6ld Check: %8X]\n", idword, seqlen, (int)checksum); linesout += 1; break; case kPretty: numline= gPretty.numline; baseonlynum= gPretty.baseonlynum; namewidth = gPretty.namewidth; numright = gPretty.numright; numleft = gPretty.numleft; nameright = gPretty.nameright; nameleft = gPretty.nameleft; spacer = gPretty.spacer + 1; width = gPretty.seqwidth; tab = gPretty.tab; /* also add rtf formatting w/ font, size, style */ if (gPretty.nametop) { fprintf(outf,"Name: %-16s Len:%6ld Check: %8X\n", idword, seqlen, (int)checksum); linesout++; } break; case kMSF: fprintf(outf," Name: %-16s Len:%6ld Check: %5d Weight: 1.00\n", idword, seqlen, (int)checksum); linesout++; nameleft= true; namewidth= 15; /* need MAX namewidth here... */ sprintf(nameform, "%%+%ds ",namewidth); spacer = 11; width = 50; tab = 0; /* 1; */ break; case kIG: fprintf(outf,";%s, %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); fprintf(outf,"%s\n", idword); strcpy(endstr,"1"); /* == linear dna */ linesout += 2; break; default : case kZuker: /* don't attempt Zuker's ftn format */ case kPearson: fprintf(outf,">%s %ld bases, %X checksum.\n", seqname, seqlen, (int)checksum); linesout += 1; break; } if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth); if (numline) sprintf(numform, "%%%ds ",numwidth); else sprintf(numform, "%%%dd ",numwidth); strcpy( nocountsymbols, kNocountsymbols); if (baseonlynum) { if (strchr(nocountsymbols,gPretty.gapchar)==NULL) { strcat(nocountsymbols," "); nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar; } if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) { *cp= ' '; } } if (numline) { *idword= 0; } width = min(width,kMaxseqwidth); for (i=0, l=0, ibase = 1; i < seqlen; ) { if (l1 < 0) l1 = 0; else if (l1 == 0) { if (nameleft) fprintf(outf, nameform, idword); if (numleft) { if (numline) fprintf(outf, numform, ""); else fprintf(outf, numform, ibase);} for (j=0; j<tab; j++) fputc(' ',outf); } l1++; /* don't count spaces for width*/ if (numline) { if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) { if (numline==1) fputc(' ',outf); s[l++] = ' '; } if (l1 % 10 == 1 || l1 == width) { if (numline==1) fprintf(outf,"%-9ld ",i+1); s[l++]= '|'; /* == put a number here */ } else s[l++]= ' '; i++; } else { if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) s[l++] = ' '; if (!baseonlynum) ibase++; else if (0==strchr(nocountsymbols,seq[i])) ibase++; s[l++] = seq[i++]; } if (l1 == width || i == seqlen) { if (outform==kPretty) for ( ; l1<width; l1++) { if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) s[l++] = ' '; s[l++]=' '; /* pad w/ blanks */ } s[l] = '\0'; l = 0; l1 = 0; if (numline) { if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */ } else { if (i == seqlen) fprintf(outf,"%s%s",s,endstr); else fprintf(outf,"%s",s); if (numright || nameright) fputc(' ',outf); if (numright) fprintf(outf,numform, ibase-1); if (nameright) fprintf(outf, nameform,idword); } fputc('\n',outf); linesout++; } } return linesout;} /*writeSeq*//* End file: ureadseq.c */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?