ureadseq.c

来自「EM算法的改进」· C语言 代码 · 共 1,911 行 · 第 1/4 页

C
1,911
字号
  check %= 10000;  *checktotal += check;  *checktotal %= 10000;  return check;}/* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */const unsigned long crctab[] = {  0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,  0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,  0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,  0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,  0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,  0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,  0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,  0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,  0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,  0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,  0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,  0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,  0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,  0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,  0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,  0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,  0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,  0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,  0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,  0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,  0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,  0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,  0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,  0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,  0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,  0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,  0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,  0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,  0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,  0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,  0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,  0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,  0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,  0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,  0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,  0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,  0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,  0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,  0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,  0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,  0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,  0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,  0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,  0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,  0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,  0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,  0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,  0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,  0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,  0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,  0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,  0x2d02ef8dL};unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)/*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */{  register unsigned long c = 0xffffffffUL;  register long n = seqlen;  /* tbailey; 11-11-96: move seq++ out of macro! */  while (n--) {    c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8);     seq++;  }  c= c ^ 0xffffffffUL;  *checktotal += c;  return c;}short getseqtype( const char *seq, const long seqlen){ /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */  char  c;  short i, maxtest;  short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;  maxtest = min(300, seqlen);  for (i = 0; i < maxtest; i++) {    c = to_upper(seq[i]);    if (strchr(protonly, c)) po++;    else if (strchr(primenuc,c)) {      na++;      if (c == 'T') nt++;      else if (c == 'U') nu++;      }    else if (strchr(aminos,c)) aa++;    else if (strchr(seqsymbols,c)) ns++;    else if (isalpha((int)c)) no++;    }  if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;  /* ?? test for probability of kOtherSeq ?, e.g.,  else if (po+aa+na / maxtest < 0.70) return kOtherSeq;  */  else if (po > 0) return kAmino;  else if (aa == 0) {    if (nu > nt) return kRNA;    else return kDNA;    }  else if (na > aa) return kNucleic;  else return kAmino;} /* getseqtype */char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen){  register char *a, *b;  register long i;  char  *newseq;  *newlen= 0;  if (!seq) return NULL;  newseq = (char*) malloc(seqlen+1);  if (!newseq) return NULL;  for (a= (char*)seq, b=newseq, i=0; *a!=0; a++)    if (*a != gapc) {      *b++= *a;      i++;      }  *b= '\0';  newseq = (char*) realloc(newseq, i+1);  *newlen= i;  return newseq;}/***char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \{\\fonttbl\  {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\  {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\  {\\f5\\froman Times;}{\\f6\\froman Palatino;}\  {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\{\\stylesheet\  {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\  {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\  {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";char *rtftail = "}";****/short writeSeq(FILE *outf, const char *seq, const long seqlen,                const short outform, const char *seqid)/* dump sequence to standard output */{  const short kSpaceAll = -9;#define kMaxseqwidth  250  boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */  short  numline = 0; /* only true if we are writing seq number line (for interleave) */  boolean numright = false, numleft = false;  boolean nameright = false, nameleft = false;  short   namewidth = 8, numwidth = 8;  short   spacer = 0, width  = 50, tab = 0;  /* new parameters: width, spacer, those above... */  short linesout = 0, seqtype = kNucleic;  long  i, j, l, l1, ibase;  char  idword[31], endstr[10];  char  seqnamestore[MAXLINE], *seqname = seqnamestore;  char  s[kMaxseqwidth], *cp=NULL;  char  nameform[10], numform[10], nocountsymbols[10];  unsigned long checksum = 0, checktotal = 0;  gPretty.atseq++;  skipwhitespace(seqid);  /*l = min(128, strlen(seqid)); */  strlcpy(seqnamestore, seqid, MAXLINE);  /*seqname[l] = 0; */  sscanf( seqname, "%30s", idword);  sprintf(numform, "%ld", seqlen);  numwidth= strlen(numform)+1;  nameform[0]= '\0';  if (strstr(seqname,"checksum") != NULL) {    cp = strstr(seqname,"bases");    if (cp!=NULL) {      for ( ; (cp!=seqname) && (*cp!=','); cp--) ;      if (cp!=seqname) *cp=0;      }    }  strcpy( endstr,"");  l1 = 0;  if (outform == kGCG || outform == kMSF)    checksum = GCGchecksum(seq, seqlen, &checktotal);  else    checksum = seqchecksum(seq, seqlen, &checktotal);  switch (outform) {    case kPlain:    case kUnknown:    /* no header, just sequence */      strcpy(endstr,"\n"); /* end w/ extra blank line */      break;    case kOlsen:  /* Olsen seq. editor takes plain nucs OR Genbank  */    case kGenBank:      fprintf(outf,"LOCUS       %s       %ld bp\n", idword, seqlen);      fprintf(outf,"DEFINITION  %s, %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);   /* fprintf(outf,"ACCESSION   %s\n", accnum); */      fprintf(outf,"ORIGIN      \n");      spacer = 11;      numleft = true;      numwidth = 8;  /* dgg. 1Feb93, patch for GDE fail to read short numwidth */      strcpy(endstr, "\n//");      linesout += 4;      break;    case kPIR:      /* somewhat like genbank... \\\*/      /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */      fprintf(outf,"ENTRY           %s \n", idword);      fprintf(outf,"TITLE           %s, %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);   /* fprintf(outf,"ACCESSION       %s\n", accnum); */      fprintf(outf,"SEQUENCE        \n");      numwidth = 7;      width= 30;      spacer = kSpaceAll;      numleft = true;      strcpy(endstr, "\n///");      /* run a top number line for PIR */      for (j=0; j<numwidth; j++) fputc(' ',outf);      for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j);      fputc('\n',outf);      linesout += 5;      break;    case kNBRF:      if (getseqtype(seq, seqlen) == kAmino)        fprintf(outf,">P1;%s\n", idword);      else        fprintf(outf,">DL;%s\n", idword);      fprintf(outf,"%s, %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);      spacer = 11;      strcpy(endstr,"*\n");      linesout += 3;      break;    case kEMBL:      fprintf(outf,"ID   %s\n", idword);  /*  fprintf(outf,"AC   %s\n", accnum); */      fprintf(outf,"DE   %s, %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);      fprintf(outf,"SQ             %ld BP\n", seqlen);      strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/      tab = 4;     /** added 31jan91 */      spacer = 11; /** added 31jan91 */      width = 60;      linesout += 4;      break;    case kGCG:      fprintf(outf,"%s\n", seqname);   /* fprintf(outf,"ACCESSION   %s\n", accnum); */      fprintf(outf,"    %s  Length: %ld  (today)  Check: %d  ..\n",         idword, seqlen, (int)checksum);      spacer = 11;      numleft = true;      strcpy(endstr, "\n");  /* this is insurance to help prevent misreads at eof */      linesout += 3;      break;    case kStrider: /* ?? map ?*/      fprintf(outf,"; ### from DNA Strider ;-)\n");      fprintf(outf,"; DNA sequence  %s, %ld bases, %X checksum.\n;\n",         seqname, seqlen, (int)checksum);      strcpy(endstr, "\n//");      linesout += 3;      break;    case kFitch:      fprintf(outf,"%s, %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);      spacer = 4;      width = 60;      linesout += 1;      break;    case kPhylip2:    case kPhylip4:      /* this is version 3.2/3.4 -- simplest way to write        version 3.3 is to write as version 3.2, then        re-read file and interleave the species lines */      if (strlen(idword)>10) idword[10] = 0;      fprintf(outf,"%-10s  ",idword);      l1  = -1;      tab = 12;      spacer = 11;      break;    case kASN1:      seqtype= getseqtype(seq, seqlen);      switch (seqtype) {        case kDNA     : cp= "dna"; break;        case kRNA     : cp= "rna"; break;        case kNucleic : cp= "na"; break;        case kAmino   : cp= "aa"; break;        case kOtherSeq: cp= "not-set"; break;        }      fprintf(outf,"  seq {\n");      fprintf(outf,"    id { local id %d },\n", gPretty.atseq);      fprintf(outf,"    descr { title \"%s\" },\n", seqid);      fprintf(outf,"    inst {\n");      fprintf(outf,"      repr raw, mol %s, length %ld,\ntopology linear,\n", cp, seqlen);      fprintf(outf,"      seq-data\n");      if (seqtype == kAmino)        fprintf(outf,"        iupacaa \"");      else        fprintf(outf,"        iupacna \"");      l1  = 17;      spacer = 0;      width  = 78;      tab  = 0;      strcpy(endstr,"\"\n      } } ,");      linesout += 7;      break;    case kPAUP:      nameleft= true;      namewidth = 9;      spacer = 21;      width  = 100;      tab  = 0; /* 1; */      /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */      /* do a header comment line for paup */      fprintf(outf,"[Name: %-16s  Len:%6ld  Check: %8X]\n",         idword, seqlen, (int)checksum);      linesout += 1;      break;    case kPretty:      numline= gPretty.numline;      baseonlynum= gPretty.baseonlynum;      namewidth = gPretty.namewidth;      numright = gPretty.numright;      numleft = gPretty.numleft;      nameright = gPretty.nameright;      nameleft = gPretty.nameleft;      spacer = gPretty.spacer + 1;      width  = gPretty.seqwidth;      tab  = gPretty.tab;      /* also add rtf formatting w/ font, size, style */      if (gPretty.nametop) {        fprintf(outf,"Name: %-16s  Len:%6ld  Check: %8X\n",          idword, seqlen, (int)checksum);        linesout++;        }      break;    case kMSF:      fprintf(outf," Name: %-16s Len:%6ld  Check: %5d  Weight:  1.00\n",                    idword, seqlen, (int)checksum);      linesout++;      nameleft= true;      namewidth= 15; /* need MAX namewidth here... */      sprintf(nameform, "%%+%ds ",namewidth);      spacer = 11;      width  = 50;      tab = 0; /* 1; */      break;    case kIG:      fprintf(outf,";%s, %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);      fprintf(outf,"%s\n", idword);      strcpy(endstr,"1"); /* == linear dna */      linesout += 2;      break;    default :    case kZuker: /* don't attempt Zuker's ftn format */    case kPearson:      fprintf(outf,">%s %ld bases, %X checksum.\n",         seqname, seqlen, (int)checksum);      linesout += 1;      break;    }  if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);  if (numline) sprintf(numform, "%%%ds ",numwidth);  else sprintf(numform, "%%%dd ",numwidth);  strcpy( nocountsymbols, kNocountsymbols);  if (baseonlynum) {    if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {      strcat(nocountsymbols," ");      nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;      }    if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {      *cp= ' ';      }    }  if (numline) {   *idword= 0;   }  width = min(width,kMaxseqwidth);  for (i=0, l=0, ibase = 1; i < seqlen; ) {    if (l1 < 0) l1 = 0;    else if (l1 == 0) {      if (nameleft) fprintf(outf, nameform, idword);      if (numleft) { if (numline) fprintf(outf, numform, "");                    else fprintf(outf, numform, ibase);}      for (j=0; j<tab; j++) fputc(' ',outf);      }    l1++;                 /* don't count spaces for width*/    if (numline) {      if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {        if (numline==1) fputc(' ',outf);        s[l++] = ' ';        }      if (l1 % 10 == 1 || l1 == width) {        if (numline==1) fprintf(outf,"%-9ld ",i+1);        s[l++]= '|'; /* == put a number here */        }      else s[l++]= ' ';      i++;      }    else {      if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))        s[l++] = ' ';      if (!baseonlynum) ibase++;      else if (0==strchr(nocountsymbols,seq[i])) ibase++;      s[l++] = seq[i++];      }    if (l1 == width || i == seqlen) {      if (outform==kPretty) for ( ; l1<width; l1++) {        if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))          s[l++] = ' ';        s[l++]=' '; /* pad w/ blanks */        }      s[l] = '\0';      l = 0; l1 = 0;      if (numline) {        if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */        }      else {        if (i == seqlen) fprintf(outf,"%s%s",s,endstr);        else fprintf(outf,"%s",s);        if (numright || nameright) fputc(' ',outf);        if (numright)  fprintf(outf,numform, ibase-1);        if (nameright) fprintf(outf, nameform,idword);        }      fputc('\n',outf);      linesout++;      }    }  return linesout;}  /*writeSeq*//* End file: ureadseq.c */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?