📄 comp_lib2.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 5 页
字号:
      if (m_msg.quiet == 1)	s_abort("Query sequence undefined","");    l1:	fputs (iprompt1, stdout);      fflush  (stdout);      if (fgets (m_msg.tname, MAX_FN, stdin) == NULL)	s_abort ("Unable to read query library name","");      m_msg.tname[MAX_FN-1]='\0';      if ((bp=strchr(m_msg.tname,'\n'))!=NULL) *bp='\0';      if (m_msg.tname[0] == '\0') goto l1;  }    /* Open query library */  if ((q_file_p= openlib(m_msg.tname, m_msg.qdnaseq,qascii,!m_msg.quiet,NULL))==NULL) {    s_abort(" cannot open library ",m_msg.tname);  }  /* Fetch first sequence */  qlib = 0;  m_msg.q_offset = 0l;  qlcont = 0;  m_msg.n0 =     QGETLIB (aa0[0], MAXTST, m_msg.qtitle, sizeof(m_msg.qtitle),	     &qseek, &qlcont,q_file_p,&m_msg.q_off);  if ((bp=strchr(m_msg.qtitle,' '))!=NULL) *bp='\0';  strncpy(info_qlabel,m_msg.qtitle,sizeof(info_qlabel));  if (bp != NULL) *bp = ' ';  info_qlabel[sizeof(info_qlabel)-1]='\0';  /* if annotations are included in sequence, remove them */  if (m_msg.q_ann_flg) {    m_msg.n0 = ann_scan(aa0[0],m_msg.n0,&m_msg,m_msg.qdnaseq);  }  if (m_msg.term_code && !(m_msg.qdnaseq==SEQT_DNA || m_msg.qdnaseq==SEQT_RNA) &&      aa0[0][m_msg.n0-1]!='*') {    aa0[0][m_msg.n0++]='*';    aa0[0][m_msg.n0]=0;  }  /* check for subset */  if (q_file_p->opt_text[0]!='\0') {    if (q_file_p->opt_text[0]=='-') {      sstart=0; sscanf(&q_file_p->opt_text[1],"%d",&sstop);    }    else {      sscanf(&q_file_p->opt_text[0],"%d-%d",&sstart,&sstop);      sstart--;      if (sstop <= 0 ) sstop = BIGNUM;    }    for (id=0,is=sstart; is<min(m_msg.n0,sstop); ) {      aa0[0][id++]=aa0[0][is++];    }    aa0[0][id]=0;    m_msg.n0 = min(m_msg.n0,sstop)-sstart;    m_msg.q_off += sstart;  }  /* do this all the time */  /* #if defined(SW_ALTIVEC) || defined(SW_SSE2) */  /* for ALTIVEC, must pad with SEQ_PAD NULL's */  for (id=0; id<SEQ_PAD; id++) {aa0[0][m_msg.n0+id]=0;}  /* #endif */  if (qlcont) {    m_msg.q_offset += m_msg.n0 - m_msg.q_overlap;  }  else {    m_msg.q_offset = 0l;  }  if (m_msg.n0 > MAXTST) {    fprintf(stderr," sequence truncated to %d\n %s\n",MAXTST,m_msg.sqnam);    if (m_msg.std_output)       fprintf(stdout," sequence truncated to %d\n %s\n",MAXTST,m_msg.sqnam);    aa0[0][MAXTST]='\0';    m_msg.n0=MAXTST;  }  if (m_msg.qdnaseq == SEQT_UNK) {  /* do automatic sequence recognition,but only for sequences > 20 residues */    if (m_msg.n0 > 20 &&	(float)scanseq(aa0[0],m_msg.n0,"ACGTUNacgtun")/(float)m_msg.n0 >0.85) {      pascii = nascii;      m_msg.qdnaseq = SEQT_DNA;    }    else {	/* its protein */      pascii = aascii;      m_msg.qdnaseq = SEQT_PROT;    }    /* modify qascii to use encoded version        cannot use memcpy() because it loses annotations     */    re_ascii(qascii,pascii);    init_ascii(pst.ext_sq_set,qascii,m_msg.qdnaseq);    m_msg.n0 = recode(aa0[0],m_msg.n0,qascii, pst.nsqx);  }  if (m_msg.n0 <= 0)    s_abort ("Query sequence length <= 0: ", m_msg.tname);#ifdef SUPERFAMNUM  m_msg.nqsfnum = nsfnum;  for (i=0; i <= nsfnum & i<10; i++) m_msg.qsfnum[i] = sfnum[i];  m_msg.nqsfnum_n = nsfnum_n;  for (i=0; i <= nsfnum_n & i<10; i++) m_msg.qsfnum_n[i] = sfnum_n[i];#endif  resetp (&m_msg, &pst);#ifndef COMP_MLIB  gettitle(m_msg.tname,m_msg.qtitle,sizeof(m_msg.qtitle));  if (m_msg.tname[0]=='-' || m_msg.tname[0]=='@') {    strncmp(m_msg.tname,m_msg.qtitle,sizeof(m_msg.tname));    if ((bp=strchr(m_msg.tname,' '))!=NULL) *bp='\0';  }#endif  /* get library file names */  if (strlen (m_msg.lname) == 0) {    if (m_msg.quiet == 1) s_abort("Library name undefined","");    libchoice(m_msg.lname,sizeof(m_msg.lname),&m_msg);  }    libselect(m_msg.lname, &m_msg);  /* Get additional parameters here */  if (!m_msg.quiet) query_parm (&m_msg, &pst);    last_init(&m_msg, &pst);  /* Allocate space for saved scores */  if ((best =        (struct beststr *)calloc((MAX_BEST+1),sizeof(struct beststr)))==NULL)    s_abort("Cannot allocate best struct","");  if ((bestp_arr =        (struct beststr **)malloc((MAX_BEST+1)*sizeof(struct beststr *)))==NULL)    s_abort("Cannot allocate bestp_arr","");  /* initialize high score boundary */  bestp_arr[0] = &best[0];  best[0].rst.score[0]=best[0].rst.score[1]=best[0].rst.score[2]= INT_MAX;  best[0].rst.escore=FLT_MIN;	/* for E()-values, lower is best */  best[0].zscore=FLT_MAX;	/* for Z-scores, bigger is best */  best++; bestp_arr++;  /* save best score sequence info */  if ((best_seqs =       (struct seq_record *)calloc((MAX_BEST+1),sizeof(struct seq_record)))==NULL)    s_abort("Cannot allocate best_seqs","");  /* allocate space for sampled scores */  if ((stats =       (struct stat_str *)calloc(MAX_STATS,sizeof(struct stat_str)))==NULL)    s_abort ("Cannot allocate stats struct","");  /* allocate space for shuffled library scores */  if ((rstats =       (struct stat_str *)calloc(m_msg.shuff_max,sizeof(struct stat_str)))==NULL)    s_abort ("Cannot allocate rstats struct","");#ifdef UNIX  /* set up signals now that input is done */  signal(SIGHUP,SIG_IGN);#endif  /* **************************************************************** */  /*      This section defines max_buf2_cnt, the average number of entries per buffer,      and max_work_buf, the total number of buffers     Use a 2 Mbyte (DEF_WORKER_BUF) buffer for each worker.  For     proteins, that means 5,000 sequences of length 400 (average).     For DNA, that means 2,000 sequences of length 1000.     To accommodate larger libraries in memory, use more buffers, not     bigger buffers.  */  if (m_msg.ldnaseq== SEQT_DNA) {    ave_seq_len = AVE_NT_LEN;    max_buf2_cnt = DEF_WORKER_BUF/AVE_NT_LEN;  }  else {    ave_seq_len = AVE_AA_LEN;    max_buf2_cnt = DEF_WORKER_BUF/AVE_AA_LEN;  }  /* however - buffer sizes should be a function of the number of     workers so that all the workers are kept busy.  Assuming a 10,000     entry library is the smallest we want to schedule, then  if (max_buf2_cnt > 10000/max_workers)     max_buf2_cnt = 10000/(2*max_workers);  */  max_buf2_cnt /= m_msg.thr_fact;  /* finally, max_work_buf should be mod 6 for tfasta/s/f */  max_buf2_cnt -= (max_buf2_cnt % 6);  /* max_work_buf is the number of buffers - if the worker buffers are     small, then make lots more buffers */  max_work_buf = (DEF_WORKER_BUF * 2 * max_workers)/(ave_seq_len * max_buf2_cnt);  if (max_work_buf < 2*max_workers) max_work_buf = 2*max_workers;  max_work_buf -= (max_work_buf%max_workers);#ifndef COMP_THR  /* if not threaded, only one (larger) buffer */  max_buf2_cnt *= max_work_buf;  max_work_buf = 1;#endif  /* first allocate space for buffer headers */  if ((lib_buf2_list = (struct buf_head *)calloc((size_t)(max_work_buf),sizeof(struct buf_head))) == NULL) {    fprintf(stderr," cannot allocate lib_buf2_list[%d]\n",max_work_buf);    exit(1);  }#ifdef COMP_THR  if ((worker_buf = (struct buf_head **)calloc((size_t)(max_work_buf),sizeof(struct buf_head *))) == NULL) {    fprintf(stderr," cannot allocate **worker_buf[%d]\n",max_work_buf);    exit(1);  }  if ((reader_buf = (struct buf_head **)calloc((size_t)(max_work_buf),sizeof(struct buf_head *))) == NULL) {    fprintf(stderr," cannot allocate **reader_buf[%d]\n",max_work_buf);    exit(1);  }#endif  /* allocate space for library buffers and results */  /* there are four structures/buffers used to keep track of     sequences/results:     (1) lib_buf2_list[] is a bhead_str array, which simply stores         whether the results are ready and the number of results         available.     (2) lib_buf2_list[].buf2 is a buf2_str, which points to a         sequence, and records frame and scores       (3) lib_buf2_list[].buf2[0]->seq points to a seq_record, which         contains information about a sequence, including an entry         into the sequence buffer described next.     (4) lib_buf2_list[].start points to the start of the sequence         buffer.  */  buf_siz = max(max_buf2_cnt*ave_seq_len, m_msg.max_tot * 4);  if (buf_siz < m_msg.max_tot) buf_siz = m_msg.max_tot;  for (i=0; i<max_work_buf; i++) {    /* allocate max_buf2_cnt buf2_str's into each buf2 */    if ((lib_buf2_list[i].buf2 =(struct buf2_str *)calloc((size_t)(max_buf2_cnt+1),						   sizeof(struct buf2_str)))        ==NULL) {      fprintf(stderr," cannot allocate buffer struct %d %d\n",i,max_buf2_cnt+1);      exit(1);    }    if ((lib_buf2_list[i].buf2[0].seq_b = 	 lib_buf2_list[i].buf2[0].seq =	 (struct seq_record *)calloc((size_t)(max_buf2_cnt+1),				     sizeof(struct seq_record)))        ==NULL) {      fprintf(stderr," cannot allocate seq_record buffer %d %d\n",i,max_buf2_cnt+1);      exit(1);    }    if ((lib_buf2_list[i].start =         (unsigned char *)calloc((size_t)(buf_siz),sizeof(unsigned char)))        ==NULL) {      fprintf(stderr," cannot allocate buffer %d: %d\n",i, buf_siz);      exit(1);    }    /* make certain there is a '\0' at the beginning */    lib_buf2_list[i].start++;    lib_buf2_list[i].buf2[0].seq_b->aa1b =       lib_buf2_list[i].buf2[0].seq->aa1b = lib_buf2_list[i].start;    /* make certain there is a '\0' at the beginning */    lib_buf2_list[i].have_results=0;#ifdef COMP_THR    reader_buf[i] = &lib_buf2_list[i];#endif  }  /* initialization of global variables for threads/buffers */#if defined(COMP_THR)#ifdef DEBUG  fprintf(stderr," max_work_buf: %d\n", max_work_buf);#endif  num_reader_bufs = max_work_buf;  filled_worker_bufs = empty_reader_bufs = 0;  num_worker_bufs = 0;  reader_done = 0;  worker_buf_workp = 0;  worker_buf_readp = 0;  reader_buf_workp = 0;  reader_buf_readp = 0;  start_thread = 1;	/* keeps threads from starting */#endif  /* Label the output */  if ((bp = (char *) strchr (m_msg.lname, ' ')) != NULL) *bp = '\0';  if (m_msg.ltitle[0] == '\0') {    strncpy(m_msg.ltitle,m_msg.lname,sizeof(m_msg.ltitle));    m_msg.ltitle[sizeof(m_msg.ltitle)-1]='\0';  }  if (m_msg.dfile[0]) fdata=fopen(m_msg.dfile,"w");#ifdef COMP_MLIB  if (m_msg.std_output) {    printf("Query: %s\n", m_msg.tname);  /*   if (m_msg.nln > 0) printf("searching %s library\n\n",m_msg.lbnames[0]); */  }  #endif  /* main loop for doing a search, getting the next query */  while(1) {    /* Initialize bestp_arr */    for (nbest = 0; nbest < MAX_BEST; nbest++)      bestp_arr[nbest] = &best[nbest];    nbest = 0;    m_msg.db.length = 0l;    m_msg.db.entries = m_msg.db.carry = 0;    qlib++;    stats_done = 0;    maxl = m_msg.max_tot - m_msg.n0 -2;	/* maxn = max library sequence space */    maxn = reset_maxn(&m_msg,maxl);    pst.maxlen = maxn;    outfd = stdout;      zbestcut = -FLT_MAX;    nstats = 0;    nrstats = 0;    /* get the last parameters */    last_params(aa0[0],m_msg.n0, &m_msg, &pst);    /*      if our function returns approximate E()-scores, we do not need to      work with raw scores and later calculate z-scores.  When      approx. E()-scores are calculated, we still need various      statistics structures, but we can get them immediately.  In this      case, find_zp() must produce a z_score (large positive is good)      from an e_score.    */    if (m_msg.escore_flg) {      pst.zsflag_f = process_hist(stats,nstats,m_msg,&pst,				  &m_msg.hist,&m_msg.pstat_void,0);      stats_done=1;    }#ifndef COMP_THR    if (m_msg.qshuffle) {      if ((aa0s=(unsigned char *)calloc(m_msg.n0+2+SEQ_PAD,sizeof(char)))==NULL) {	fprintf(stderr,"cannot allocate aa0s[%d]\n",m_msg.n0+2);	exit(1);      }      *aa0s='\0';      aa0s++;      memcpy(aa0s,aa0[0],m_msg.n0);      qshuffle(aa0s,m_msg.n0,m_msg.nm0);#if defined(SW_ALTIVEC) || defined(SW_SSE2)      /* for ALTIVEC, must pad with 16 NULL's */      for (id=0; id<SEQ_PAD; id++) {aa0s[m_msg.n0+id]=0;}#endif    }    /* storage for shuffled library sequences */    if ((aa1shuff_b = aa1shuff = (unsigned char *)malloc((maxn+2)*sizeof (char))) == NULL) {      s_abort ("Unable to allocate shuffled library sequence", "");    }    *aa1shuff=0;    aa1shuff++;    irand(0);    /* previous versions of FASTA have stored the reverse complement in       the same array as the forward query sequence.  This version       changes that, by allocating separate space for the reverse complement,       and thus reducing the demand for a large MAXLIB/MAXTRN for long queries    */    if (m_msg.qframe == 2) {      if ((aa0[1]=(unsigned char *)calloc(m_msg.n0+2+SEQ_PAD,sizeof(char)))==NULL) {	fprintf(stderr,"cannot allocate aa0[1][%d]\n",m_msg.n0+2);	exit(1);      }      *aa0[1] = '\0';      aa0[1]++;      memcpy(aa0[1],aa0[0],m_msg.n0+1);#if defined(SW_ALTIVEC) || defined(SW_SSE2)      /* for ALTIVEC, must pad with 16 NULL's */      for (id=0; id<SEQ_PAD; id++) {aa0[1][m_msg.n0+id]=0;}#endif      revcomp(aa0[1],m_msg.n0,&pst.c_nt[0]);    }    /* set aa1 for serial - threaded points aa1 to buffer */    aa1 = aa0[0] + m_msg.n0+1;	/* modified now that aa0[1] is done separately */    *aa1++='\0';#else    init_thr(max_workers, work_info, m_msg, &pst, aa0[0], max_work_buf);#endif    /* allocate space for shuffled query scores (if needed */    if (m_msg.qshuffle && qstats==NULL) {      if ((qstats =	   (struct stat_str *)calloc(m_msg.shuff_max+1,sizeof(struct stat_str)))==NULL)	s_abort ("Cannot allocate qstats struct","");    }    nqstats = 0;    if (m_msg.markx & MX_HTML) fputs("<pre>\n",stdout);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -