⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmsearch.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
  if((scoring_method == SUBST_MTX_PRODUCT || scoring_method == SUBST_MTX_DOT_PRODUCT ||      scoring_method == SUBST_MTX_DOT_PRODUCT_PRIOR)     && read_subst_mtx == NO) {    printf("Error: No substitution matrix supplied\n");    exit(0);  }    /* read null model */  get_null_model_multi(nullfile);    if(query == SEQ) {    /* get hmms and score all seqs against them */    while(fgets(hmm_name, MAX_LINE, hmmnamefile) != NULL) {      /* get hmm from file */      hmm_name[strlen(hmm_name)-1] = '\0';      if((hmmfile = fopen(hmm_name, "r")) == NULL) {	perror(hmm_name);	continue;      }      hmmfiletype = readhmm_check(hmmfile);      if(hmmfiletype == SINGLE_HMM) {	readhmm(hmmfile, &hmm);      }      else if(hmmfiletype == MULTI_HMM) {	readhmm_multialpha(hmmfile, &hmm);      }      hmm.subst_mtx = subst_mtxp;      hmm.subst_mtx_2 = subst_mtxp_2;      hmm.subst_mtx_3 = subst_mtxp_3;      hmm.subst_mtx_4 = subst_mtxp_4;      hmm.replacement_letters = &replacement_letters;            /* open outfile + print init info */      if((pure_hmm_name_p = strrchr(hmm_name, '/')) != NULL) {	strcpy(pure_hmm_name_a, pure_hmm_name_p+1);      }      else {	strcpy(pure_hmm_name_a, hmm_name);      }      strcpy(cur_savefile_name, outfilepath_name);      if(strlen(cur_savefile_name) != 0 && cur_savefile_name[strlen(cur_savefile_name) - 1] != '/') {	strcat(cur_savefile_name, "/");      }      strcat(cur_savefile_name, pure_hmm_name_a);      strcat(cur_savefile_name, ".res");            if((outfile = fopen(cur_savefile_name, "w")) == NULL) {	perror(cur_savefile_name);	continue;      }      else {	fprintf(outfile, "# Scores for HMM: '%s'\n\n", pure_hmm_name_a);      }            /* rewind file with the sequence names */      rewind(seqnamefile);      /* loop over the sequences */      while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) {	/* open seqfile */	seq_name[strlen(seq_name)-1] = '\0';	if((seqfile = fopen(seq_name, "r")) == NULL) {	  perror(seq_name);	  continue;	}		/* check sequence file for labels + check nolabel flag */	if(seqfile_has_labels(seqfile) == YES) {	  use_labels = YES;	}	else {	  use_labels = NO;	}	if(args_info.nolabels_given) {	  use_labels = NO;	}		/* allocate space for msa_seq_info structs */	if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	  msa_seq_infop = (struct msa_sequences_multi_s*)(malloc_or_die(1 * sizeof(struct msa_sequences_multi_s)));	}	else if(seq_format == FASTA || seq_format == STANDARD) {	  seq_info.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * 1);	  seq_info.nr_alphabets = hmm.nr_alphabets;	  seq_info.nr_seqs = 1;	  seq_info.longest_seq = 0;	  seq_info.shortest_seq = INT_MAX;	  seq_info.avg_seq_len = 0;	}		if(seq_format == FASTA) {	  if(hmm.nr_alphabets > 1) {	    printf("Error: fasta is a one alphabet format only\n");	    exit(0);	  }	  else {	    get_sequence_fasta_multi(seqfile, &seq_info, 0);	    hmm.alphabet_type = DISCRETE;	    if(use_labels == YES) {	      get_labels_multi(seqfile, &seq_info, &hmm, 0);	    }	  }	}	else if(seq_format == STANDARD) {	  get_sequence_std_multi(seqfile, &seq_info, &hmm, 0);	  if(use_labels == YES) {	    get_labels_multi(seqfile, &seq_info, &hmm, 0);	  }	}	else if(seq_format == MSA_STANDARD) {	  if(prf_mode == LEAD_SEQ) {	    get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm,					lead_seq, &replacement_letters);	  }	  else {	    get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm, -1, &replacement_letters);	  }	  	  /* get labels */	  if(use_labels == YES) {	    if(prf_mode == LEAD_SEQ) {	      get_msa_labels_multi(seqfile, msa_seq_infop, &hmm);	    }	    else if(prf_mode == ALL_SEQS) {	      get_msa_labels_all_columns_multi(seqfile, msa_seq_infop, &hmm);	    }	    else {	      printf("Internal error: strange prf_mode value\n");	      exit(0);	    }	  }	}	else if(seq_format == PROFILE) {	  get_sequences_msa_prf_multi(seqfile, priorfile,  msa_seq_infop, &hmm);	}	if(seq_format == STANDARD || seq_format == FASTA) {	  seq_info.avg_seq_len = ((int)(seq_info.avg_seq_len / seq_info.nr_seqs));	  //dump_labeled_seqs_multi(&seq_info);	}	else if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	  //dump_msa_seqs_multi(msa_seq_infop, &hmm);	  //exit(0);	}	fclose(seqfile);	/* print seq info */	if(verbose == YES) {	  printf("Running seq %s on %s\n", seq_name, hmm_name);	}		/* retrain model if run_max_d option is given */	if(run_max_d == YES) {	  if(verbose == YES) {	    printf("retraining ...\n");	  }	  if(seq_format == STANDARD || seq_format == FASTA) {	    copy_hmm_struct(&hmm, &retrain_hmm);	    baum_welch_dirichlet_multi(&retrain_hmm, seq_info.seqs, 1, NO, use_labels, NO,				       NO, multi_scoring_method, NO);	  }	  else if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	    if(prf_mode == ALL_SEQS) {	      use_lead_columns = NO;	    }	    else if(prf_mode == LEAD_SEQ) {	      use_lead_columns = YES;	    }	    copy_hmm_struct(&hmm, &retrain_hmm);	    msa_baum_welch_dirichlet_multi(&retrain_hmm, msa_seq_infop, 1, NO, use_gap_shares, use_lead_columns, use_labels,					   NO, NO, normalize, scoring_method,					   NO, multi_scoring_method, aa_freqs, aa_freqs_2, aa_freqs_3, aa_freqs_4,					   NO);	  }	  if(verbose == YES) {	    printf("retraining done\n");	  }	}		if((pure_seq_name_p = strrchr(seq_name, '/')) != NULL) {	  strcpy(pure_seq_name_a, pure_seq_name_p+1);	}	else {	  strcpy(pure_seq_name_a, seq_name);	}	fprintf(outfile, "%s\n", pure_seq_name_a);		/* get score info for this seq and print it to outfile */	if(verbose == YES) {	  printf("scoring ...\n");	}	get_scores(run_max_d, run_forward, run_backward, run_viterbi, run_n_best, seq_format, prf_mode,		   use_gap_shares, use_prior, use_labels, normalize, scoring_method, multi_scoring_method,		   print_labels, print_post_probs, print_log_like, print_log_odds, print_reversi, print_path,		   outfile);	if(verbose == YES) {	  printf("scoring done\n");	}		/* deallocate retrain_hmm */	if(run_max_d == YES) {	  hmm_garbage_collection_multi_no_dirichlet(NULL, &retrain_hmm);	}	/* deallocate seqinfo */	if(seq_format == STANDARD || seq_format == FASTA) {	  free(((seq_info.seqs))->seq_1);	  if(hmm.nr_alphabets > 1) {	    free((seq_info.seqs)->seq_2);	  }	  if(hmm.nr_alphabets > 2) {	    free((seq_info.seqs)->seq_3);	  }	  if(hmm.nr_alphabets > 3) {	    free((seq_info.seqs)->seq_4);	  }	  free(seq_info.seqs);	}	if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	  free((*(msa_seq_infop)).msa_seq_1);	  if(hmm.nr_alphabets > 1) {	    free((*(msa_seq_infop)).msa_seq_2);	  }	  if(hmm.nr_alphabets > 2) {	    free((*(msa_seq_infop)).msa_seq_3);	  }	  if(hmm.nr_alphabets > 3) {	    free((*(msa_seq_infop)).msa_seq_4);	  }	  free((*(msa_seq_infop)).gaps);	}	free(msa_seq_infop);      }            /* deallocate hmm_info */      hmm_garbage_collection_multi(hmmfile, &hmm);      fclose(outfile);    }  }    else if(query == HMM) {    /* for each sequence      * score it on all hmms and print results      * done as if query = seq, but results are saved differently */        /* print init info in outfiles */    rewind(seqnamefile);    while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) {      /* open outfile + print init info */      seq_name[strlen(seq_name)-1] = '\0';      if((pure_seq_name_p = strrchr(seq_name, '/')) != NULL) {	strcpy(pure_seq_name_a, pure_seq_name_p+1);      }      else {	strcpy(pure_seq_name_a, seq_name);      }      strcpy(cur_savefile_name, outfilepath_name);      if(strlen(cur_savefile_name) != 0 && cur_savefile_name[strlen(cur_savefile_name) - 1] != '/') {	strcat(cur_savefile_name, "/");      }      strcat(cur_savefile_name, pure_seq_name_a);      strcat(cur_savefile_name, ".res");            if((outfile = fopen(cur_savefile_name, "w")) == NULL) {	perror(cur_savefile_name);	continue;      }      else {	fprintf(outfile, "# Scores for sequence: '%s'\n\n", pure_seq_name_a);	fclose(outfile);      }    }    while(fgets(hmm_name, MAX_LINE, hmmnamefile) != NULL) {      /* get hmm from file */      hmm_name[strlen(hmm_name)-1] = '\0';      if((hmmfile = fopen(hmm_name, "r")) == NULL) {	perror(hmm_name);	continue;      }      hmmfiletype = readhmm_check(hmmfile);      if(hmmfiletype == SINGLE_HMM) {	readhmm(hmmfile, &hmm);      }      else if(hmmfiletype == MULTI_HMM) {	readhmm_multialpha(hmmfile, &hmm);      }      hmm.subst_mtx = subst_mtxp;      hmm.subst_mtx_2 = subst_mtxp_2;      hmm.subst_mtx_3 = subst_mtxp_3;      hmm.subst_mtx_4 = subst_mtxp_4;      hmm.replacement_letters = &replacement_letters;            if((pure_hmm_name_p = strrchr(hmm_name, '/')) != NULL) {	strcpy(pure_hmm_name_a, pure_hmm_name_p+1);      }      else {	strcpy(pure_hmm_name_a, hmm_name);      }      /* rewind file with the sequence names */      rewind(seqnamefile);      /* loop over the sequences */      while(fgets(seq_name, MAX_LINE, seqnamefile) != NULL) {	/* open seqfile */	seq_name[strlen(seq_name)-1] = '\0';	if((seqfile = fopen(seq_name, "r")) == NULL) {	  perror(seq_name);	  continue;	}		/* open outfile + print init info */	if((pure_seq_name_p = strrchr(seq_name, '/')) != NULL) {	  strcpy(pure_seq_name_a, pure_seq_name_p+1);	}	else {	  strcpy(pure_seq_name_a, seq_name);	}	strcpy(cur_savefile_name, outfilepath_name);	if(strlen(cur_savefile_name) != 0 && cur_savefile_name[strlen(cur_savefile_name) - 1] != '/') {	  strcat(cur_savefile_name, "/");	}	strcat(cur_savefile_name, pure_seq_name_a);	strcat(cur_savefile_name, ".res");      	if((outfile = fopen(cur_savefile_name, "a")) == NULL) {	  perror(cur_savefile_name);	  continue;	}	else {	}	/* check sequence file for labels + check nolabel flag */	if(seqfile_has_labels(seqfile) == YES) {	  use_labels = YES;	}	else {	  use_labels = NO;	}	if(args_info.nolabels_given) {	  use_labels = NO;	}		/* allocate space for msa_seq_info structs */	if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	  msa_seq_infop = (struct msa_sequences_multi_s*)(malloc_or_die(1 * sizeof(struct msa_sequences_multi_s)));	}	else if(seq_format == FASTA || seq_format == STANDARD) {	  seq_info.seqs = malloc_or_die(sizeof(struct sequence_multi_s) * 1);	  seq_info.nr_alphabets = hmm.nr_alphabets;	  seq_info.nr_seqs = 1;	  seq_info.longest_seq = 0;	  seq_info.shortest_seq = INT_MAX;	  seq_info.avg_seq_len = 0;	}		if(seq_format == FASTA) {	  if(hmm.nr_alphabets > 1) {	    printf("Error: fasta is a one alphabet format only\n");	    exit(0);	  }	  else {	    get_sequence_fasta_multi(seqfile, &seq_info, 0);	    hmm.alphabet_type = DISCRETE;	    if(use_labels == YES) {	      get_labels_multi(seqfile, &seq_info, &hmm, 0);	    }	  }	}	else if(seq_format == STANDARD) {	  get_sequence_std_multi(seqfile, &seq_info, &hmm, 0);	  if(use_labels == YES) {	    get_labels_multi(seqfile, &seq_info, &hmm, 0);	  }	}	else if(seq_format == MSA_STANDARD) {	  if(prf_mode == LEAD_SEQ) {	    get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm,					lead_seq, &replacement_letters);	  }	  else {	    get_sequences_msa_std_multi(seqfile, priorfile, msa_seq_infop, &hmm, -1, &replacement_letters);	  }	  	  /* get labels */	  if(use_labels == YES) {	    if(prf_mode == LEAD_SEQ) {	      get_msa_labels_multi(seqfile, msa_seq_infop, &hmm);	    }	    else if(prf_mode == ALL_SEQS) {	      get_msa_labels_all_columns_multi(seqfile, msa_seq_infop, &hmm);	    }	    else {	      printf("Internal error: strange prf_mode value\n");	      exit(0);	    }	  }	}	else if(seq_format == PROFILE) {	  get_sequences_msa_prf_multi(seqfile, priorfile,  msa_seq_infop, &hmm);	}	if(seq_format == STANDARD || seq_format == FASTA) {	  seq_info.avg_seq_len = ((int)(seq_info.avg_seq_len / seq_info.nr_seqs));	  //dump_labeled_seqs_multi(&seq_info);	}	else if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	  	}	fclose(seqfile);	fprintf(outfile, "%s:\n", pure_hmm_name_a);	if(verbose == YES) {	  printf("Running seq %s on %s\n", seq_name, hmm_name);	}		/* retrain model if run_max_d option is given */	if(run_max_d == YES) {	  if(verbose == YES) {	    printf("retraining ... \n");	  }	  if(seq_format == STANDARD || seq_format == FASTA) {	    copy_hmm_struct(&hmm, &retrain_hmm);	    baum_welch_dirichlet_multi(&retrain_hmm, seq_info.seqs, 1, NO, use_labels, NO,				       NO, multi_scoring_method, NO);	  }	  else if(seq_format == MSA_STANDARD || seq_format == PROFILE) {	    if(prf_mode == ALL_SEQS) {	      use_lead_columns = NO;	    }	    else if(prf_mode == LEAD_SEQ) {	      use_lead_columns = YES;	    }	    copy_hmm_struct(&hmm, &retrain_hmm);	    msa_baum_welch_dirichlet_multi(&retrain_hmm, msa_seq_infop, 1, NO, use_gap_shares, use_lead_columns, use_labels,

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -