⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 readseqs_multialpha.c

📁 马尔科夫模型的java版本实现
💻 C
📖 第 1 页 / 共 5 页
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include <math.h>#include <float.h>//#include <double.h>#include <limits.h>#include "structs.h"#include "funcs.h"#define MAX_LINE 60000#define DIRICHLET 12#define NONE -2#define LONGEST_SEQ -1 /* Note: this constant is also defined in hmmsearch_msa.c *///#define DEBUG_SEQ_STD//#define DEBUG_SEQ_FASTA//#define DEBUG_MSA_SEQ_STD//#define DEBUG_MSA_SEQ_PRF//#define DEBUG_PRIextern int verbose;int seqfile_has_labels(FILE *seqfile){  char row[30000];    rewind(seqfile);  while(1) {    if(fgets(row,30000,seqfile) != NULL) {      if(row[0] == '/') {	rewind(seqfile);	return YES;      }    }    else {      rewind(seqfile);      return NO;    }  }  }     void get_sequence_fasta_multi(FILE *seqfile, struct sequences_multi_s *seq_infop, int seq_nr){  int i,j,a;  int nr_letters;  int longest_seq, shortest_seq, avg_seq_len;  int inside_seq;  char c;  struct letter_s *seqsp;  nr_letters = 0;  longest_seq = seq_infop->longest_seq;  shortest_seq = seq_infop->shortest_seq;  inside_seq = NO;    while(1) {    i = fgetc(seqfile);    if(i == '>') {      break;    }    else if(i == '\s' && i == '\n') {          }    else {      printf("Sequence file does not seem to be in correct format\n");      exit(0);    }  }  rewind(seqfile);  /* Find out how much space to allocate for the sequence = count total number of letters*/  while((i = fgetc(seqfile)) != EOF) {    c = (char)i;    if(((i >= 65 && i <= 90) || (i >= 97 && i <= 122)) && inside_seq == YES) {      nr_letters++;    }    else if(c == '>') {      while((i = fgetc(seqfile)) != '\n') {	if(i == EOF) {	  printf("Sequence file is not in correct FASTA format\n");	  exit(0);	}      }      inside_seq = YES;    }    else if(c == '/') {      inside_seq = NO;    }  }#ifdef DEBUG_SEQ_FASTA  printf("nr_letters = %d\n", nr_letters);#endif  /* Allocate memory, must be freed by caller*/  if(nr_letters > seq_infop->longest_seq) {    seq_infop->longest_seq = nr_letters;  }  if(nr_letters < seq_infop->shortest_seq) {    seq_infop->shortest_seq = nr_letters;  }  seq_infop->avg_seq_len += nr_letters;  (seq_infop->seqs + seq_nr)->seq_1 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s)));  (seq_infop->seqs + seq_nr)->length = nr_letters;  //printf("%x\n", (seq_infop->seqs + seq_nr)->seq_1);  //printf("%x\n", nr_letters * 2 * sizeof(struct letter_s));/* Read sequences into memory */  seqsp = (struct letter_s*)((seq_infop->seqs + seq_nr)->seq_1);  rewind(seqfile);  inside_seq = NO;  while((c = (char)(fgetc(seqfile))) != '>' && c != EOF) {      }  if(c == '>') {    inside_seq = YES;    while((c = (char)(fgetc(seqfile))) == ' ') {          }    /* read sequence name */    a = 0;    (seq_infop->seqs + seq_nr)->name[a] = c;    a++;    while((c = (char)(fgetc(seqfile))) != ' ' && c != '\n') {      if(a < MAX_SEQ_NAME_SIZE-1) {	(seq_infop->seqs + seq_nr)->name[a] = c;	a++;      }    }    (seq_infop->seqs + seq_nr)->name[a] = '\0';    if(c != '\n') {      while((c = (char)(fgetc(seqfile))) != '\n') {	/* read rest of row and forget about it */      }    }    while((c = (char)(fgetc(seqfile))) != EOF) {      if(c == '\n' || c == ' ') {	      }      else if(c == '/') {	break;      }      else if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)) {	if(c >= 97) {	  c = c - 32; //convert to upper case	}	(seqsp->letter)[0] = c;	(seqsp->letter)[1] = '\0';	seqsp->label = '.';	seqsp++;	//printf("%x\n", seqsp);      }    }  }  else {    printf("Sequence file is not in correct FASTA format\n");    exit(0);  }} void get_sequence_std_multi(FILE *seqfile, struct sequences_multi_s *seq_infop, struct hmm_multi_s *hmmp, int seq_nr){  int i,j,a,k,last;  int nr_letters;  int longest_seq, shortest_seq, avg_seq_len;  int temp_seq_len;  int inside_seq;  int nr_alphabets, nr_alphabets_temp;  char c;  struct letter_s *seqsp;  struct sequence_multi_s temp_seq;  nr_letters = 0;  inside_seq = NO;  nr_alphabets = 0;  nr_alphabets_temp = 0;  last = '!';  char line[MAX_LINE], *cur_line;  double letter_val;    cur_line = line;  while(fgets(line, MAX_LINE, seqfile) != NULL) {    if(line[0] == '<' || line[0] == '#' || line[0] == '\s' || line[0] == '\n' || line[0] == '/') {          }    else {      printf("Sequence file does not seem to be in correct format\n");      exit(0);    }    for(i = 0; line[i] != '\0'; i++) {      if((line[i] == '>' || line[i] == '+') && i > 0 && line[i-1] != ';') {	printf("Sequence file does not seem to be in correct format\n");	printf("All letters must be followed by ';'\n");	exit(0);      }    }  }  rewind(seqfile);    /* Find out how much space to allocate for the sequences = count total number of letters*/  while((i = fgetc(seqfile)) != EOF) {    c = (char)i;    if(c == ';' && inside_seq == YES) {      nr_letters++;      last = '!';    }    else if(c == '<') {      inside_seq = YES;      nr_letters = 0;      if(last == '<') {	nr_alphabets_temp++;      }      else {	nr_alphabets_temp = 1;	last = '<';      }    }    else if(c == '#') {      inside_seq = YES;      if(last == '#') {	nr_alphabets_temp++;      }      else {	nr_alphabets_temp = 1;	last = '#';      }    }    else if((c == '>' || c == '+')&& inside_seq == YES) {      inside_seq = NO;      if(nr_alphabets_temp > nr_alphabets) {	nr_alphabets = nr_alphabets_temp;	nr_alphabets_temp = 0;      }      if(c == '>') {	switch(nr_alphabets) {	case 1: hmmp->alphabet_type = DISCRETE; break;	case 2: hmmp->alphabet_type_2 = DISCRETE; break;	case 3: hmmp->alphabet_type_3 = DISCRETE; break;	case 4: hmmp->alphabet_type_4 = DISCRETE; break;	}      }      if(c == '+') {	switch(nr_alphabets) {	case 1: hmmp->alphabet_type = CONTINUOUS; break;	case 2: hmmp->alphabet_type_2 = CONTINUOUS; break;	case 3: hmmp->alphabet_type_3 = CONTINUOUS; break;	case 4: hmmp->alphabet_type_4 = CONTINUOUS; break;	}      }      last = '!';    }    else if(c == '/') {      break;    }    else {      last = '!';    }  }#ifdef DEBUG_SEQ_STD  printf("nr_letters = %d\n", nr_letters);  printf("nr_alphabets = %d\n", nr_alphabets);#endif  /* check if nr of alphabets in sequence file corresponds to nr of alphabets in hmm file */  if(hmmp->nr_alphabets < nr_alphabets) {    printf("Warning: HMM has %d alphabets,  while sequence file contains %d alphabets\n", hmmp->nr_alphabets, nr_alphabets);    printf("Only the first %d alphabets will be read and used\n", hmmp->nr_alphabets);    nr_alphabets = hmmp->nr_alphabets;  }  else if(hmmp->nr_alphabets > nr_alphabets) {    printf("Error: HMM has %d alphabets,  while sequence file contains %d alphabets\n", hmmp->nr_alphabets, nr_alphabets);    exit(0);  }  /* Allocate memory, must be freed by caller*/  if(nr_letters > seq_infop->longest_seq) {    seq_infop->longest_seq = nr_letters;  }  if(nr_letters < seq_infop->shortest_seq) {    seq_infop->shortest_seq = nr_letters;  }  seq_infop->avg_seq_len += nr_letters;  (seq_infop->seqs + seq_nr)->seq_1 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s)));  if(nr_alphabets > 1) {    (seq_infop->seqs + seq_nr)->seq_2 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s)));  }  if(nr_alphabets > 2) {    (seq_infop->seqs + seq_nr)->seq_3 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s)));  }  if(nr_alphabets > 3) {    (seq_infop->seqs + seq_nr)->seq_4 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s)));  }  (seq_infop->seqs + seq_nr)->length = nr_letters;    /* Read sequences into memory */   rewind(seqfile);    for(k = 0; k < nr_alphabets;) {    if(k == 0) {      seqsp = (seq_infop->seqs + seq_nr)->seq_1;    }    if(k == 1) {      seqsp = (seq_infop->seqs + seq_nr)->seq_2;    }    if(k == 2) {      seqsp = (seq_infop->seqs + seq_nr)->seq_3;    }    if(k == 3) {      seqsp = (seq_infop->seqs + seq_nr)->seq_4;    }    a = 0;    c = (char)(fgetc(seqfile));    if(c != '<' && c != '#') {      /* not a sequence on this line, just continue */      if(c != '\n') {	while((c = (char)(fgetc(seqfile))) != '\n') {	}      }      continue;    }    else if(c == '<') /* this line is a sequence */ {      while((c = (char)(fgetc(seqfile))) != '>') {	if(c == '<') {	  	}	else if(c == ';') {	  seqsp->letter[a] = '\0';	  seqsp->label = '.';	  seqsp++;	  a = 0;	}	else {	  seqsp->letter[a] = c;	  a++;	}      }            seqsp->letter[0] = '\0';      seqsp++;      strcpy((seq_infop->seqs + seq_nr)->name, "\0");      if(k == 0) {	(seq_infop->seqs + seq_nr)->length = get_seq_length((seq_infop->seqs + seq_nr)->seq_1);      }      k++;    }    else if(c == '#') {      fgets(line, MAX_LINE, seqfile);      cur_line = line;      while(*cur_line == '#') {	cur_line++;      }      while(*cur_line != '+') {	letter_val = strtod(cur_line, &cur_line);	seqsp->letter[0] = 'X';	seqsp->letter[1] = '\0';	seqsp->label = '.';	seqsp->cont_letter = letter_val;	seqsp++;	if(*cur_line != ';') {	  printf("Strange continuous sequence file format\n");	}	else {	  cur_line = cur_line + 1;	}      }      seqsp->letter[0] = '\0';      seqsp++;      strcpy((seq_infop->seqs + seq_nr)->name, "\0");      if(k == 0) {	(seq_infop->seqs + seq_nr)->length = get_seq_length((seq_infop->seqs + seq_nr)->seq_1);      }      k++;    }  }  #ifdef DEBUG_SEQ_STD  printf("exiting read_seqs\n");#endif  }/* Note: msa_seq_infop->seq and msa_seq_infop->gaps will be allocated here but must be * freed by caller */void get_sequences_msa_std_multi(FILE *seqfile, FILE *priorfile_a1, struct msa_sequences_multi_s *msa_seq_infop,				 struct hmm_multi_s *hmmp, int lead_seq, struct replacement_letter_multi_s *replacement_letters){  int i,j,k,l,m;  int done, inside_seq, seq_pos, letter_pos, a_index, nr_seqs, cur_pos, cur_seq;  int msa_seq_length, msa_length, seq_length, longest_seq, longest_seq_length;  int seq_index, nr_lead_columns;  int gaps_per_column, tot_nr_gaps, nr_gaps;  double occurences_per_column;  int get_letter_columns, get_query_letters;  int use_priordistribution_1, use_priordistribution_2, use_priordistribution_3, use_priordistribution_4;  char c;  struct letter_s cur_letter;  int **cur_posp;  struct emission_dirichlet_s em_di_1, em_di_2, em_di_3, em_di_4;  int is_first;  int use_replacement_letters;  int is_empty;  char last;  int nr_alphabets, nr_alphabets_temp, nr_continuous_alphabets, nr_continuous_alphabets_temp;  char line[MAX_LINE], *cur_line;  double letter_val;  int read_priorfile;    /* find out length of alignment and allocate memory for probability matrix by   * reading all sequence rows and remembering the longest */  tot_nr_gaps = 0;  done = NO;  inside_seq = NO;  msa_seq_length = 0;  nr_seqs = 0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -