📄 readseqs_multialpha.c
字号:
msa_length = 0; seq_length = 0; seq_index = 0; longest_seq = -1; longest_seq_length = 0; is_first = YES; while(1) { i = fgetc(seqfile); if(i == '<') { break; } else if(i == '#') { break; } else if(i == '\s' && i == '\n') { } else { printf("Sequence file does not seem to be in correct format\n"); exit(0); } } rewind(seqfile); if(priorfile_a1 == NULL) { use_priordistribution_1 = NONE; use_priordistribution_2 = NONE; use_priordistribution_3 = NONE; use_priordistribution_4 = NONE; } else { read_priorfile = read_multi_prior_file_multi(&em_di_1, hmmp, priorfile_a1, 1); if(read_priorfile > 0) { use_priordistribution_1 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_1 = NONE; } if(hmmp->nr_alphabets > 1) { read_priorfile = read_multi_prior_file_multi(&em_di_2, hmmp, priorfile_a1, 2); if(read_priorfile > 0) { use_priordistribution_2 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_2 = NONE; } } if(hmmp->nr_alphabets > 2) { read_priorfile = read_multi_prior_file_multi(&em_di_3, hmmp, priorfile_a1, 3); if(read_priorfile > 0) { use_priordistribution_3 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_3 = NONE; } } if(hmmp->nr_alphabets > 3) { read_priorfile = read_multi_prior_file_multi(&em_di_4, hmmp, priorfile_a1, 4); if(read_priorfile > 0) { use_priordistribution_4 = DIRICHLET; } else if(read_priorfile < 0) { printf("Error: Incorrect priorfile format\n"); exit(0); } else { use_priordistribution_4 = NONE; } } } if(replacement_letters->nr_alphabets == 0) { use_replacement_letters = NO; } else { use_replacement_letters = YES; } /* check if file is empty */ is_empty = YES; while(done != YES) { c = (char)fgetc(seqfile); if((int)c == EOF) { break; } else if (c == '<' || c == '#') { is_empty = NO; break; } } if(is_empty == YES) { if(verbose == YES) { printf("File is empty\n"); } } else { } rewind(seqfile); last = '!'; nr_alphabets = 0; nr_alphabets_temp = 0; nr_continuous_alphabets = 0; nr_continuous_alphabets_temp = 0; while(done != YES) { c = (char)fgetc(seqfile); if((int)c == EOF) { done = YES; continue; } if(c == '<' && last != '<') { inside_seq = YES; seq_length = 0; if(is_first == YES) { msa_seq_length = 0; } seq_index++; nr_seqs++; last = '<'; nr_alphabets_temp = 1; } else if(c == '#' && last != '#') { inside_seq = YES; seq_length = 0; if(is_first == YES) { msa_seq_length = 0; } seq_index++; last = '#'; nr_alphabets_temp = 1; } else if(c == '<') { nr_alphabets_temp++; } else if(c == '#') { nr_alphabets_temp++; nr_continuous_alphabets_temp++; } else if(c == '>') { if(seq_length > longest_seq_length) { longest_seq = seq_index; longest_seq_length = seq_length; } inside_seq = NO; is_first = NO; if(nr_alphabets_temp > nr_alphabets) { nr_alphabets = nr_alphabets_temp; switch(nr_alphabets) { case 1: hmmp->alphabet_type = DISCRETE; break; case 2: hmmp->alphabet_type_2 = DISCRETE; break; case 3: hmmp->alphabet_type_3 = DISCRETE; break; case 4: hmmp->alphabet_type_4 = DISCRETE; break; } } nr_alphabets_temp = 0; last = '!'; } else if(c == '+') { if(seq_length > longest_seq_length) { longest_seq = seq_index; longest_seq_length = seq_length; } inside_seq = NO; is_first = NO; if(nr_alphabets_temp > nr_alphabets) { nr_alphabets = nr_alphabets_temp; switch(nr_alphabets) { case 1: hmmp->alphabet_type = CONTINUOUS; break; case 2: hmmp->alphabet_type_2 = CONTINUOUS; break; case 3: hmmp->alphabet_type_3 = CONTINUOUS; break; case 4: hmmp->alphabet_type_4 = CONTINUOUS; break; } } if(nr_continuous_alphabets_temp > nr_continuous_alphabets) { nr_continuous_alphabets = nr_continuous_alphabets_temp; } nr_alphabets_temp = 0; nr_continuous_alphabets_temp = 0; last = '!'; } if(c == ';' && inside_seq == YES) { if(is_first == YES) { msa_seq_length++; } seq_length++; last = '!'; } else if(inside_seq == YES && (c == '_' || c == ' ' || c == '-' || c == '.')) { seq_length--; last = '!'; } } nr_seqs = nr_seqs / (nr_alphabets - nr_continuous_alphabets); msa_seq_infop->nr_seqs = nr_seqs;#ifdef DEBUG_MSA_SEQ_STD printf("reached checkpoint 1\n"); printf("msa_seq_length = %d\n", msa_seq_length); printf("nr_alphabets = %d\n", nr_alphabets); printf("longest seq = %d\n", longest_seq); printf("longest_seq_length = %d\n", longest_seq_length); printf("nr_seqs = %d\n", nr_seqs);#endif msa_seq_infop->msa_seq_1 = (struct msa_letter_s*) malloc_or_die(msa_seq_length * (hmmp->a_size+1) * sizeof(struct msa_letter_s)); msa_seq_infop->lead_columns_start = (int*)malloc_or_die((msa_seq_length+1) * sizeof(int)); if(hmmp->nr_alphabets > 1){ msa_seq_infop->msa_seq_2 = (struct msa_letter_s*) malloc_or_die(msa_seq_length * (hmmp->a_size_2+1) * sizeof(struct msa_letter_s)); } if(hmmp->nr_alphabets > 2){ msa_seq_infop->msa_seq_3 = (struct msa_letter_s*) malloc_or_die(msa_seq_length * (hmmp->a_size_3+1) * sizeof(struct msa_letter_s)); } if(hmmp->nr_alphabets > 3){ msa_seq_infop->msa_seq_4 = (struct msa_letter_s*) malloc_or_die(msa_seq_length * (hmmp->a_size_4+1) * sizeof(struct msa_letter_s)); } /* read first alphabet, save nr of occurences of each letter in each position, * including nr of gaps */ rewind(seqfile); seq_pos = 0; inside_seq = NO; seq_index = 0; get_letter_columns = NO; k = 0; l = 0; nr_lead_columns = 0; if(lead_seq == LONGEST_SEQ) { lead_seq = longest_seq; } for(m = 0; m < nr_seqs;) { i = fgetc(seqfile); if(i == EOF) { break; } c = (char)i; if(c == '<') { seq_pos = 0; inside_seq = YES; seq_index++; if(seq_index == lead_seq) { get_letter_columns = YES; get_query_letters = YES; } } else if(c == '#') { get_letter_columns = YES; fgets(line, MAX_LINE, seqfile); cur_line = line; while(*cur_line == '#') { cur_line++; } while(*cur_line != '+') { if(*cur_line == '_' || *cur_line == ' ' || (*cur_line == '-' && *(cur_line + 1) == ';') || *cur_line == '.') /* gap */ { (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos, 0, hmmp->a_size+1))->share = 0.0; (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos, 0, hmmp->a_size+1))->nr_occurences = 0.0; cur_line++; } else if(*cur_line == 'X') { (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos, 0, hmmp->a_size+1))->share = 0.0; (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos, 0, hmmp->a_size+1))->nr_occurences = -1.0; cur_line++; } else { letter_val = strtod(cur_line, &cur_line); (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos,0, hmmp->a_size+1))->share = letter_val; (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos,0, hmmp->a_size+1))->nr_occurences = 1.0; if(get_letter_columns == YES) { *(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */ k++; nr_lead_columns++; } } if(*cur_line != ';') { printf("Strange continuous sequence file format\n"); exit(0); } else { cur_line = cur_line + 1; seq_pos++; } } get_letter_columns = NO; break; } else if(c == '>') { inside_seq = NO; get_letter_columns = NO; get_query_letters = NO; m++; } else if(inside_seq == YES) { /* reading one letter */ letter_pos = 0; cur_letter.letter[letter_pos] = c; letter_pos++; while((c = (char)fgetc(seqfile)) != ';') { cur_letter.letter[letter_pos] = c; letter_pos++; } if(letter_pos > 4) { printf("max letter size = 4 characters\n"); exit(0); } cur_letter.letter[letter_pos] = '\0'; if(seq_pos + 1 > msa_length) { msa_length = seq_pos + 1; /* update nr of positions */ } if(cur_letter.letter[0] == '_' || cur_letter.letter[0] == ' ' || cur_letter.letter[0] == '-' || cur_letter.letter[0] == '.') /* gap */ { (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos, hmmp->a_size, hmmp->a_size+1))->nr_occurences += 1; seq_pos++; } else /* non gap character */ { if(use_replacement_letters == YES && replacement_letter_multi(&cur_letter, replacement_letters, msa_seq_infop, hmmp, seq_pos,1) == YES){ /* adding of nr_occurences for these letters is done on the fly in replacement_letter-function */ } else /* add occurence of this letter */{ a_index = get_alphabet_index(&cur_letter, hmmp->alphabet, hmmp->a_size); //printf("a_index = %d\n", a_index); if(a_index < 0) { printf("Could not read msa seq from file: letter '%s' is not in alphabet\n", cur_letter.letter); exit(0); } else { (msa_seq_infop->msa_seq_1 + get_mtx_index(seq_pos,a_index, hmmp->a_size+1))->nr_occurences += 1; } } if(get_letter_columns == YES) { *(msa_seq_infop->lead_columns_start + k) = seq_pos; /* letter column */ k++; nr_lead_columns++; } seq_pos++; } if(get_query_letters == YES) { memcpy((msa_seq_infop->msa_seq_1 + l * (hmmp->a_size + 1))->query_letter, &(cur_letter.letter), sizeof(char) * 5); /* query letter */ l++; } } //printf("m = %d\n",m); } /* read second alphabet, save nr of occurences of each letter in each position, * including nr of gaps */ if(hmmp->nr_alphabets > 1) { seq_pos = 0; inside_seq = NO; seq_index = 0; get_letter_columns = NO; l = 0; last = '!'; for(m = 0; m < nr_seqs;) { i = fgetc(seqfile); if(i == EOF) { break; } c = (char)i; //printf("alpha2:c = %c\n",c); if(c == '<' && last != '<') { seq_pos = 0; inside_seq = YES; seq_index++; if(seq_index == lead_seq) { get_query_letters = YES; } last = c; } else if(c == '<') { last = c; } else if(c == '#') { fgets(line, MAX_LINE, seqfile); cur_line = line; while(*cur_line == '#') { cur_line++; } while(*cur_line != '+') { if(*cur_line == '_' || *cur_line == ' ' || (*cur_line == '-' && *(cur_line + 1) == ';') || *cur_line == '.') /* gap */ { (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos, 0, hmmp->a_size_2+1))->share = 0.0; (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos, 0, hmmp->a_size_2+1))->nr_occurences = 0.0; cur_line++; } else if(*cur_line == 'X') { (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos, 0, hmmp->a_size_2+1))->share = 0.0; (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos, 0, hmmp->a_size_2+1))->nr_occurences = -1.0; cur_line++; } else { letter_val = strtod(cur_line, &cur_line); (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos,0, hmmp->a_size_2+1))->share = letter_val; (msa_seq_infop->msa_seq_2 + get_mtx_index(seq_pos,0, hmmp->a_size_2+1))->nr_occurences = 1.0; } if(*cur_line != ';') { printf("Strange continuous sequence file format\n"); exit(0); } else { cur_line = cur_line + 1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -