📄 readseqs_multialpha.c
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include <math.h>#include <float.h>//#include <double.h>#include <limits.h>#include "structs.h"#include "funcs.h"#define MAX_LINE 60000#define DIRICHLET 12#define NONE -2#define LONGEST_SEQ -1 /* Note: this constant is also defined in hmmsearch_msa.c *///#define DEBUG_SEQ_STD//#define DEBUG_SEQ_FASTA//#define DEBUG_MSA_SEQ_STD//#define DEBUG_MSA_SEQ_PRF//#define DEBUG_PRIextern int verbose;int seqfile_has_labels(FILE *seqfile){ char row[30000]; rewind(seqfile); while(1) { if(fgets(row,30000,seqfile) != NULL) { if(row[0] == '/') { rewind(seqfile); return YES; } } else { rewind(seqfile); return NO; } } } void get_sequence_fasta_multi(FILE *seqfile, struct sequences_multi_s *seq_infop, int seq_nr){ int i,j,a; int nr_letters; int longest_seq, shortest_seq, avg_seq_len; int inside_seq; char c; struct letter_s *seqsp; nr_letters = 0; longest_seq = seq_infop->longest_seq; shortest_seq = seq_infop->shortest_seq; inside_seq = NO; while(1) { i = fgetc(seqfile); if(i == '>') { break; } else if(i == '\s' && i == '\n') { } else { printf("Sequence file does not seem to be in correct format\n"); exit(0); } } rewind(seqfile); /* Find out how much space to allocate for the sequence = count total number of letters*/ while((i = fgetc(seqfile)) != EOF) { c = (char)i; if(((i >= 65 && i <= 90) || (i >= 97 && i <= 122)) && inside_seq == YES) { nr_letters++; } else if(c == '>') { while((i = fgetc(seqfile)) != '\n') { if(i == EOF) { printf("Sequence file is not in correct FASTA format\n"); exit(0); } } inside_seq = YES; } else if(c == '/') { inside_seq = NO; } }#ifdef DEBUG_SEQ_FASTA printf("nr_letters = %d\n", nr_letters);#endif /* Allocate memory, must be freed by caller*/ if(nr_letters > seq_infop->longest_seq) { seq_infop->longest_seq = nr_letters; } if(nr_letters < seq_infop->shortest_seq) { seq_infop->shortest_seq = nr_letters; } seq_infop->avg_seq_len += nr_letters; (seq_infop->seqs + seq_nr)->seq_1 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s))); (seq_infop->seqs + seq_nr)->length = nr_letters; //printf("%x\n", (seq_infop->seqs + seq_nr)->seq_1); //printf("%x\n", nr_letters * 2 * sizeof(struct letter_s));/* Read sequences into memory */ seqsp = (struct letter_s*)((seq_infop->seqs + seq_nr)->seq_1); rewind(seqfile); inside_seq = NO; while((c = (char)(fgetc(seqfile))) != '>' && c != EOF) { } if(c == '>') { inside_seq = YES; while((c = (char)(fgetc(seqfile))) == ' ') { } /* read sequence name */ a = 0; (seq_infop->seqs + seq_nr)->name[a] = c; a++; while((c = (char)(fgetc(seqfile))) != ' ' && c != '\n') { if(a < MAX_SEQ_NAME_SIZE-1) { (seq_infop->seqs + seq_nr)->name[a] = c; a++; } } (seq_infop->seqs + seq_nr)->name[a] = '\0'; if(c != '\n') { while((c = (char)(fgetc(seqfile))) != '\n') { /* read rest of row and forget about it */ } } while((c = (char)(fgetc(seqfile))) != EOF) { if(c == '\n' || c == ' ') { } else if(c == '/') { break; } else if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122)) { if(c >= 97) { c = c - 32; //convert to upper case } (seqsp->letter)[0] = c; (seqsp->letter)[1] = '\0'; seqsp->label = '.'; seqsp++; //printf("%x\n", seqsp); } } } else { printf("Sequence file is not in correct FASTA format\n"); exit(0); }} void get_sequence_std_multi(FILE *seqfile, struct sequences_multi_s *seq_infop, struct hmm_multi_s *hmmp, int seq_nr){ int i,j,a,k,last; int nr_letters; int longest_seq, shortest_seq, avg_seq_len; int temp_seq_len; int inside_seq; int nr_alphabets, nr_alphabets_temp; char c; struct letter_s *seqsp; struct sequence_multi_s temp_seq; nr_letters = 0; inside_seq = NO; nr_alphabets = 0; nr_alphabets_temp = 0; last = '!'; char line[MAX_LINE], *cur_line; double letter_val; cur_line = line; while(fgets(line, MAX_LINE, seqfile) != NULL) { if(line[0] == '<' || line[0] == '#' || line[0] == '\s' || line[0] == '\n' || line[0] == '/') { } else { printf("Sequence file does not seem to be in correct format\n"); exit(0); } for(i = 0; line[i] != '\0'; i++) { if((line[i] == '>' || line[i] == '+') && i > 0 && line[i-1] != ';') { printf("Sequence file does not seem to be in correct format\n"); printf("All letters must be followed by ';'\n"); exit(0); } } } rewind(seqfile); /* Find out how much space to allocate for the sequences = count total number of letters*/ while((i = fgetc(seqfile)) != EOF) { c = (char)i; if(c == ';' && inside_seq == YES) { nr_letters++; last = '!'; } else if(c == '<') { inside_seq = YES; nr_letters = 0; if(last == '<') { nr_alphabets_temp++; } else { nr_alphabets_temp = 1; last = '<'; } } else if(c == '#') { inside_seq = YES; if(last == '#') { nr_alphabets_temp++; } else { nr_alphabets_temp = 1; last = '#'; } } else if((c == '>' || c == '+')&& inside_seq == YES) { inside_seq = NO; if(nr_alphabets_temp > nr_alphabets) { nr_alphabets = nr_alphabets_temp; nr_alphabets_temp = 0; } if(c == '>') { switch(nr_alphabets) { case 1: hmmp->alphabet_type = DISCRETE; break; case 2: hmmp->alphabet_type_2 = DISCRETE; break; case 3: hmmp->alphabet_type_3 = DISCRETE; break; case 4: hmmp->alphabet_type_4 = DISCRETE; break; } } if(c == '+') { switch(nr_alphabets) { case 1: hmmp->alphabet_type = CONTINUOUS; break; case 2: hmmp->alphabet_type_2 = CONTINUOUS; break; case 3: hmmp->alphabet_type_3 = CONTINUOUS; break; case 4: hmmp->alphabet_type_4 = CONTINUOUS; break; } } last = '!'; } else if(c == '/') { break; } else { last = '!'; } }#ifdef DEBUG_SEQ_STD printf("nr_letters = %d\n", nr_letters); printf("nr_alphabets = %d\n", nr_alphabets);#endif /* check if nr of alphabets in sequence file corresponds to nr of alphabets in hmm file */ if(hmmp->nr_alphabets < nr_alphabets) { printf("Warning: HMM has %d alphabets, while sequence file contains %d alphabets\n", hmmp->nr_alphabets, nr_alphabets); printf("Only the first %d alphabets will be read and used\n", hmmp->nr_alphabets); nr_alphabets = hmmp->nr_alphabets; } else if(hmmp->nr_alphabets > nr_alphabets) { printf("Error: HMM has %d alphabets, while sequence file contains %d alphabets\n", hmmp->nr_alphabets, nr_alphabets); exit(0); } /* Allocate memory, must be freed by caller*/ if(nr_letters > seq_infop->longest_seq) { seq_infop->longest_seq = nr_letters; } if(nr_letters < seq_infop->shortest_seq) { seq_infop->shortest_seq = nr_letters; } seq_infop->avg_seq_len += nr_letters; (seq_infop->seqs + seq_nr)->seq_1 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s))); if(nr_alphabets > 1) { (seq_infop->seqs + seq_nr)->seq_2 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s))); } if(nr_alphabets > 2) { (seq_infop->seqs + seq_nr)->seq_3 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s))); } if(nr_alphabets > 3) { (seq_infop->seqs + seq_nr)->seq_4 = (struct letter_s*)(malloc_or_die(nr_letters * 2 * sizeof(struct letter_s))); } (seq_infop->seqs + seq_nr)->length = nr_letters; /* Read sequences into memory */ rewind(seqfile); for(k = 0; k < nr_alphabets;) { if(k == 0) { seqsp = (seq_infop->seqs + seq_nr)->seq_1; } if(k == 1) { seqsp = (seq_infop->seqs + seq_nr)->seq_2; } if(k == 2) { seqsp = (seq_infop->seqs + seq_nr)->seq_3; } if(k == 3) { seqsp = (seq_infop->seqs + seq_nr)->seq_4; } a = 0; c = (char)(fgetc(seqfile)); if(c != '<' && c != '#') { /* not a sequence on this line, just continue */ if(c != '\n') { while((c = (char)(fgetc(seqfile))) != '\n') { } } continue; } else if(c == '<') /* this line is a sequence */ { while((c = (char)(fgetc(seqfile))) != '>') { if(c == '<') { } else if(c == ';') { seqsp->letter[a] = '\0'; seqsp->label = '.'; seqsp++; a = 0; } else { seqsp->letter[a] = c; a++; } } seqsp->letter[0] = '\0'; seqsp++; strcpy((seq_infop->seqs + seq_nr)->name, "\0"); if(k == 0) { (seq_infop->seqs + seq_nr)->length = get_seq_length((seq_infop->seqs + seq_nr)->seq_1); } k++; } else if(c == '#') { fgets(line, MAX_LINE, seqfile); cur_line = line; while(*cur_line == '#') { cur_line++; } while(*cur_line != '+') { letter_val = strtod(cur_line, &cur_line); seqsp->letter[0] = 'X'; seqsp->letter[1] = '\0'; seqsp->label = '.'; seqsp->cont_letter = letter_val; seqsp++; if(*cur_line != ';') { printf("Strange continuous sequence file format\n"); } else { cur_line = cur_line + 1; } } seqsp->letter[0] = '\0'; seqsp++; strcpy((seq_infop->seqs + seq_nr)->name, "\0"); if(k == 0) { (seq_infop->seqs + seq_nr)->length = get_seq_length((seq_infop->seqs + seq_nr)->seq_1); } k++; } } #ifdef DEBUG_SEQ_STD printf("exiting read_seqs\n");#endif }/* Note: msa_seq_infop->seq and msa_seq_infop->gaps will be allocated here but must be * freed by caller */void get_sequences_msa_std_multi(FILE *seqfile, FILE *priorfile_a1, struct msa_sequences_multi_s *msa_seq_infop, struct hmm_multi_s *hmmp, int lead_seq, struct replacement_letter_multi_s *replacement_letters){ int i,j,k,l,m; int done, inside_seq, seq_pos, letter_pos, a_index, nr_seqs, cur_pos, cur_seq; int msa_seq_length, msa_length, seq_length, longest_seq, longest_seq_length; int seq_index, nr_lead_columns; int gaps_per_column, tot_nr_gaps, nr_gaps; double occurences_per_column; int get_letter_columns, get_query_letters; int use_priordistribution_1, use_priordistribution_2, use_priordistribution_3, use_priordistribution_4; char c; struct letter_s cur_letter; int **cur_posp; struct emission_dirichlet_s em_di_1, em_di_2, em_di_3, em_di_4; int is_first; int use_replacement_letters; int is_empty; char last; int nr_alphabets, nr_alphabets_temp, nr_continuous_alphabets, nr_continuous_alphabets_temp; char line[MAX_LINE], *cur_line; double letter_val; int read_priorfile; /* find out length of alignment and allocate memory for probability matrix by * reading all sequence rows and remembering the longest */ tot_nr_gaps = 0; done = NO; inside_seq = NO; msa_seq_length = 0; nr_seqs = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -