📄 sequence.c
字号:
/********* Sequence input routines for CLUSTAL W *******************//* DES was here. FEB. 1994 *//* Now reads PILEUP/MSF and CLUSTAL alignment files */#include <stdio.h>#include <string.h>#include <ctype.h>#include <stdlib.h>#include "clustalw.h" #define MIN(a,b) ((a)<(b)?(a):(b))/** Prototypes*/static char * get_seq(char *,sint *,char *);static char * get_clustal_seq(char *,sint *,char *,sint);static char * get_msf_seq(char *,sint *,char *,sint);static void check_infile(sint *);static void p_encode(char *, char *, sint);static void n_encode(char *, char *, sint);static sint res_index(char *,char);static Boolean check_dnaflag(char *, sint);static sint count_clustal_seqs(void);static sint count_pir_seqs(void);static sint count_msf_seqs(void);static sint count_rsf_seqs(void);static void get_swiss_feature(char *line,sint len);static void get_rsf_feature(char *line,sint len);static void get_swiss_mask(char *line,sint len);static void get_clustal_ss(sint length);static void get_embl_ss(sint length);static void get_rsf_ss(sint length);static void get_gde_ss(sint length);static Boolean cl_blankline(char *line);/* * Global variables */extern sint max_names;FILE *fin;extern Boolean usemenu, dnaflag, explicit_dnaflag;extern Boolean interactive;extern char seqname[];extern sint nseqs;extern sint *seqlen_array;extern sint *output_index;extern char **names,**titles;extern char **seq_array;extern Boolean profile1_empty, profile2_empty;extern sint gap_pos2;extern sint max_aln_length;extern char *gap_penalty_mask, *sec_struct_mask;extern sint struct_penalties;extern char *ss_name;extern sint profile_no;extern sint debug;char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */static sint seqFormat;static char chartab[128];static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR", "Pearson","GDE","Clustal","Pileup/MSF","RSF","USER","PHYLIP","NEXUS"};void fill_chartab(void) /* Create translation and check table */{ register sint i; register char c; for(i=0;i<128;chartab[i++]=0); for(i=0;(c=amino_acid_codes[i]);i++) chartab[(int)c]=chartab[tolower(c)]=c;}static char * get_msf_seq(char *sname,sint *len,char *tit,sint seqno)/* read the seqno_th. sequence from a PILEUP multiple alignment file */{ static char line[MAXLINE+1]; char *seq = NULL; sint i,j,k; unsigned char c; fseek(fin,0,0); /* start at the beginning */ *len=0; /* initialise length to zero */ for(i=0;;i++) { if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title*/ if(linetype(line,"//") ) break; /* lines...ignore*/ } while (fgets(line,MAXLINE+1,fin) != NULL) { if(!blankline(line)) { for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin); for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break; for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break; strncpy(sname,line+j,MIN(MAXNAMES,k-j)); sname[MIN(MAXNAMES,k-j)]=EOS; rtrim(sname); blank_to_(sname); if(seq==NULL) seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); else seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); for(i=k;i<=MAXLINE;i++) { c=line[i]; if(c == '.' || c == '~' ) c = '-'; if(c == '*') c = 'X'; if(c == '\n' || c == EOS) break; /* EOL */ c=chartab[c]; if(c) seq[++(*len)]=c; } for(i=0;;i++) { if(fgets(line,MAXLINE+1,fin)==NULL) return seq; if(blankline(line)) break; } } } return seq;}static Boolean cl_blankline(char *line){ int i; if (line[0] == '!') return TRUE; for(i=0;line[i]!='\n' && line[i]!=EOS;i++) { if( isdigit(line[i]) || isspace(line[i]) || (line[i] == '*') || (line[i] == ':') || (line[i] == '.')) ; else return FALSE; } return TRUE;}static char * get_clustal_seq(char *sname,sint *len,char *tit,sint seqno)/* read the seqno_th. sequence from a clustal multiple alignment file */{ static char line[MAXLINE+1]; static char tseq[MAXLINE+1]; char *seq = NULL; sint i,j; unsigned char c; fseek(fin,0,0); /* start at the beginning */ *len=0; /* initialise length to zero */ fgets(line,MAXLINE+1,fin); /* read the title line...ignore it */ while (fgets(line,MAXLINE+1,fin) != NULL) { if(!cl_blankline(line)) { for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin); for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break; sscanf(line,"%s%s",sname,tseq); for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; sname[j]=EOS; rtrim(sname); blank_to_(sname); if(seq==NULL) seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); else seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); for(i=0;i<=MAXLINE;i++) { c=tseq[i]; /*if(c == '\n' || c == EOS) break;*/ /* EOL */ if(isspace(c) || c == EOS) break; /* EOL */ c=chartab[c]; if(c) seq[++(*len)]=c; } for(i=0;;i++) { if(fgets(line,MAXLINE+1,fin)==NULL) return seq; if(cl_blankline(line)) break; } } } return seq;}static void get_clustal_ss(sint length)/* read the structure data from a clustal multiple alignment file */{ static char title[MAXLINE+1]; static char line[MAXLINE+1]; static char lin2[MAXLINE+1]; static char tseq[MAXLINE+1]; static char sname[MAXNAMES+1]; sint i,j,len,ix,struct_index=0; char c; fseek(fin,0,0); /* start at the beginning */ len=0; /* initialise length to zero */ if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the title line...ignore it */ if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the next line... *//* skip any blank lines */ for (;;) { if(fgets(line,MAXLINE+1,fin)==NULL) return; if(!blankline(line)) break; }/* look for structure table lines */ ix = -1; for(;;) { if(line[0] != '!') break; if(strncmp(line,"!SS",3) == 0) { ix++; sscanf(line+4,"%s%s",sname,tseq); for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; sname[j]=EOS; rtrim(sname); blank_to_(sname); if (interactive) { strcpy(title,"Found secondary structure in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); } else (*lin2) = 'y'; if ((*lin2 != 'n') && (*lin2 != 'N')) { struct_penalties = SECST; struct_index = ix; for (i=0;i<length;i++) { sec_struct_mask[i] = '.'; gap_penalty_mask[i] = '.'; } strcpy(ss_name,sname); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) sec_struct_mask[len++] = c; } } } else if(strncmp(line,"!GM",3) == 0) { ix++; sscanf(line+4,"%s%s",sname,tseq); for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; sname[j]=EOS; rtrim(sname); blank_to_(sname); if (interactive) { strcpy(title,"Found gap penalty mask in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); } else (*lin2) = 'y'; if ((*lin2 != 'n') && (*lin2 != 'N')) { struct_penalties = GMASK; struct_index = ix; for (i=0;i<length;i++) gap_penalty_mask[i] = '1'; strcpy(ss_name,sname); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) gap_penalty_mask[len++] = c; } } } if (struct_penalties != NONE) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } if (struct_penalties == NONE) return; /* skip any more comment lines */ while (line[0] == '!') { if(fgets(line,MAXLINE+1,fin)==NULL) return; }/* skip the sequence lines and any comments after the alignment */ for (;;) { if(isspace(line[0])) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } /* read the rest of the alignment */ for (;;) {/* skip any blank lines */ for (;;) { if(!blankline(line)) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; }/* get structure table line */ for(ix=0;ix<struct_index;ix++) { if (line[0] != '!') { if(struct_penalties == SECST) error("bad secondary structure format"); else error("bad gap penalty mask format"); struct_penalties = NONE; return; } if(fgets(line,MAXLINE+1,fin)==NULL) return; } if(struct_penalties == SECST) { if (strncmp(line,"!SS",3) != 0) { error("bad secondary structure format"); struct_penalties = NONE; return; } sscanf(line+4,"%s%s",sname,tseq); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) sec_struct_mask[len++] = c; } } else if (struct_penalties == GMASK) { if (strncmp(line,"!GM",3) != 0) { error("bad gap penalty mask format"); struct_penalties = NONE; return; } sscanf(line+4,"%s%s",sname,tseq); for(i=0;len < length;i++) { c = tseq[i]; if(c == '\n' || c == EOS) break; /* EOL */ if (!isspace(c)) gap_penalty_mask[len++] = c; } }/* skip any more comment lines */ while (line[0] == '!') { if(fgets(line,MAXLINE+1,fin)==NULL) return; }/* skip the sequence lines */ for (;;) { if(isspace(line[0])) break; if(fgets(line,MAXLINE+1,fin)==NULL) return; } }}static void get_embl_ss(sint length){ static char title[MAXLINE+1]; static char line[MAXLINE+1]; static char lin2[MAXLINE+1]; static char sname[MAXNAMES+1]; char feature[MAXLINE+1]; sint i;/* find the start of the sequence entry */ for (;;) { while( !linetype(line,"ID") ) if (fgets(line,MAXLINE+1,fin) == NULL) return; for(i=5;i<=strlen(line);i++) /* DES */ if(line[i] != ' ') break; strncpy(sname,line+i,MAXNAMES); /* remember entryname */ for(i=0;i<=strlen(sname);i++) if(sname[i] == ' ') { sname[i]=EOS; break; } sname[MAXNAMES]=EOS; rtrim(sname); blank_to_(sname); /* look for secondary structure feature table / gap penalty mask */ while(fgets(line,MAXLINE+1,fin) != NULL) { if (linetype(line,"FT")) { sscanf(line+2,"%s",feature); if (strcmp(feature,"HELIX") == 0 || strcmp(feature,"STRAND") == 0) { if (interactive) { strcpy(title,"Found secondary structure in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); } else (*lin2) = 'y'; if ((*lin2 != 'n') && (*lin2 != 'N')) { struct_penalties = SECST; for (i=0;i<length;i++) sec_struct_mask[i] = '.'; do { get_swiss_feature(&line[2],length); fgets(line,MAXLINE+1,fin); } while( linetype(line,"FT") ); } else { do { fgets(line,MAXLINE+1,fin); } while( linetype(line,"FT") ); } strcpy(ss_name,sname); } } else if (linetype(line,"GM")) { if (interactive) { strcpy(title,"Found gap penalty mask in alignment file: "); strcat(title,sname); (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); } else (*lin2) = 'y'; if ((*lin2 != 'n') && (*lin2 != 'N')) { struct_penalties = GMASK; for (i=0;i<length;i++) gap_penalty_mask[i] = '1'; do { get_swiss_mask(&line[2],length); fgets(line,MAXLINE+1,fin); } while( linetype(line,"GM") ); } else { do { fgets(line,MAXLINE+1,fin); } while( linetype(line,"GM") ); } strcpy(ss_name,sname); } if (linetype(line,"SQ")) break; if (struct_penalties != NONE) break; } } }static void get_rsf_ss(sint length){ static char title[MAXLINE+1]; static char line[MAXLINE+1]; static char lin2[MAXLINE+1]; static char sname[MAXNAMES+1]; sint i;/* skip the comments */ while (fgets(line,MAXLINE+1,fin) != NULL) { if(line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.') break; }/* find the start of the sequence entry */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -