📄 sequence.c
字号:
sint i,j; sint no_seqs; static sint l1; static Boolean dnaflag1; if(usemenu) getstr("Enter the name of the sequence file",line); else strcpy(line,seqname); if(*line == EOS) return -1; if ((sscanf(line,"file://%s",fileName) == 1 )) { strcpy(line,fileName); } if((fin=fopen(line,"r"))==NULL) { error("Could not open sequence file (%s) ",line); return -1; /* DES -1 => file not found */ } strcpy(seqname,line); no_seqs=0; check_infile(&no_seqs); info("Sequence format is %s",formatNames[seqFormat]); if(seqFormat==NEXUS) error("Cannot read nexus format");/* DES DEBUG fprintf(stdout,"\n\n File name = %s\n\n",seqname);*/ if(no_seqs == 0) return 0; /* return the number of seqs. (zero here)*//* if((no_seqs + first_seq -1) > MAXN) { error("Too many sequences. Maximum is %d",(pint)MAXN); return 0; }*//* DES *//* if(seqFormat == CLUSTAL) { info("no of sequences = %d",(pint)no_seqs); return no_seqs; }*/ max_aln_length = 0;/* if this is a multiple alignment, or profile 1 - free any memory usedby previous alignments, then allocate memory for the new alignment */ if(first_seq == 1) { max_names = 0; free_aln(nseqs); alloc_aln(no_seqs); }/* otherwise, this is a profile 2, and we need to reallocate the arrays,leaving the data for profile 1 intact */ else realloc_aln(first_seq,no_seqs); for(i=1;i<first_seq;i++) { if(seqlen_array[i]>max_aln_length) max_aln_length=seqlen_array[i]; if(strlen(names[i])>max_names) max_names=strlen(names[i]); } for(i=first_seq;i<=first_seq+no_seqs-1;i++) { /* get the seqs now*/ output_index[i] = i; /* default output order */ if(seqFormat == CLUSTAL) seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1); else if(seqFormat == MSF) seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1); else seq1=get_seq(sname1,&l1,title); if(seq1==NULL) break;/* JULIE *//* Set max length of dynamically allocated arrays in prfalign.c */ if (l1 > max_aln_length) max_aln_length = l1; seqlen_array[i]=l1; /* store the length */ strcpy(names[i],sname1); /* " " name */ strcpy(titles[i],title); /* " " title */ if(!explicit_dnaflag) { dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */ if(i == 1) dnaflag = dnaflag1; } /* type decided by first seq*/ else dnaflag1 = dnaflag; alloc_seq(i,l1); if(dnaflag) n_encode(seq1,seq_array[i],l1); /* encode the sequence*/ else /* as ints */ p_encode(seq1,seq_array[i],l1); if(seq1!=NULL) seq1=ckfree(seq1); } max_aln_length *= 2;/* JULIE check sequence names are all different - otherwise phylip tree is confused.*/ for(i=1;i<=first_seq+no_seqs-1;i++) { for(j=i+1;j<=first_seq+no_seqs-1;j++) { if (strncmp(names[i],names[j],MAXNAMES) == 0) { error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES); return 0; } } } for(i=first_seq;i<=first_seq+no_seqs-1;i++) { if(seqlen_array[i]>max_aln_length) max_aln_length=seqlen_array[i]; } /* look for a feature table / gap penalty mask (only if this is a profile) */ if (profile_no > 0) { rewind(fin); struct_penalties = NONE; gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char)); if (seqFormat == CLUSTAL) { get_clustal_ss(max_aln_length); } else if (seqFormat == GDE) { get_gde_ss(max_aln_length); } else if (seqFormat == EMBLSWISS) { get_embl_ss(max_aln_length); } else if (seqFormat == RSF) { get_rsf_ss(max_aln_length); } } for(i=first_seq;i<=first_seq+no_seqs-1;i++) { if(strlen(names[i])>max_names) max_names=strlen(names[i]); } if(max_names<10) max_names=10; fclose(fin); return no_seqs; /* return the number of seqs. read in this call */}static Boolean check_dnaflag(char *seq, sint slen)/* check if DNA or Protein The decision is based on counting all A,C,G,T,U or N. If >= 85% of all characters (except -) are as above => DNA */{ sint i, c, nresidues, nbases; float ratio; char *dna_codes="ACGTUN"; nresidues = nbases = 0; for(i=1; i <= slen; i++) { if(seq[i] != '-') { nresidues++; if(seq[i] == 'N') nbases++; else { c = res_index(dna_codes, seq[i]); if(c >= 0) nbases++; } } } if( (nbases == 0) || (nresidues == 0) ) return FALSE; ratio = (float)nbases/(float)nresidues;/* DES fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n", (pint)nbases,(pint)nresidues,(pint)ratio); */ if(ratio >= 0.85) return TRUE; else return FALSE;}static void check_infile(sint *nseqs){ char line[MAXLINE+1]; sint i; *nseqs=0; while (fgets(line,MAXLINE+1,fin) != NULL) { if(!blankline(line)) break; } for(i=strlen(line)-1;i>=0;i--) if(isgraph(line[i])) break; line[i+1]=EOS; for(i=0;i<=6;i++) line[i] = toupper(line[i]); if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */ seqFormat=EMBLSWISS; (*nseqs)++; } else if( linetype(line,"CLUSTAL") ) { seqFormat=CLUSTAL; } else if( linetype(line,"PILEUP") ) { seqFormat = MSF; } else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) { seqFormat = MSF; dnaflag = FALSE; } else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) { seqFormat = MSF; dnaflag = TRUE; } else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' && line[strlen(line)-2]=='.' ) { seqFormat = MSF; } else if( linetype(line,"!!RICH_SEQUENCE") ) { seqFormat = RSF; } else if( linetype(line,"#NEXUS") ) { seqFormat=NEXUS; return; } else if(*line == '>') { /* no */ seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */ (*nseqs)++; } else if((*line == '"') || (*line == '%') || (*line == '#')) { seqFormat=GDE; /* GDE format */ if (*line == '%') { (*nseqs)++; dnaflag = FALSE; } else if (*line == '#') { (*nseqs)++; dnaflag = TRUE; } } else { seqFormat=UNKNOWN; return; } while(fgets(line,MAXLINE+1,fin) != NULL) { switch(seqFormat) { case EMBLSWISS: if( linetype(line,"ID") ) (*nseqs)++; break; case PIR: *nseqs = count_pir_seqs(); fseek(fin,0,0); return; case PEARSON: if( *line == '>' ) (*nseqs)++; break; case GDE: if(( *line == '%' ) && ( dnaflag == FALSE)) (*nseqs)++; else if (( *line == '#') && ( dnaflag == TRUE)) (*nseqs)++; break; case CLUSTAL: *nseqs = count_clustal_seqs();/* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */ fseek(fin,0,0); return; case MSF: *nseqs = count_msf_seqs(); fseek(fin,0,0); return; case RSF: fseek(fin,0,0); *nseqs = count_rsf_seqs(); fseek(fin,0,0); return; case USER: default: break; } } fseek(fin,0,0);}static sint count_pir_seqs(void)/* count the number of sequences in a pir alignment file */{ char line[MAXLINE+1],c; sint nseqs, i; Boolean seq_ok; seq_ok = FALSE; while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */ if(*line == '>') break; for(i=0;seq_ok == FALSE;i++) { c=line[i]; if(c == '*') { seq_ok = TRUE; /* ok - end of sequence found */ break; } /* EOL */ if(c == '\n' || c == EOS) break; /* EOL */ } if (seq_ok == TRUE) break; } if (seq_ok == FALSE) { error("PIR format sequence end marker '*'\nmissing for one or more sequences."); return (sint)0; /* funny format*/ } nseqs = 1; while (fgets(line,MAXLINE+1,fin) != NULL) { if(*line == '>') { /* Look for start of next seq */ seq_ok = FALSE; while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */ if(*line == '>') { error("PIR format sequence end marker '*' missing for one or more sequences."); return (sint)0; /* funny format*/ } for(i=0;seq_ok == FALSE;i++) { c=line[i]; if(c == '*') { seq_ok = TRUE; /* ok - sequence found */ break; } /* EOL */ if(c == '\n' || c == EOS) break; /* EOL */ } if (seq_ok == TRUE) { nseqs++; break; } } } } return (sint)nseqs;}static sint count_clustal_seqs(void)/* count the number of sequences in a clustal alignment file */{ char line[MAXLINE+1]; sint nseqs; while (fgets(line,MAXLINE+1,fin) != NULL) { if(!cl_blankline(line)) break; /* Look for next non- */ } /* blank line */ nseqs = 1; while (fgets(line,MAXLINE+1,fin) != NULL) { if(cl_blankline(line)) return nseqs; nseqs++; } return (sint)0; /* if you got to here-funny format/no seqs.*/}static sint count_msf_seqs(void){/* count the number of sequences in a PILEUP alignment file */ char line[MAXLINE+1]; sint nseqs; while (fgets(line,MAXLINE+1,fin) != NULL) { if(linetype(line,"//")) break; } while (fgets(line,MAXLINE+1,fin) != NULL) { if(!blankline(line)) break; /* Look for next non- */ } /* blank line */ nseqs = 1; while (fgets(line,MAXLINE+1,fin) != NULL) { if(blankline(line)) return nseqs; nseqs++; } return (sint)0; /* if you got to here-funny format/no seqs.*/}static sint count_rsf_seqs(void){/* count the number of sequences in a GCG RSF alignment file */ char line[MAXLINE+1]; sint nseqs; nseqs = 0;/* skip the comments */ while (fgets(line,MAXLINE+1,fin) != NULL) { if(line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.') break; } while (fgets(line,MAXLINE+1,fin) != NULL) { if( *line == '{' ) nseqs++; } return (sint)nseqs;}static void p_encode(char *seq, char *naseq, sint l){ /* code seq as ints .. use gap_pos2 for gap */ register sint i;/* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/ for(i=1;i<=l;i++) if(seq[i] == '-') naseq[i] = gap_pos2; else naseq[i] = res_index(amino_acid_codes,seq[i]); naseq[i] = -3;}static void n_encode(char *seq,char *naseq,sint l){ /* code seq as ints .. use gap_pos2 for gap */ register sint i;/* static char *nucs="ACGTU"; */ for(i=1;i<=l;i++) { if(seq[i] == '-') /* if a gap character -> code = gap_pos2 */ naseq[i] = gap_pos2; /* this is the code for a gap in */ else { /* the input files */ naseq[i]=res_index(amino_acid_codes,seq[i]); } } naseq[i] = -3;}static sint res_index(char *t,char c){ register sint i; for(i=0;t[i] && t[i] != c;i++) ; if(t[i]) return(i); else return -1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -