📄 sequence.c

📁 生物序列比对程序clustw的源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	sint i,j;	sint no_seqs;	static sint l1;	static Boolean dnaflag1;		if(usemenu)		getstr("Enter the name of the sequence file",line);	else		strcpy(line,seqname);	if(*line == EOS) return -1;	if ((sscanf(line,"file://%s",fileName) == 1 )) {	  strcpy(line,fileName);	}	if((fin=fopen(line,"r"))==NULL) {		error("Could not open sequence file (%s) ",line);		return -1;      /* DES -1 => file not found */	}	strcpy(seqname,line);	no_seqs=0;	check_infile(&no_seqs);	info("Sequence format is %s",formatNames[seqFormat]);	if(seqFormat==NEXUS)		error("Cannot read nexus format");/* DES DEBUG 	fprintf(stdout,"\n\n File name = %s\n\n",seqname);*/	if(no_seqs == 0)		return 0;       /* return the number of seqs. (zero here)*//*	if((no_seqs + first_seq -1) > MAXN) {		error("Too many sequences. Maximum is %d",(pint)MAXN);		return 0;	}*//* DES *//*	if(seqFormat == CLUSTAL) {		info("no of sequences = %d",(pint)no_seqs);		return no_seqs;	}*/	max_aln_length = 0;/* if this is a multiple alignment, or profile 1 - free any memory usedby previous alignments, then allocate memory for the new alignment */	if(first_seq == 1) {		max_names = 0;		free_aln(nseqs);		alloc_aln(no_seqs);	}/* otherwise, this is a profile 2, and we need to reallocate the arrays,leaving the data for profile 1 intact */	else realloc_aln(first_seq,no_seqs);        for(i=1;i<first_seq;i++)	{                if(seqlen_array[i]>max_aln_length)                        max_aln_length=seqlen_array[i];		if(strlen(names[i])>max_names)			max_names=strlen(names[i]);	}	for(i=first_seq;i<=first_seq+no_seqs-1;i++) {    /* get the seqs now*/		output_index[i] = i;	/* default output order */		if(seqFormat == CLUSTAL) 			seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1);		else if(seqFormat == MSF)			    seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1);		else			seq1=get_seq(sname1,&l1,title);		if(seq1==NULL) break;/* JULIE *//*  Set max length of dynamically allocated arrays in prfalign.c */		if (l1 > max_aln_length) max_aln_length = l1;		seqlen_array[i]=l1;                   /* store the length */		strcpy(names[i],sname1);              /*    "   "  name   */		strcpy(titles[i],title);              /*    "   "  title  */		if(!explicit_dnaflag) {			dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */		        if(i == 1) dnaflag = dnaflag1;		}			/* type decided by first seq*/		else			dnaflag1 = dnaflag;		alloc_seq(i,l1);		if(dnaflag)			n_encode(seq1,seq_array[i],l1); /* encode the sequence*/		else					/* as ints  */			p_encode(seq1,seq_array[i],l1);		if(seq1!=NULL) seq1=ckfree(seq1);	}	max_aln_length *= 2;/*   JULIE   check sequence names are all different - otherwise phylip tree is    confused.*/	for(i=1;i<=first_seq+no_seqs-1;i++) {		for(j=i+1;j<=first_seq+no_seqs-1;j++) {			if (strncmp(names[i],names[j],MAXNAMES) == 0) {				error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES);				return 0;			}		}	}	for(i=first_seq;i<=first_seq+no_seqs-1;i++)	{		if(seqlen_array[i]>max_aln_length)			max_aln_length=seqlen_array[i];	}	/* look for a feature table / gap penalty mask (only if this is a profile) */	if (profile_no > 0) {		rewind(fin);		struct_penalties = NONE;    		gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));    		sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));    		ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char));		if (seqFormat == CLUSTAL) {			get_clustal_ss(max_aln_length);		}		else if (seqFormat == GDE) {			get_gde_ss(max_aln_length);		}		else if (seqFormat == EMBLSWISS) {			get_embl_ss(max_aln_length);		}		else if (seqFormat == RSF) {			get_rsf_ss(max_aln_length);		}	}	for(i=first_seq;i<=first_seq+no_seqs-1;i++)	{		if(strlen(names[i])>max_names)			max_names=strlen(names[i]);	}	if(max_names<10) max_names=10;	fclose(fin);				return no_seqs;    /* return the number of seqs. read in this call */}static Boolean check_dnaflag(char *seq, sint slen)/* check if DNA or Protein   The decision is based on counting all A,C,G,T,U or N.    If >= 85% of all characters (except -) are as above => DNA  */{	sint i, c, nresidues, nbases;	float ratio;	char *dna_codes="ACGTUN";		nresidues = nbases = 0;		for(i=1; i <= slen; i++) {		if(seq[i] != '-') {			nresidues++;			if(seq[i] == 'N')				nbases++;			else {				c = res_index(dna_codes, seq[i]);				if(c >= 0)					nbases++;			}		}	}	if( (nbases == 0) || (nresidues == 0) ) return FALSE;	ratio = (float)nbases/(float)nresidues;/* DES 	fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n",		(pint)nbases,(pint)nresidues,(pint)ratio); */	if(ratio >= 0.85) 		return TRUE;	else		return FALSE;}static void check_infile(sint *nseqs){	char line[MAXLINE+1];	sint i;		*nseqs=0;	while (fgets(line,MAXLINE+1,fin) != NULL) {		if(!blankline(line)) 			break;	}	for(i=strlen(line)-1;i>=0;i--)		if(isgraph(line[i])) break;	line[i+1]=EOS;        	for(i=0;i<=6;i++) line[i] = toupper(line[i]);	if( linetype(line,"ID") ) {					/* EMBL/Swiss-Prot format ? */		seqFormat=EMBLSWISS;		(*nseqs)++;	}        else if( linetype(line,"CLUSTAL") ) {		seqFormat=CLUSTAL;	} 	else if( linetype(line,"PILEUP") ) {		seqFormat = MSF;	} 	else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) {		seqFormat = MSF;		dnaflag = FALSE;	} 	else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) {		seqFormat = MSF;		dnaflag = TRUE;	} 	else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' &&                                 line[strlen(line)-2]=='.' ) {		seqFormat = MSF;	} 	else if( linetype(line,"!!RICH_SEQUENCE") ) {		seqFormat = RSF;	} 	else if( linetype(line,"#NEXUS") ) {		seqFormat=NEXUS;		return;	}	else if(*line == '>') {						/* no */		seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */		(*nseqs)++;	}	else if((*line == '"') || (*line == '%') || (*line == '#')) {		seqFormat=GDE; /* GDE format */		if (*line == '%') {                        (*nseqs)++;			dnaflag = FALSE;		}		else if (*line == '#') {			(*nseqs)++;			dnaflag = TRUE;		}	}	else {		seqFormat=UNKNOWN;		return;	}	while(fgets(line,MAXLINE+1,fin) != NULL) {		switch(seqFormat) {			case EMBLSWISS:				if( linetype(line,"ID") )					(*nseqs)++;				break;			case PIR:				*nseqs = count_pir_seqs();				fseek(fin,0,0);				return;			case PEARSON:                                if( *line == '>' )                                        (*nseqs)++;                                break;			case GDE:				if(( *line == '%' ) && ( dnaflag == FALSE))					(*nseqs)++;				else if (( *line == '#') && ( dnaflag == TRUE))					(*nseqs)++;				break;			case CLUSTAL:				*nseqs = count_clustal_seqs();/* DES */ 			/* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */				fseek(fin,0,0);				return;			case MSF:				*nseqs = count_msf_seqs();				fseek(fin,0,0);				return;			case RSF:				fseek(fin,0,0);				*nseqs = count_rsf_seqs();				fseek(fin,0,0);				return;			case USER:			default:				break;		}	}	fseek(fin,0,0);}static sint count_pir_seqs(void)/* count the number of sequences in a pir alignment file */{	char line[MAXLINE+1],c;	sint  nseqs, i;	Boolean seq_ok;	seq_ok = FALSE;	while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */		if(*line == '>') break;		for(i=0;seq_ok == FALSE;i++) {			c=line[i];			if(c == '*') {				seq_ok = TRUE;	/* ok - end of sequence found */				break;			}			/* EOL */			if(c == '\n' || c == EOS)				break;			/* EOL */		}		if (seq_ok == TRUE)			break;	}	if (seq_ok == FALSE) {		error("PIR format sequence end marker '*'\nmissing for one or more sequences.");		return (sint)0;	/* funny format*/	}			nseqs = 1;		while (fgets(line,MAXLINE+1,fin) != NULL) {		if(*line == '>') {		/* Look for start of next seq */			seq_ok = FALSE;			while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */				if(*line == '>') {					error("PIR format sequence end marker '*' missing for one or more sequences.");					return (sint)0;	/* funny format*/				}				for(i=0;seq_ok == FALSE;i++) {					c=line[i];					if(c == '*') {						seq_ok = TRUE;	/* ok - sequence found */						break;					}			/* EOL */					if(c == '\n' || c == EOS)						break;			/* EOL */				}				if (seq_ok == TRUE) {					nseqs++;					break;				}			}		}	}	return (sint)nseqs;}static sint count_clustal_seqs(void)/* count the number of sequences in a clustal alignment file */{	char line[MAXLINE+1];	sint  nseqs;	while (fgets(line,MAXLINE+1,fin) != NULL) {		if(!cl_blankline(line)) break;		/* Look for next non- */	}						/* blank line */	nseqs = 1;	while (fgets(line,MAXLINE+1,fin) != NULL) {		if(cl_blankline(line)) return nseqs;		nseqs++;	}	return (sint)0;	/* if you got to here-funny format/no seqs.*/}static sint count_msf_seqs(void){/* count the number of sequences in a PILEUP alignment file */	char line[MAXLINE+1];	sint  nseqs;	while (fgets(line,MAXLINE+1,fin) != NULL) {		if(linetype(line,"//")) break;	}	while (fgets(line,MAXLINE+1,fin) != NULL) {		if(!blankline(line)) break;		/* Look for next non- */	}						/* blank line */	nseqs = 1;	while (fgets(line,MAXLINE+1,fin) != NULL) {		if(blankline(line)) return nseqs;		nseqs++;	}	return (sint)0;	/* if you got to here-funny format/no seqs.*/}static sint count_rsf_seqs(void){/* count the number of sequences in a GCG RSF alignment file */	char line[MAXLINE+1];	sint  nseqs;	nseqs = 0;/* skip the comments */	while (fgets(line,MAXLINE+1,fin) != NULL) { 		if(line[strlen(line)-2]=='.' &&                                 line[strlen(line)-3]=='.')			break;	}	while (fgets(line,MAXLINE+1,fin) != NULL) {                if( *line == '{' )                      nseqs++;	}	return (sint)nseqs;}static void p_encode(char *seq, char *naseq, sint l){				/* code seq as ints .. use gap_pos2 for gap */	register sint i;/*	static char *aacids="CSTPAGNDEQHRKMILVFYW";*/		for(i=1;i<=l;i++)		if(seq[i] == '-')			naseq[i] = gap_pos2;		else			naseq[i] = res_index(amino_acid_codes,seq[i]);	naseq[i] = -3;}static void n_encode(char *seq,char *naseq,sint l){				/* code seq as ints .. use gap_pos2 for gap */	register sint i;/*	static char *nucs="ACGTU";	*/		for(i=1;i<=l;i++) {    	if(seq[i] == '-')          	   /* if a gap character -> code = gap_pos2 */			naseq[i] = gap_pos2;   /* this is the code for a gap in */		else {                     /* the input files */			naseq[i]=res_index(amino_acid_codes,seq[i]);		}	}	naseq[i] = -3;}static sint res_index(char *t,char c){	register sint i;		for(i=0;t[i] && t[i] != c;i++)		;	if(t[i]) return(i);	else return -1;}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -