📄 sequence.c

📁 在任务级并行平台P2HP上
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
		exit(1) ;      /* DES -1 => file not found */

	}

	if((fout=fopen(to,"a+"))==NULL) {
		printf("Could not open sequence file (%s) ",to);
		return ;      /* DES -1 => file not found */
	}

	while(fgets(buff,4096,from)!=NULL)
	{
		fputs(buff,fout);
	}

	fclose(from);
	fclose(fout);
	return;
}


/**************************
Modified by jf.
2005.1.18
**************************/

sint readseqs(sint first_seq,char *datapath,char *datafilename) /*first_seq is the #no. of the first seq. to read */
{
	char line[FILENAMELEN+1];
	//char inputsequencename[FILENAMELEN+1];
	//const char inputsequence[]="sequence.txt"
	//char resultname[FILENAMELEN+1];
	char fileName[FILENAMELEN+1];

	static char *seq1,sname1[MAXNAMES+1],title[MAXTITLES+1];
	sint i,j;
	sint no_seqs;
	static sint l1;
	static Boolean dnaflag1;


	char dataPath[FILENAMELEN+1];
	
	/***if(usemenu)
		getstr("Enter the name of the sequence file",line);
	else
		strcpy(line,seqname);***/
	//strcpy(line,rootname);

    /*strcpy(inputsequencename,resultroot);
	if( inputsequencename[strlen(inputsequencename) -1] != '\\' )
    {
          strcat(inputsequencename, "\\");
	   strcat(inputsequencename,inputsequence);
		}    
	  
       FindFileInDir(inputsequencename,rootname)*/

       strcpy(dataPath,datapath);
       
       strcpy(line,dataPath);
       strcat(line,datafilename);
	
	if(*line== EOS) return -1;
        
	if ((sscanf(line,"file://%s",fileName) == 1 )) {
	  strcpy(line,fileName);
	}

	if((fin=fopen(line,"r"))==NULL) {
		error("Could not open sequence file (%s) ",line);
		return -1;      /* DES -1 => file not found */
	}
	strcpy(seqname,line);
	no_seqs=0;
	check_infile(&no_seqs);
	info("Sequence format is %s",formatNames[seqFormat]);
	if(seqFormat==NEXUS)
		error("Cannot read nexus format");

/* DES DEBUG 
	fprintf(stdout,"\n\n File name = %s\n\n",seqname);
*/
	if(no_seqs == 0)
		return 0;       /* return the number of seqs. (zero here)*/

/*
	if((no_seqs + first_seq -1) > MAXN) {
		error("Too many sequences. Maximum is %d",(pint)MAXN);
		return 0;
	}
*/

/* DES */
/*	if(seqFormat == CLUSTAL) {
		info("no of sequences = %d",(pint)no_seqs);
		return no_seqs;
	}
*/
	max_aln_length = 0;

/* if this is a multiple alignment, or profile 1 - free any memory used
by previous alignments, then allocate memory for the new alignment */
	if(first_seq == 1) {
		max_names = 0;
		free_aln(nseqs);
		alloc_aln(no_seqs);
	}
/* otherwise, this is a profile 2, and we need to reallocate the arrays,
leaving the data for profile 1 intact */
	else realloc_aln(first_seq,no_seqs);

        for(i=1;i<first_seq;i++)
	{
                if(seqlen_array[i]>max_aln_length)
                        max_aln_length=seqlen_array[i];
		if(strlen(names[i])>max_names)
			max_names=strlen(names[i]);
	}

	for(i=first_seq;i<=first_seq+no_seqs-1;i++) {    /* get the seqs now*/
		output_index[i] = i;	/* default output order */
		if(seqFormat == CLUSTAL) 
			seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1);
		else if(seqFormat == MSF)
			    seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1);
		else
			seq1=get_seq(sname1,&l1,title);

		if(seq1==NULL) break;
/* JULIE */
/*  Set max length of dynamically allocated arrays in prfalign.c */
		if (l1 > max_aln_length) max_aln_length = l1;
		seqlen_array[i]=l1;                   /* store the length */
		strcpy(names[i],sname1);              /*    "   "  name   */
		strcpy(titles[i],title);              /*    "   "  title  */

		if(!explicit_dnaflag) {
			dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
		        if(i == 1) dnaflag = dnaflag1;
		}			/* type decided by first seq*/
		else
			dnaflag1 = dnaflag;

		alloc_seq(i,l1);

		if(dnaflag)
			n_encode(seq1,seq_array[i],l1); /* encode the sequence*/
		else					/* as ints  */
			p_encode(seq1,seq_array[i],l1);
		if(seq1!=NULL) seq1=ckfree(seq1);
	}


	max_aln_length *= 2;
/*
   JULIE
   check sequence names are all different - otherwise phylip tree is 
   confused.
*/
	for(i=1;i<=first_seq+no_seqs-1;i++) {
		for(j=i+1;j<=first_seq+no_seqs-1;j++) {
			if (strncmp(names[i],names[j],MAXNAMES) == 0) {
				error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES);
				return 0;
			}
		}
	}
	for(i=first_seq;i<=first_seq+no_seqs-1;i++)
	{
		if(seqlen_array[i]>max_aln_length)
			max_aln_length=seqlen_array[i];
	}
	
/* look for a feature table / gap penalty mask (only if this is a profile) */
	if (profile_no > 0) {
		rewind(fin);
		struct_penalties = NONE;
    		gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
    		sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
    		ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char));

		if (seqFormat == CLUSTAL) {
			get_clustal_ss(max_aln_length);
		}
		else if (seqFormat == GDE) {
			get_gde_ss(max_aln_length);
		}
		else if (seqFormat == EMBLSWISS) {
			get_embl_ss(max_aln_length);
		}
		else if (seqFormat == RSF) {
			get_rsf_ss(max_aln_length);
		}
	}

	for(i=first_seq;i<=first_seq+no_seqs-1;i++)
	{
		if(strlen(names[i])>max_names)
			max_names=strlen(names[i]);
	}

	if(max_names<10) max_names=10;

	fclose(fin);
			
	return no_seqs;    /* return the number of seqs. read in this call */
}


static Boolean check_dnaflag(char *seq, sint slen)
/* check if DNA or Protein
   The decision is based on counting all A,C,G,T,U or N. 
   If >= 85% of all characters (except -) are as above => DNA  */
{
	sint i, c, nresidues, nbases;
	float ratio;
	char *dna_codes="ACGTUN";
	
	nresidues = nbases = 0;	
	for(i=1; i <= slen; i++) {
		if(seq[i] != '-') {
			nresidues++;
			if(seq[i] == 'N')
				nbases++;
			else {
				c = res_index(dna_codes, seq[i]);
				if(c >= 0)
					nbases++;
			}
		}
	}
	if( (nbases == 0) || (nresidues == 0) ) return FALSE;
	ratio = (float)nbases/(float)nresidues;
/* DES 	fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n",
		(pint)nbases,(pint)nresidues,(pint)ratio); */
	if(ratio >= 0.85) 
		return TRUE;
	else
		return FALSE;
}



static void check_infile(sint *nseqs)
{
	char line[MAXLINE+1];
	sint i;	

	*nseqs=0;
	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(!blankline(line)) 
			break;
	}

	for(i=strlen(line)-1;i>=0;i--)
		if(isgraph(line[i])) break;
	line[i+1]=EOS;
        
	for(i=0;i<=6;i++) line[i] = toupper(line[i]);

	if( linetype(line,"ID") ) {					/* EMBL/Swiss-Prot format ? */
		seqFormat=EMBLSWISS;
		(*nseqs)++;
	}
        else if( linetype(line,"CLUSTAL") ) {
		seqFormat=CLUSTAL;
	}
 	else if( linetype(line,"PILEUP") ) {
		seqFormat = MSF;
	}
 	else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) {
		seqFormat = MSF;
		dnaflag = FALSE;
	}
 	else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) {
		seqFormat = MSF;
		dnaflag = TRUE;
	}
 	else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' &&
                                 line[strlen(line)-2]=='.' ) {
		seqFormat = MSF;
	}
 	else if( linetype(line,"!!RICH_SEQUENCE") ) {
		seqFormat = RSF;
	}
 	else if( linetype(line,"#NEXUS") ) {
		seqFormat=NEXUS;
		return;
	}
	else if(*line == '>') {						/* no */
		seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
		(*nseqs)++;
	}
	else if((*line == '"') || (*line == '%') || (*line == '#')) {
		seqFormat=GDE; /* GDE format */
		if (*line == '%') {
                        (*nseqs)++;
			dnaflag = FALSE;
		}
		else if (*line == '#') {
			(*nseqs)++;
			dnaflag = TRUE;
		}
	}
	else {
		seqFormat=UNKNOWN;
		return;
	}

	while(fgets(line,MAXLINE+1,fin) != NULL) {
		switch(seqFormat) {
			case EMBLSWISS:
				if( linetype(line,"ID") )
					(*nseqs)++;
				break;
			case PIR:
				*nseqs = count_pir_seqs();
				fseek(fin,0,0);
				return;
			case PEARSON:
                                if( *line == '>' )
                                        (*nseqs)++;
                                break;
			case GDE:
				if(( *line == '%' ) && ( dnaflag == FALSE))
					(*nseqs)++;
				else if (( *line == '#') && ( dnaflag == TRUE))
					(*nseqs)++;
				break;
			case CLUSTAL:
				*nseqs = count_clustal_seqs();
/* DES */ 			/* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */
				fseek(fin,0,0);
				return;
			case MSF:
				*nseqs = count_msf_seqs();
				fseek(fin,0,0);
				return;
			case RSF:
				fseek(fin,0,0);
				*nseqs = count_rsf_seqs();
				fseek(fin,0,0);
				return;
			case USER:
			default:
				break;
		}
	}
	fseek(fin,0,0);
}


static sint count_pir_seqs(void)
/* count the number of sequences in a pir alignment file */
{
	char line[MAXLINE+1],c;
	sint  nseqs, i;
	Boolean seq_ok;

	seq_ok = FALSE;
	while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */
		if(*line == '>') break;
		for(i=0;seq_ok == FALSE;i++) {
			c=line[i];
			if(c == '*') {
				seq_ok = TRUE;	/* ok - end of sequence found */
				break;
			}			/* EOL */
			if(c == '\n' || c == EOS)
				break;			/* EOL */
		}
		if (seq_ok == TRUE)
			break;
	}
	if (seq_ok == FALSE) {
		error("PIR format sequence end marker '*'\nmissing for one or more sequences.");
		return (sint)0;	/* funny format*/
	}
	
	
	nseqs = 1;
	
	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(*line == '>') {		/* Look for start of next seq */
			seq_ok = FALSE;
			while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */
				if(*line == '>') {
					error("PIR format sequence end marker '*' missing for one or more sequences.");
					return (sint)0;	/* funny format*/
				}
				for(i=0;seq_ok == FALSE;i++) {
					c=line[i];
					if(c == '*') {
						seq_ok = TRUE;	/* ok - sequence found */
						break;
					}			/* EOL */
					if(c == '\n' || c == EOS)
						break;			/* EOL */
				}
				if (seq_ok == TRUE) {
					nseqs++;
					break;
				}
			}
		}
	}
	return (sint)nseqs;
}


static sint count_clustal_seqs(void)
/* count the number of sequences in a clustal alignment file */
{
	char line[MAXLINE+1];
	sint  nseqs;

	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(!cl_blankline(line)) break;		/* Look for next non- */
	}						/* blank line */
	nseqs = 1;

	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(cl_blankline(line)) return nseqs;
		nseqs++;
	}

	return (sint)0;	/* if you got to here-funny format/no seqs.*/
}

static sint count_msf_seqs(void)
{
/* count the number of sequences in a PILEUP alignment file */

	char line[MAXLINE+1];
	sint  nseqs;

	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(linetype(line,"//")) break;
	}

	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(!blankline(line)) break;		/* Look for next non- */
	}						/* blank line */
	nseqs = 1;

	while (fgets(line,MAXLINE+1,fin) != NULL) {
		if(blankline(line)) return nseqs;
		nseqs++;
	}

	return (sint)0;	/* if you got to here-funny format/no seqs.*/
}

static sint count_rsf_seqs(void)
{
/* count the number of sequences in a GCG RSF alignment file */

	char line[MAXLINE+1];
	sint  nseqs;

	nseqs = 0;
/* skip the comments */
	while (fgets(line,MAXLINE+1,fin) != NULL) {
 		if(line[strlen(line)-2]=='.' &&
                                 line[strlen(line)-3]=='.')
			break;
	}

	while (fgets(line,MAXLINE+1,fin) != NULL) {
                if( *line == '{' )
                      nseqs++;
	}
	return (sint)nseqs;
}

static void p_encode(char *seq, char *naseq, sint l)
{				/* code seq as ints .. use gap_pos2 for gap */
	register sint i;
/*	static char *aacids="CSTPAGNDEQHRKMILVFYW";*/
	
	for(i=1;i<=l;i++)
		if(seq[i] == '-')
			naseq[i] = gap_pos2;
		else
			naseq[i] = res_index(amino_acid_codes,seq[i]);
	naseq[i] = -3;
}

static void n_encode(char *seq,char *naseq,sint l)
{				/* code seq as ints .. use gap_pos2 for gap */
	register sint i;
/*	static char *nucs="ACGTU";	*/
	
	for(i=1;i<=l;i++) {
    	if(seq[i] == '-')          	   /* if a gap character -> code = gap_pos2 */
			naseq[i] = gap_pos2;   /* this is the code for a gap in */
		else {                     /* the input files */
			naseq[i]=res_index(amino_acid_codes,seq[i]);
		}
	}
	naseq[i] = -3;
}

static sint res_index(char *t,char c)
{
	register sint i;
	
	for(i=0;t[i] && t[i] != c;i++)
		;
	if(t[i]) return(i);
	else return -1;
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -