📄 sequence.c
字号:
exit(1) ; /* DES -1 => file not found */
}
if((fout=fopen(to,"a+"))==NULL) {
printf("Could not open sequence file (%s) ",to);
return ; /* DES -1 => file not found */
}
while(fgets(buff,4096,from)!=NULL)
{
fputs(buff,fout);
}
fclose(from);
fclose(fout);
return;
}
/**************************
Modified by jf.
2005.1.18
**************************/
sint readseqs(sint first_seq,char *datapath,char *datafilename) /*first_seq is the #no. of the first seq. to read */
{
char line[FILENAMELEN+1];
//char inputsequencename[FILENAMELEN+1];
//const char inputsequence[]="sequence.txt"
//char resultname[FILENAMELEN+1];
char fileName[FILENAMELEN+1];
static char *seq1,sname1[MAXNAMES+1],title[MAXTITLES+1];
sint i,j;
sint no_seqs;
static sint l1;
static Boolean dnaflag1;
char dataPath[FILENAMELEN+1];
/***if(usemenu)
getstr("Enter the name of the sequence file",line);
else
strcpy(line,seqname);***/
//strcpy(line,rootname);
/*strcpy(inputsequencename,resultroot);
if( inputsequencename[strlen(inputsequencename) -1] != '\\' )
{
strcat(inputsequencename, "\\");
strcat(inputsequencename,inputsequence);
}
FindFileInDir(inputsequencename,rootname)*/
strcpy(dataPath,datapath);
strcpy(line,dataPath);
strcat(line,datafilename);
if(*line== EOS) return -1;
if ((sscanf(line,"file://%s",fileName) == 1 )) {
strcpy(line,fileName);
}
if((fin=fopen(line,"r"))==NULL) {
error("Could not open sequence file (%s) ",line);
return -1; /* DES -1 => file not found */
}
strcpy(seqname,line);
no_seqs=0;
check_infile(&no_seqs);
info("Sequence format is %s",formatNames[seqFormat]);
if(seqFormat==NEXUS)
error("Cannot read nexus format");
/* DES DEBUG
fprintf(stdout,"\n\n File name = %s\n\n",seqname);
*/
if(no_seqs == 0)
return 0; /* return the number of seqs. (zero here)*/
/*
if((no_seqs + first_seq -1) > MAXN) {
error("Too many sequences. Maximum is %d",(pint)MAXN);
return 0;
}
*/
/* DES */
/* if(seqFormat == CLUSTAL) {
info("no of sequences = %d",(pint)no_seqs);
return no_seqs;
}
*/
max_aln_length = 0;
/* if this is a multiple alignment, or profile 1 - free any memory used
by previous alignments, then allocate memory for the new alignment */
if(first_seq == 1) {
max_names = 0;
free_aln(nseqs);
alloc_aln(no_seqs);
}
/* otherwise, this is a profile 2, and we need to reallocate the arrays,
leaving the data for profile 1 intact */
else realloc_aln(first_seq,no_seqs);
for(i=1;i<first_seq;i++)
{
if(seqlen_array[i]>max_aln_length)
max_aln_length=seqlen_array[i];
if(strlen(names[i])>max_names)
max_names=strlen(names[i]);
}
for(i=first_seq;i<=first_seq+no_seqs-1;i++) { /* get the seqs now*/
output_index[i] = i; /* default output order */
if(seqFormat == CLUSTAL)
seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1);
else if(seqFormat == MSF)
seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1);
else
seq1=get_seq(sname1,&l1,title);
if(seq1==NULL) break;
/* JULIE */
/* Set max length of dynamically allocated arrays in prfalign.c */
if (l1 > max_aln_length) max_aln_length = l1;
seqlen_array[i]=l1; /* store the length */
strcpy(names[i],sname1); /* " " name */
strcpy(titles[i],title); /* " " title */
if(!explicit_dnaflag) {
dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
if(i == 1) dnaflag = dnaflag1;
} /* type decided by first seq*/
else
dnaflag1 = dnaflag;
alloc_seq(i,l1);
if(dnaflag)
n_encode(seq1,seq_array[i],l1); /* encode the sequence*/
else /* as ints */
p_encode(seq1,seq_array[i],l1);
if(seq1!=NULL) seq1=ckfree(seq1);
}
max_aln_length *= 2;
/*
JULIE
check sequence names are all different - otherwise phylip tree is
confused.
*/
for(i=1;i<=first_seq+no_seqs-1;i++) {
for(j=i+1;j<=first_seq+no_seqs-1;j++) {
if (strncmp(names[i],names[j],MAXNAMES) == 0) {
error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES);
return 0;
}
}
}
for(i=first_seq;i<=first_seq+no_seqs-1;i++)
{
if(seqlen_array[i]>max_aln_length)
max_aln_length=seqlen_array[i];
}
/* look for a feature table / gap penalty mask (only if this is a profile) */
if (profile_no > 0) {
rewind(fin);
struct_penalties = NONE;
gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char));
ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char));
if (seqFormat == CLUSTAL) {
get_clustal_ss(max_aln_length);
}
else if (seqFormat == GDE) {
get_gde_ss(max_aln_length);
}
else if (seqFormat == EMBLSWISS) {
get_embl_ss(max_aln_length);
}
else if (seqFormat == RSF) {
get_rsf_ss(max_aln_length);
}
}
for(i=first_seq;i<=first_seq+no_seqs-1;i++)
{
if(strlen(names[i])>max_names)
max_names=strlen(names[i]);
}
if(max_names<10) max_names=10;
fclose(fin);
return no_seqs; /* return the number of seqs. read in this call */
}
static Boolean check_dnaflag(char *seq, sint slen)
/* check if DNA or Protein
The decision is based on counting all A,C,G,T,U or N.
If >= 85% of all characters (except -) are as above => DNA */
{
sint i, c, nresidues, nbases;
float ratio;
char *dna_codes="ACGTUN";
nresidues = nbases = 0;
for(i=1; i <= slen; i++) {
if(seq[i] != '-') {
nresidues++;
if(seq[i] == 'N')
nbases++;
else {
c = res_index(dna_codes, seq[i]);
if(c >= 0)
nbases++;
}
}
}
if( (nbases == 0) || (nresidues == 0) ) return FALSE;
ratio = (float)nbases/(float)nresidues;
/* DES fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n",
(pint)nbases,(pint)nresidues,(pint)ratio); */
if(ratio >= 0.85)
return TRUE;
else
return FALSE;
}
static void check_infile(sint *nseqs)
{
char line[MAXLINE+1];
sint i;
*nseqs=0;
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(!blankline(line))
break;
}
for(i=strlen(line)-1;i>=0;i--)
if(isgraph(line[i])) break;
line[i+1]=EOS;
for(i=0;i<=6;i++) line[i] = toupper(line[i]);
if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */
seqFormat=EMBLSWISS;
(*nseqs)++;
}
else if( linetype(line,"CLUSTAL") ) {
seqFormat=CLUSTAL;
}
else if( linetype(line,"PILEUP") ) {
seqFormat = MSF;
}
else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) {
seqFormat = MSF;
dnaflag = FALSE;
}
else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) {
seqFormat = MSF;
dnaflag = TRUE;
}
else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' &&
line[strlen(line)-2]=='.' ) {
seqFormat = MSF;
}
else if( linetype(line,"!!RICH_SEQUENCE") ) {
seqFormat = RSF;
}
else if( linetype(line,"#NEXUS") ) {
seqFormat=NEXUS;
return;
}
else if(*line == '>') { /* no */
seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
(*nseqs)++;
}
else if((*line == '"') || (*line == '%') || (*line == '#')) {
seqFormat=GDE; /* GDE format */
if (*line == '%') {
(*nseqs)++;
dnaflag = FALSE;
}
else if (*line == '#') {
(*nseqs)++;
dnaflag = TRUE;
}
}
else {
seqFormat=UNKNOWN;
return;
}
while(fgets(line,MAXLINE+1,fin) != NULL) {
switch(seqFormat) {
case EMBLSWISS:
if( linetype(line,"ID") )
(*nseqs)++;
break;
case PIR:
*nseqs = count_pir_seqs();
fseek(fin,0,0);
return;
case PEARSON:
if( *line == '>' )
(*nseqs)++;
break;
case GDE:
if(( *line == '%' ) && ( dnaflag == FALSE))
(*nseqs)++;
else if (( *line == '#') && ( dnaflag == TRUE))
(*nseqs)++;
break;
case CLUSTAL:
*nseqs = count_clustal_seqs();
/* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */
fseek(fin,0,0);
return;
case MSF:
*nseqs = count_msf_seqs();
fseek(fin,0,0);
return;
case RSF:
fseek(fin,0,0);
*nseqs = count_rsf_seqs();
fseek(fin,0,0);
return;
case USER:
default:
break;
}
}
fseek(fin,0,0);
}
static sint count_pir_seqs(void)
/* count the number of sequences in a pir alignment file */
{
char line[MAXLINE+1],c;
sint nseqs, i;
Boolean seq_ok;
seq_ok = FALSE;
while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */
if(*line == '>') break;
for(i=0;seq_ok == FALSE;i++) {
c=line[i];
if(c == '*') {
seq_ok = TRUE; /* ok - end of sequence found */
break;
} /* EOL */
if(c == '\n' || c == EOS)
break; /* EOL */
}
if (seq_ok == TRUE)
break;
}
if (seq_ok == FALSE) {
error("PIR format sequence end marker '*'\nmissing for one or more sequences.");
return (sint)0; /* funny format*/
}
nseqs = 1;
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(*line == '>') { /* Look for start of next seq */
seq_ok = FALSE;
while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */
if(*line == '>') {
error("PIR format sequence end marker '*' missing for one or more sequences.");
return (sint)0; /* funny format*/
}
for(i=0;seq_ok == FALSE;i++) {
c=line[i];
if(c == '*') {
seq_ok = TRUE; /* ok - sequence found */
break;
} /* EOL */
if(c == '\n' || c == EOS)
break; /* EOL */
}
if (seq_ok == TRUE) {
nseqs++;
break;
}
}
}
}
return (sint)nseqs;
}
static sint count_clustal_seqs(void)
/* count the number of sequences in a clustal alignment file */
{
char line[MAXLINE+1];
sint nseqs;
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(!cl_blankline(line)) break; /* Look for next non- */
} /* blank line */
nseqs = 1;
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(cl_blankline(line)) return nseqs;
nseqs++;
}
return (sint)0; /* if you got to here-funny format/no seqs.*/
}
static sint count_msf_seqs(void)
{
/* count the number of sequences in a PILEUP alignment file */
char line[MAXLINE+1];
sint nseqs;
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(linetype(line,"//")) break;
}
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(!blankline(line)) break; /* Look for next non- */
} /* blank line */
nseqs = 1;
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(blankline(line)) return nseqs;
nseqs++;
}
return (sint)0; /* if you got to here-funny format/no seqs.*/
}
static sint count_rsf_seqs(void)
{
/* count the number of sequences in a GCG RSF alignment file */
char line[MAXLINE+1];
sint nseqs;
nseqs = 0;
/* skip the comments */
while (fgets(line,MAXLINE+1,fin) != NULL) {
if(line[strlen(line)-2]=='.' &&
line[strlen(line)-3]=='.')
break;
}
while (fgets(line,MAXLINE+1,fin) != NULL) {
if( *line == '{' )
nseqs++;
}
return (sint)nseqs;
}
static void p_encode(char *seq, char *naseq, sint l)
{ /* code seq as ints .. use gap_pos2 for gap */
register sint i;
/* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/
for(i=1;i<=l;i++)
if(seq[i] == '-')
naseq[i] = gap_pos2;
else
naseq[i] = res_index(amino_acid_codes,seq[i]);
naseq[i] = -3;
}
static void n_encode(char *seq,char *naseq,sint l)
{ /* code seq as ints .. use gap_pos2 for gap */
register sint i;
/* static char *nucs="ACGTU"; */
for(i=1;i<=l;i++) {
if(seq[i] == '-') /* if a gap character -> code = gap_pos2 */
naseq[i] = gap_pos2; /* this is the code for a gap in */
else { /* the input files */
naseq[i]=res_index(amino_acid_codes,seq[i]);
}
}
naseq[i] = -3;
}
static sint res_index(char *t,char c)
{
register sint i;
for(i=0;t[i] && t[i] != c;i++)
;
if(t[i]) return(i);
else return -1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -