📄 getfastasets.c
字号:
/* * $Id: getfastasets.c 1272 2007-05-09 16:26:20Z mhe $ */ #include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include "lstm.h"#include "getfastasets.h"// returns size (sequences) of the fileint get_data_sets(char *what, int class, char **fasta, int *tar, char **aaSym, int offset, int length[], char *filename) { FILE *fp; int res,i, j, k, l, maxseqlen; int aacounter = 0; // counter for symbols of amino acids char description[MAXCHARSFASTADESC]; // temporary buffer for fasta description char buf; // read in buffer char *classstring; // for output messages ("positive" / "negative") char *set; // output ("testdataset" / "traindataset") char *whatcap; // ("Train" / "Test") // fit the outputstrings depending on the parameters if (!strcmp(what,"training")) { set = "traindataset"; whatcap = "Training"; } else { if (!strcmp(what, "test")) { set = "testdataset"; whatcap = "Test"; } else { fprintf(stderr, "Wrong parameter for get_data_sets\n"); exit(1); } } if (class == 1) { classstring = "positive"; } else { if (class == 0) { classstring = "negative"; } else { fprintf(stderr, "Unknown class in get_data_sets\n"); exit(1); } } fprintf(stderr, "reading %s %s: %s\n", classstring, set, filename); if ((fp=fopen(filename,"r")) == NULL) { fprintf(stderr, "can't open %s\n", filename); exit(1); } maxseqlen = 0; // first call of this function with positive sequences into the datastructures // *tar and **aaSym at offset 0 // second call of this function with negative sequences into the same(!) // datastructures at offset > 0 (number of first read in positive sequences) i = offset; j = 0; do { // first line (third, fifth..) is (should be) fasta description // (beginning with '>') k = 0; while ((res = fscanf(fp, "%c", &description[k]))) { if (!res) { fprintf(stderr, "Error on reading %s on line %d char %d\n", set, 2 * (i - offset), j); } if (res == EOF) { fprintf(stderr, "Error in fastafile: EOF after description"); exit(1); } if (description[k] == '\n') { description[k] = '\0'; for (l = 0; l <= k; l++) { char dummy = description[l]; } break; } k++; } // sequence symbols while ((res = fscanf(fp, "%c", &buf))) { if (!res){ fprintf(stderr, "Error on reading %s on line %d char %d\n", set, 2 * (i - offset), j); exit(1); } // next fasta description resp. next sequence or end of file if (buf == '>' || res == ' ' || res == EOF) { break; } if (buf != '\n' && buf != '\r') { aaSym[i][j] = buf; j++; } } length[i] = j; if (length[i] > maxseqlen) { maxseqlen = length[i]; } tar[i] = class; // target aacounter += j; i++; j = 0; } while (res != EOF); fprintf(stderr, "Num %s %s proteins: %d\n", classstring, what, i - offset); fprintf(stderr, "%s AAs: %d MaxSeq %d Avg %d\n\n", whatcap, aacounter, maxseqlen, aacounter / (i - offset)); fclose(fp); return i - offset; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -