⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bz_hit19.c

📁 一个类似于blast算法的基因数据快速搜索算法
💻 C
字号:
// -- a single match of length 12 at chosen positions in a 19-mer,//  allowing one transition // -- a single match of length 11 at chosen positions in a 18-mer,//  allowing one transition static const char rcsid[] ="$Id: bz_hit19.c,v 1.11 2003/01/09 19:33:56 schwartz Exp $";#include "util.h"#include "seq.h"#include "dna.h"#include "bz_table.h"#include "bz_hit19.h"#include "astack.c"// #include "cache.c"//#define STATISTICS(code) code;#define STATISTICS(code)struct statistics {    long add_word;    long extend_hit;    long a, b, c, d, e;} STATS;void print_statistics() {    fprintf(stderr, "add_word=%ld\n", STATS.add_word);    fprintf(stderr, "extend_hit=%ld\n", STATS.extend_hit);    fprintf(stderr, "a=%ld\n", STATS.a);    fprintf(stderr, "b=%ld\n", STATS.b);    fprintf(stderr, "c=%ld\n", STATS.c);    fprintf(stderr, "d=%ld\n", STATS.d);    fprintf(stderr, "e=%ld\n", STATS.e);}static void print_ll(FILE *fp, unsigned long long x){    int i;    for (i = 36; i >= 0; i -= 2)	fprintf(fp, "%c", "ACGT"[(x>>i)&3]);}static void print_l(FILE *fp, unsigned long x){    int i;    for (i = 22; i >= 0; i -= 2)	fprintf(fp, "%c", "ACGT"[(x>>i)&3]);}//              8765432109876543210// template     1110100110010101111 (12 of 19)#define B(n) ((1ULL<<(n*2))-1)static unsigned int map_19to12(unsigned long long i){        unsigned int j =                ((i & (B(3) << (2*16))) >> (2*7)) |                ((i & (B(1) << (2*14))) >> (2*6)) |                ((i & (B(2) << (2*10))) >> (2*4)) |                ((i & (B(1) << (2* 7))) >> (2*2)) |                ((i & (B(1) << (2* 5))) >> (2*1)) |                ((i & (B(4) << (2* 0))) >> (2*0)) ;	    // DEBUG	    //fprintf(stdout, "%016llx %016x\n", i, j);	    //print_ll(stdout, i); 	    //fprintf(stdout, " ");	    //print_l(stdout, j); 	    //fprintf(stdout, "\n");        return j;}//              765432109876543210// template     111010010100110111  (11 of 18)static unsigned int map_18to11(unsigned long long i){        unsigned int j =                ((i & (B(3) << (2*15))) >> (2*7)) |                ((i & (B(1) << (2*13))) >> (2*6)) |                ((i & (B(1) << (2*10))) >> (2*4)) |                ((i & (B(1) << (2* 8))) >> (2*3)) |                ((i & (B(2) << (2* 4))) >> (2*1)) |                ((i & (B(3) << (2* 0))) >> (2*0)) ;        //fprintf(stdout, "%016lx %016llx\n", j, i);        return j;}#define N4E12 16777216	/* 4^12 *//* -------------   build table of W-tuples in the first sequence  ----------- */typedef struct hashpos {	int pos;		/* position where word starts in sequence 1 */	struct hashpos *next;} Hash;typedef struct blast_table_n bt_t;/* add_word - add to the table a position in sequence 1 of a given word */static void add_word(bt_t *bt, int ecode, int pos){	Hash *h;	if (bt->n_node >= bt->npool)		fatal("blast hashtab overflowed (can't happen)");	h = &(bt->pool[bt->n_node++]);	h->pos = pos;	h->next = bt->hashtab[ecode];	bt->hashtab[ecode] = h;	//STATISTICS(STATS.add_word++);}blast_table_t *blast_1219_new(SEQ *seq, int W){	unsigned long long ei;	unsigned int ecode;	int i;	const unsigned char *s;	const int len1 = SEQ_LEN(seq);	const unsigned char *const seq1 = SEQ_CHARS(seq);	const unsigned char *const seq_end = seq1 + len1;	blast_table_t *bt;	if (W != 12)		fatal("Cannot handle W != 12.");	if (len1 < 19)		fatal("Cannot handle seq1 len < 19.");	bt = ckalloc(sizeof(*bt));	bt->n.hashtab = (Hash **) ckallocz(N4E12*sizeof(Hash *));	W = 0; // because we don't add transitions to this table anymore.	if (len1 >= LONG_MAX/(W+1))		fatalf("sequence too long: %ld\n", len1);	bt->n.npool = (W+1)*(len1 - W + 1);	if ((unsigned int)bt->n.npool >= LONG_MAX/sizeof(Hash))		fatalf("too many nodes: %d\n", bt->n.npool);	bt->n.pool = (Hash *) ckalloc(bt->n.npool*sizeof(Hash));	bt->n.n_node = 0;		for (s = seq1; s < seq_end;  ) {	restart://STATISTICS(STATS.a++);		/* init -- load first 18bp, compute starts with 19th */		ei = 0;		for (i = 0; i < 18 && s < seq_end; ++i) {			const int e = fasta_encoding[*s++];			if (e < 0) goto restart; /* hit a run of 'X's */			ei = (ei << 2) | e;//printf("%c -> ", s[-1]); print_ll(stdout, ei); printf("\n");		}//STATISTICS(STATS.b++);		/* compute */		for ( ; s < seq_end; ++s) {			const int e = fasta_encoding[*s];			if (e < 0) goto restart; /* hit a run of 'X's */			ei = (ei << 2) | e;			ecode = map_19to12(ei);//STATISTICS(STATS.c++);			add_word(&bt->n, ecode, s-seq1+1);		} 	}	return bt;}void blast_1219_free(blast_table_t *bt){	free(bt->n.hashtab);	free(bt->n.pool);}/* -----------------------   search the second sequence   --------------------*/msp_table_t *blast_1219_search(SEQ *seq1, SEQ *seq2, blast_table_t *bt, msp_table_t *mt, ss_t ss, int X, int K, int P, int T){	unsigned long long ei;	unsigned int ecode;	int i;	Hash *h;	const unsigned char *s;	const unsigned char *const seq = SEQ_CHARS(seq2);	const unsigned char *const seq_end = seq + SEQ_LEN(seq2);	const int len1 = SEQ_LEN(seq1);	const int len2 = SEQ_LEN(seq2);	static comb_t *diag_lev = 0;        { unsigned int n = (len1+len2+1);          n = roundup(n, 64*1024);          diag_lev = comb_resize(diag_lev,n,len1);        }	for (s = seq; s < seq_end;  ) {	restart:		/* init -- load first 18bp, compute starts with 19th */		ei = 0;		for (i = 0; i < 18 && s < seq_end; ++i) {			const int e = fasta_encoding[*s++];			if (e < 0) goto restart; /* hit a run of 'X's */			ei = (ei << 2) | e;		}		/* compute */		for ( ; s < seq_end; ++s) {			const int e = fasta_encoding[*s];			if (e < 0) goto restart; /* hit a run of 'X's */			ei = (ei << 2) | e;			ecode = map_19to12(ei);			for (h = bt->n.hashtab[ecode]; h; h = h->next) {//STATISTICS(STATS.extend_hit++);				msp_extend_hit(mt, seq1, seq2,                                                ss, X, K, 19, P,                                                h->pos, s-seq+1, diag_lev);			}			// search transitions too			if (T) for (i = 0; i < 12; ++i) {			    unsigned int t;			    t = ecode ^ (2 << (2 * i)); /* A~G; C~T */			    for (h = bt->n.hashtab[t]; h; h = h->next) {//STATISTICS(STATS.extend_hit++);				msp_extend_hit(mt, seq1, seq2,                                                ss, X, K, 19, P,                                                h->pos, s-seq+1, diag_lev);			    }			}		} 	}	comb_clear(diag_lev);	return mt;}// ----------// Note: a more general version of the transformation function://  j = (ecode & (3<<(2*i))) >> (2*i);//  j = transition[j];//  t = (ecode &~ (3<<(2*i))) | (j << (2*i));// A simple version that just does transitions by flipping the right bit://  t = ecode ^ (2 << (2 * i)); /* A~G; C~T */// ----------

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -