⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bz_table.c

📁 一个类似于blast算法的基因数据快速搜索算法
💻 C
字号:
#ifndef __lintstatic const char rcsid[] ="$Id: bz_table.c,v 1.15 2003/01/09 19:33:56 schwartz Exp $";#endif#ifdef DEBUG#include "valgrind.h"#endif#include "util.h"#include "seq.h"#include "dna.h"#include "bz_table.h"#include "astack.c"typedef struct blast_table_w bt_t;typedef struct blast_epos_node {	struct blast_epos_node *link;	/* next occurrence of the word */	blast_ecode_t ecode;	int pos;		/* position where word occurs in seq #1 */} blast_epos_node_t;static void blast_free_chain(bt_t *bt, blast_ecode_t ecode);/* -------------   build table of W-tuples in the first sequence ------------*/#ifdef USE_OBSTACK  /* ifdef considered harmful !!! */#define obstack_chunk_alloc ckalloc#define obstack_chunk_free ckfree#include <obstack.h> /* GNU-ism */static struct obstack bt_obstk;#endifmsp_table_t *msp_new_table(void){	msp_table_t *mt = ckallocz(sizeof(*mt));	return mt;}static blast_epos_node_t *cons_epos_node(int pos, blast_ecode_t ecode, blast_epos_node_t *link){#ifdef USE_OBSTACK	blast_epos_node_t *n = obstack_alloc(&bt_obstk, sizeof(blast_epos_node_t));#else	blast_epos_node_t *n = ckalloc(sizeof(struct blast_epos_node));#endif	n->link = link;	n->pos = pos;	n->ecode = ecode;	return n;}static unsigned int pos_add(bt_t *bt, blast_ecode_t ecode, int pos){	unsigned int e = ecode % BLAST_POS_TAB_SIZE;	bt->epos_tab[e] = cons_epos_node(pos, ecode, bt->epos_tab[e]);	bt->num++;	return e;}static blast_epos_node_t **pos_getp(const bt_t *bt, blast_ecode_t ecode){	unsigned int e = ecode % BLAST_POS_TAB_SIZE;	return &bt->epos_tab[e];}static blast_epos_node_t *pos_get(const bt_t *bt, blast_ecode_t ecode){	return *pos_getp(bt, ecode);}static blast_table_t *cons_blast_table(int W, const signed char *encoding){	blast_table_t *bt = ckallocz(sizeof(*bt));	bt->w.encoding = encoding;	if ((unsigned int)W > sizeof(blast_ecode_t)*8/2)		W = sizeof(blast_ecode_t)*8/2;	bt->w.W = W;	bt->w.epos_tab=ckallocz(BLAST_POS_TAB_SIZE*sizeof(blast_epos_node_t));#ifdef USE_OBSTACK	obstack_begin(&bt_obstk, 10*1024*1024);#endif	return bt;}static blast_table_t *blast_table_enc_new(SEQ *seq, int W,  const signed char *enc){	blast_ecode_t ecode;	int i;	uchar *s;	uchar *const seq_beg = SEQ_CHARS(seq);	uchar *const seq_end = seq_beg+SEQ_LEN(seq);	blast_table_t *const bt = cons_blast_table(W, enc);	const signed char *const encoding = bt->w.encoding;	const blast_ecode_t mask = (1UL << ((W-1)*2)) - 1;        if (W<=1) return 0;	for (s = seq_beg; s < seq_end; ) {	restart:		/* init */		ecode = 0L;		for (i = 1; i < W && s < seq_end; ++i) {			const int e = encoding[*s++];			if (e < 0) goto restart; /* hit a run of 'X's */			ecode = (ecode << 2) + e;		}		/* compute */		for (; s < seq_end; ++s) {			const int e = encoding[*s];			if (e < 0) goto restart; /* hit a run of 'X's */			ecode = ((ecode & mask) << 2) + e;			pos_add(&bt->w, ecode, s-seq_beg+1);		} 	}	return bt;}static blast_table_t *blast_table_masked_new(SEQ *seq, int W){	return blast_table_enc_new(seq, W, fasta_encoding);}blast_table_t *blast_table_new(SEQ *seq, int W){	return blast_table_masked_new(seq, W);}#if 0     static int chain_len(bt_t *bt, blast_ecode_t ecode){        blast_epos_node_t *p;	int count = 0;	for (p = pos_get(bt, ecode); p; p = p->link)		++count;	return count;}static int pos_density(bt_t *bt, blast_ecode_t ecode){        blast_epos_node_t *p;	int count = 0;	for (p = pos_get(bt, ecode); p; p = p->link)		++count;	return count;}bt_t *blast_table_pare(bt_t *bt, int surfeit){        blast_ecode_t ecode;        if (bt == 0) return 0;         for (ecode = 0; ecode < BLAST_POS_TAB_SIZE; ++ecode) {		/* Delete ecodes whose chains are too long. */		if (chain_len(bt, ecode) > surfeit)			blast_free_chain(bt, ecode);		/* Delete ecodes whose positions are too dense. */		if (pos_density(bt, ecode) > density)			blast_free_chain(bt, ecode);	}	return bt;}#endif/* -----------------------   search the second sequence   --------------------*/static double entropy(const uchar *s, int n){	double pA, pC, pG, pT, qA, qC, qG, qT, e;	int count[128];	int i, cA, cC, cG, cT;	for (i = 0; i < 128; ++i)		count[i] = 0;	for (i = 0; i < n; ++i)		++count[s[i]];	cA = count['A'];	cC = count['C'];	cG = count['G'];	cT = count['T'];	pA = ((double)count['A']) / ((double)n);	pC = ((double)count['C']) / ((double)n);	pG = ((double)count['G']) / ((double)n);	pT = ((double)count['T']) / ((double)n);	qA = (cA ? log(pA) : 0.0);	qC = (cC ? log(pC) : 0.0);	qG = (cG ? log(pG) : 0.0);	qT = (cT ? log(pT) : 0.0);	e = -(pA*qA + pC*qC + pG*qG + pT*qT)/log(4.0);	return e;}int msp_add(msp_table_t *mt, 		int len, int pos1, int pos2, score_t score, int filter){	/* XXX - use vec */	/*printf("add %d %d %d %d %d\n", len, pos1, pos2, score, filter);*/	if (mt->num >= mt->size) {		/*fprintf(stderr, "grow(%d/%d) ", mt->num, mt->size);*/		mt->size += 15;		mt->size += mt->size/2;		mt->msp = ckrealloc(mt->msp, mt->size*sizeof(*mt->msp));	}	{ msp_t *const msp = mt->msp + mt->num;	  msp->len = len;	  msp->pos1 = pos1;	  msp->pos2 = pos2;	  msp->score = score;	  msp->filter = filter;	}	mt->num++;	/*fprintf(stderr, "(%d/%d)\n", mt->num, mt->size);*/	return mt->num;}/* extend_hit - extend a word-sized hit to a longer match */int msp_extend_hit(msp_table_t *mt,			SEQ *s1, SEQ *s2, ss_t ss, int X, int K, int W, int P,			int pos1, int pos2,			comb_t *diag_lev			){	const uchar *beg1, *end1;	int left_sum, right_sum;	const uchar *const seq1 = SEQ_CHARS(s1);	const uchar *const seq2 = SEQ_CHARS(s2);	const int diag = pos2 - pos1;	if (seq1 == seq2 && pos1 >= pos2)		return 0;		if (comb_get(diag_lev,diag) > pos1)		return 0;        	{ /* extend to the right */	    const uchar *q = end1 = seq1 + pos1;	    const uchar *s = seq2 + pos2;            const int end = SEQ_LEN(s2)-diag;	    const int end_ = (end < SEQ_LEN(s1)) ? end : SEQ_LEN(s1);	    const uchar *const seq1_end = seq1 + end_;	    int sum;	    right_sum = sum = 0;	    while (q < seq1_end && sum >= right_sum - X) {		if ((sum += ss[*s++][*q++]) > right_sum) {			right_sum = sum;			end1 = q;		}	    }	}	{ /* extend to the left */	    const uchar *q = beg1 = (seq1 + pos1) - W;	    const uchar *s = (seq2+pos2)-W;	    const int end = comb_get(diag_lev,diag)-W;	    const int end_ = (end < 0) ? 0 : end;	    const int end__ = (end_ < -diag) ? -diag : end_;	    const uchar *const seq1_end = seq1+end__;	    int sum;	    left_sum = sum = 0;	    while (q > seq1_end  && sum >= left_sum - X) {		if ((sum += ss[*--s][*--q]) > left_sum) {			left_sum = sum;			beg1 = q;		}	    }	}	{   score_t score = right_sum + left_sum;	    const uchar *q = (seq1 + pos1) - W;	    const uchar *s = (seq2 + pos2) - W;	    const uchar *qend = seq1 + pos1;#ifdef DEBUGfprintf(stderr, "seq1=%p\n", seq1);fprintf(stderr, "end1=%p\n", seq1+SEQ_LEN(s1));fprintf(stderr, "q=   %p\n", q);fprintf(stderr, "seq2=%p\n", seq2);fprintf(stderr, "end2=%p\n", seq1+SEQ_LEN(s2));fprintf(stderr, "pos2=%d W=%d\n", pos2, W);fprintf(stderr, "s=   %p\n", s);assert(q >= seq1);assert(s >= seq2);#endif	    while (q < qend) {#ifdef DEBUGif (VALGRIND_CHECK_READABLE(q, sizeof(*q))) fatalf("\nq=%p %d",q, *q);if (VALGRIND_CHECK_READABLE(s, sizeof(*s))) fatalf("\ns=%p %d",s, *s);#endif		score += ss[*q++][*s++];	    }            if (P && score >= K && score <= 10*K) {                double ent = entropy(beg1, end1-beg1);		int old_score = score;                score = (int) ((double)score * ent);		if (score <= K && P > 1)			fprintf(stderr, "pruned hit of score %d at (%d,%d)\n",			   old_score, beg1-seq1, beg1-seq1+diag);	    }	    	    if (score >= K)		msp_add(mt, end1-beg1, beg1-seq1, beg1-seq1+diag, score, 0);	}	comb_set(diag_lev,diag, (end1 - seq1) + W);	return 0;}static msp_table_t *blast_search_unchecked(SEQ *seq1, SEQ *seq2, blast_table_t *bt, msp_table_t *msp_tab, ss_t ss, int X, int K, int P){	uchar *s;	blast_ecode_t ecode;	static comb_t *diag_lev = 0;	int i;	const int W = bt->w.W;	const int len1 = SEQ_LEN(seq1);	const int len2 = SEQ_LEN(seq2);	uchar *const beg2 = SEQ_CHARS(seq2);	uchar *const end2 = beg2+len2;	const signed char *const encoding = bt->w.encoding;	const blast_ecode_t mask = (1UL << ((W-1)*2)) - 1;        if (W<=1) return 0;	{ unsigned int n = (len1+len2+1)*sizeof(int);	  n = roundup(n, 64*1024);	  diag_lev = comb_resize(diag_lev,n,len1);        }	for (s = beg2; s < end2; ) {	restart:		/* init */		ecode = 0;		for (i = 1; i < W && s < end2; ++i) {			const int e = encoding[*s++];			if (e < 0) goto restart;			ecode = (ecode << 2) + e;		}		/* compute */		for (; s < end2; ++s) {			const int e = encoding[*s];			const blast_epos_node_t *h;			if (e < 0) goto restart; /* hit a run of 'X's */			ecode = ((ecode & mask) << 2) + e;			for (h = pos_get(&bt->w, ecode); h; h = h->link)				if (h->ecode == ecode)					msp_extend_hit(msp_tab, seq1, seq2, 						ss, X, K, W, P,						h->pos, s-beg2+1, diag_lev);		} 	}	comb_clear(diag_lev);	return msp_tab;}msp_table_t *blast_search(SEQ *seq1, SEQ *seq2, blast_table_t *bt, msp_table_t *msp, ss_t ss, int X, int K, int P, int T){	if (seq1 == 0) return 0;	if (seq2 == 0) return 0;	if (bt == 0) return 0;	if (T != 0) fatal("blast_search: T must be zero (for now)");	return blast_search_unchecked(seq1, seq2, bt, msp, ss, X, K, P);}static void blast_free_chain(bt_t *bt, blast_ecode_t ecode){	blast_epos_node_t **p = pos_getp(bt, ecode);	blast_epos_node_t *q = *p;	while (q) {		blast_epos_node_t *t = q->link;		ZFREE(q);		q = t;	}	*p = 0; /* clear table entry too */}void blast_table_free(blast_table_t *bt){        if (bt == 0) return;#ifdef USE_OBSTACK	obstack_free(&bt_obstk, (void*)0);#else	{blast_ecode_t ecode;	 for (ecode = 0; ecode < BLAST_POS_TAB_SIZE; ++ecode)		blast_free_chain(&bt->w, ecode);	}#endif	ZFREE(bt->w.epos_tab);	ZFREE(bt);}void msp_free_table(msp_table_t *mt){	if (mt && mt->msp) ZFREE(mt->msp);	if (mt) ZFREE(mt);}/* -----------------------------  filter the MSPs  ---------------------------*/msp_table_t *msp_compress(msp_table_t *mt){	msp_t *p, *q;	int i = 0;	if (mt == 0)		return 0;	if (mt->msp == 0)		return 0;	for (q=p=MSP_TAB_FIRST(mt); MSP_TAB_MORE(mt,p); p=MSP_TAB_NEXT(p)) {		if (p->filter == 0) {			if (p != q)				*q = *p;			q = MSP_TAB_NEXT(q);					++i;		} 	}	mt->num = i;	return mt;}#define DEFINE_MSP_CMP(f) \static int msp_cmp_##f(const void *a, const void *b) { \	return ((const msp_t*)a)->f - ((const msp_t*)b)->f; \}DEFINE_MSP_CMP(pos1)static msp_table_t *msp_sort_by(msp_table_t *mt, msp_cmp_t cmp){	msp_t *msp; 	unsigned int num;	if ((mt == NULL) || ((num = mt->num) == 0))		return 0;        msp = mt->msp;	qsort((char *) msp, num, sizeof(msp_t), cmp);	return mt;}#define DEFINE_MSP_SORT(f) \msp_table_t *msp_sort_##f(msp_table_t *mt) { \      return msp_sort_by(mt, msp_cmp_##f); \}DEFINE_MSP_SORT(pos1)/* --------------------------  sort the MSPs  ------------------------------- *//* sort MSPs by score */static int msp_cmp_score(const void *e, const void *f){	int i;	const msp_t *ee = (const msp_t *)e;	const msp_t *ff = (const msp_t *)f;		if ((i = (ee->score - ff->score)) < 0)                return 1;        else if (i > 0)                return -1;        else if ((i = (ee->pos1 - ff->pos1)) > 0)                return 1;	else if (i < 0)		return -1;        else if ((i = (ee->pos2 - ff->pos2)) > 0)                return 1;	else if (i < 0)		return -1;	else                return 0;} msp_table_t *msp_sort(msp_table_t *mt){	return msp_sort_by(mt, msp_cmp_score);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -