📄 fastdistmafft.cpp

📁 unix,linux下编译。用于蛋白质
💻 CPP
字号:
#include "muscle.h"
#include "distfunc.h"
#include "seqvect.h"
#include <math.h>

#define TRACE 0

#define MIN(x, y)	(((x) < (y)) ? (x) : (y))
#define MAX(x, y)	(((x) > (y)) ? (x) : (y))

const unsigned TUPLE_COUNT = 6*6*6*6*6*6;
static unsigned char Count1[TUPLE_COUNT];
static unsigned char Count2[TUPLE_COUNT];

// Amino acid groups according to MAFFT (sextet5)
// 0 =  A G P S T
// 1 =  I L M V
// 2 =  N D Q E B Z
// 3 =  R H K
// 4 =  F W Y
// 5 =  C
// 6 =  X . - U
unsigned ResidueGroup[] =
	{
	0,		// AX_A,
	5,		// AX_C,
	2,		// AX_D,
	2,		// AX_E,
	4,		// AX_F,
	0,		// AX_G,
	3,		// AX_H,
	1,		// AX_I,
	3,		// AX_K,
	1,		// AX_L,
	1,		// AX_M,
	2,		// AX_N,
	0,		// AX_P,
	2,		// AX_Q,
	3,		// AX_R,
	0,		// AX_S,
	0,		// AX_T,
	1,		// AX_V,
	4,		// AX_W,
	4,		// AX_Y,

	2,		// AX_B,	// D or N
	2,		// AX_Z,	// E or Q
	0,		// AX_X,	// Unknown		// ******** TODO *************
										// This isn't the correct way of avoiding group 6
	0		// AX_GAP,					// ******** TODO ******************
	};
unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);

static char *TupleToStr(int t)
	{
	static char s[7];
	int t1, t2, t3, t4, t5, t6;

	t1 = t%6;
	t2 = (t/6)%6;
	t3 = (t/(6*6))%6;
	t4 = (t/(6*6*6))%6;
	t5 = (t/(6*6*6*6))%6;
	t6 = (t/(6*6*6*6*6))%6;

	s[5] = '0' + t1;
	s[4] = '0' + t2;
	s[3] = '0' + t3;
	s[2] = '0' + t4;
	s[1] = '0' + t5;
	s[0] = '0' + t6;
	return s;
	}

static unsigned GetTuple(const unsigned uLetters[], unsigned n)
	{
	assert(uLetters[n] < uResidueGroupCount);
	assert(uLetters[n+1] < uResidueGroupCount);
	assert(uLetters[n+2] < uResidueGroupCount);
	assert(uLetters[n+3] < uResidueGroupCount);
	assert(uLetters[n+4] < uResidueGroupCount);
	assert(uLetters[n+5] < uResidueGroupCount);

	unsigned u1 = ResidueGroup[uLetters[n]];
	unsigned u2 = ResidueGroup[uLetters[n+1]];
	unsigned u3 = ResidueGroup[uLetters[n+2]];
	unsigned u4 = ResidueGroup[uLetters[n+3]];
	unsigned u5 = ResidueGroup[uLetters[n+4]];
	unsigned u6 = ResidueGroup[uLetters[n+5]];

	return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6;
	}

static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
	{
	memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
	for (unsigned n = 0; n < uTupleCount; ++n)
		{
		const unsigned uTuple = GetTuple(L, n);
		++(Count[uTuple]);
		}
	}

static void ListCount(const unsigned char Count[])
	{
	for (unsigned n = 0; n < TUPLE_COUNT; ++n)
		{
		if (0 == Count[n])
			continue;
		Log("%s  %u\n", TupleToStr(n), Count[n]);
		}
	}

void DistKmer6_6(const SeqVect &v, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	DF.SetCount(uSeqCount);
	if (0 == uSeqCount)
		return;

// Initialize distance matrix to zero
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			DF.SetDist(uSeq1, uSeq2, 0);
		}

// Convert to letters
	unsigned **Letters = new unsigned *[uSeqCount];
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = *(v[uSeqIndex]);
		const unsigned uSeqLength = s.Length();
		unsigned *L = new unsigned[uSeqLength];
		Letters[uSeqIndex] = L;
		for (unsigned n = 0; n < uSeqLength; ++n)
			{
			char c = s[n];
			L[n] = CharToLetterEx(c);
			assert(L[n] < uResidueGroupCount);
			}
		}

	unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
	for (unsigned n = 0; n < uSeqCount; ++n)
		{
		uCommonTupleCount[n] = new unsigned[uSeqCount];
		memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
		}

	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		Seq &seq1 = *(v[uSeq1]);
		const unsigned uSeqLength1 = seq1.Length();
		if (uSeqLength1 < 5)
			continue;

		const unsigned uTupleCount = uSeqLength1 - 5;
		const unsigned *L = Letters[uSeq1];
		CountTuples(L, uTupleCount, Count1);
#if	TRACE
		{
		Log("Seq1=%d\n", uSeq1);
		Log("Groups:\n");
		for (unsigned n = 0; n < uSeqLength1; ++n)
			Log("%u", ResidueGroup[L[n]]);
		Log("\n");

		Log("Tuples:\n");
		ListCount(Count1);
		}
#endif

		SetProgressDesc("K-mer dist pass 1");
		for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount);
			++uCount;
			Seq &seq2 = *(v[uSeq2]);
			const unsigned uSeqLength2 = seq2.Length();
			if (uSeqLength2 < 5)
				{
				if (uSeq1 == uSeq2)
					DF.SetDist(uSeq1, uSeq2, 0);
				else
					DF.SetDist(uSeq1, uSeq2, 1);
				continue;
				}

		// First pass through seq 2 to count tuples
			const unsigned uTupleCount = uSeqLength2 - 5;
			const unsigned *L = Letters[uSeq2];
			CountTuples(L, uTupleCount, Count2);
#if	TRACE
			Log("Seq2=%d Counts=\n", uSeq2);
			ListCount(Count2);
#endif

		// Second pass to accumulate sum of shared tuples
		// MAFFT defines this as the sum over unique tuples
		// in seq2 of the minimum of the number of tuples found
		// in the two sequences.
			unsigned uSum = 0;
			for (unsigned n = 0; n < uTupleCount; ++n)
				{
				const unsigned uTuple = GetTuple(L, n);
				uSum += MIN(Count1[uTuple], Count2[uTuple]);

			// This is a hack to make sure each unique tuple counted only once.
				Count2[uTuple] = 0;
				}
#if	TRACE
			{
			Seq &s1 = *(v[uSeq1]);
			Seq &s2 = *(v[uSeq2]);
			const char *pName1 = s1.GetName();
			const char *pName2 = s2.GetName();
			Log("Common count %s(%d) - %s(%d) =%u\n",
			  pName1, uSeq1, pName2, uSeq2, uSum);
			}
#endif
			uCommonTupleCount[uSeq1][uSeq2] = uSum;
			uCommonTupleCount[uSeq2][uSeq1] = uSum;
			}
		}
	ProgressStepsDone();

	uCount = 0;
	SetProgressDesc("K-mer dist pass 2");
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		Seq &s1 = *(v[uSeq1]);
		const char *pName1 = s1.GetName();

		double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
		if (0 == dCommonTupleCount11)
			dCommonTupleCount11 = 1;

		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount);
			++uCount;

			double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
			if (0 == dCommonTupleCount22)
				dCommonTupleCount22 = 1;

			const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount11;
			const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount22;

		// dMinDist is the value used for tree-building in MAFFT
			const double dMinDist = MIN(dDist1, dDist2);
			DF.SetDist(uSeq1, uSeq2, (float) dMinDist);

			//const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
			//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
		// **** TODO **** why does this make score slightly worse??
			//const double dKimuraDist = KimuraDist(dEstimatedPctId);
			//DF.SetDist(uSeq1, uSeq2, dKimuraDist);
			}
		}
	ProgressStepsDone();

	for (unsigned n = 0; n < uSeqCount; ++n)
		delete[] uCommonTupleCount[n];
	delete[] uCommonTupleCount;
	delete[] Letters;
	}

double PctIdToMAFFTDist(double dPctId)
	{
	if (dPctId < 0.05)
		dPctId = 0.05;
	double dDist = -log(dPctId);
	return dDist;
	}

double PctIdToHeightMAFFT(double dPctId)
	{
	return PctIdToMAFFTDist(dPctId);
	}
💿 文件大小 243 K
👤 上传用户 jc6629402
📂 所属分类生物技术
🏷️ 相关标签

#linux #unix #下编译 #蛋白质
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -