⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 msa2.cpp

📁 unix,linux下编译。用于蛋白质
💻 CPP
字号:
#include "muscle.h"
#include "msa.h"
#include "seqvect.h"
#include "profile.h"
#include "tree.h"

// These global variables are a hack to allow the tree
// dependent iteration code to communicate the edge
// used to divide the tree. The three-way weighting
// scheme needs to know this edge in order to compute
// sequence weights.
static const Tree *g_ptrMuscleTree = 0;
unsigned g_uTreeSplitNode1 = NULL_NEIGHBOR;
unsigned g_uTreeSplitNode2 = NULL_NEIGHBOR;

void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize,
  FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd,
  FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc,
  FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const
	{
	const unsigned uSeqCount = GetSeqCount();
	const unsigned uColCount = GetColCount();

	memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT));
	WEIGHT wTotal = 0;
	FCOUNT fGap = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const WEIGHT w = GetSeqWeight(uSeqIndex);
		if (IsGap(uSeqIndex, uColIndex))
			{
			fGap += w;
			continue;
			}
		else if (IsWildcard(uSeqIndex, uColIndex))
			{
			const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
			switch (g_Alpha)
				{
			case ALPHA_Amino:
				switch (uLetter)
					{
				case AX_B:		// D or N
					fcCounts[AX_D] += w/2;
					fcCounts[AX_N] += w/2;
					break;
				case AX_Z:		// E or Q
					fcCounts[AX_E] += w/2;
					fcCounts[AX_Q] += w/2;
					break;
				default:		// any
					{
					const FCOUNT f = w/20;
					for (unsigned uLetter = 0; uLetter < 20; ++uLetter)
						fcCounts[uLetter] += f;
					break;
					}
					}
				break;

			case ALPHA_DNA:
			case ALPHA_RNA:
				switch (uLetter)
					{
				case AX_R:	// G or A
					fcCounts[NX_G] += w/2;
					fcCounts[NX_A] += w/2;
					break;
				case AX_Y:	// C or T/U
					fcCounts[NX_C] += w/2;
					fcCounts[NX_T] += w/2;
					break;
				default:	// any
					const FCOUNT f = w/20;
					for (unsigned uLetter = 0; uLetter < 4; ++uLetter)
						fcCounts[uLetter] += f;
					break;
					}
				break;

			default:
				Quit("Alphabet %d not supported", g_Alpha);
				}
			continue;
			}
		unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
		fcCounts[uLetter] += w;
		wTotal += w;
		}
	*ptrfOcc = (float) (1.0 - fGap);

	if (bNormalize && wTotal > 0)
		{
		if (wTotal > 1.001)
			Quit("wTotal=%g\n", wTotal);
		for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
			fcCounts[uLetter] /= wTotal;
//		AssertNormalized(fcCounts);
		}

	FCOUNT fcStartCount = 0;
	if (uColIndex == 0)
		{
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			if (IsGap(uSeqIndex, uColIndex))
				fcStartCount += GetSeqWeight(uSeqIndex);
		}
	else
		{
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1))
				fcStartCount += GetSeqWeight(uSeqIndex);
		}

	FCOUNT fcEndCount = 0;
	if (uColCount - 1 == uColIndex)
		{
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			if (IsGap(uSeqIndex, uColIndex))
				fcEndCount += GetSeqWeight(uSeqIndex);
		}
	else
		{
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1))
				fcEndCount += GetSeqWeight(uSeqIndex);
		}

	FCOUNT LL = 0;
	FCOUNT LG = 0;
	FCOUNT GL = 0;
	FCOUNT GG = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		WEIGHT w = GetSeqWeight(uSeqIndex);
		bool bLetterHere = !IsGap(uSeqIndex, uColIndex);
		bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1));
		if (bLetterHere)
			{
			if (bLetterPrev)
				LL += w;
			else
				GL += w;
			}
		else
			{
			if (bLetterPrev)
				LG += w;
			else
				GG += w;
			}
		}

	FCOUNT fcExtendCount = 0;
	if (uColIndex > 0 && uColIndex < GetColCount() - 1)
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) &&
			  IsGap(uSeqIndex, uColIndex + 1))
				fcExtendCount += GetSeqWeight(uSeqIndex);

	*ptrfcLL = LL;
	*ptrfcLG = LG;
	*ptrfcGL = GL;
	*ptrfcGG = GG;
	*ptrfcGapStart = fcStartCount;
	*ptrfcGapEnd = fcEndCount;
	*ptrfcGapExtend = fcExtendCount;
	}

// Return true if the given column has no gaps and all
// its residues are in the same biochemical group.
bool MSAColIsConservative(const MSA &msa, unsigned uColIndex)
	{
	extern unsigned ResidueGroup[];

	const unsigned uSeqCount = msa.GetColCount();
	if (0 == uSeqCount)
		Quit("MSAColIsConservative: empty alignment");

	if (msa.IsGap(0, uColIndex))
		return false;

	unsigned uLetter = msa.GetLetterEx(0, uColIndex);
	const unsigned uGroup = ResidueGroup[uLetter];

	for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		if (msa.IsGap(uSeqIndex, uColIndex))
			return false;
		uLetter = msa.GetLetter(uSeqIndex, uColIndex);
		if (ResidueGroup[uLetter] != uGroup)
			return false;
		}
	return true;
	}

void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
  MSA &msaOut)
	{
	const unsigned uColCount = msaIn.GetColCount();
	msaOut.SetSize(uSeqCount, uColCount);

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const char *ptrName = msaIn.GetSeqName(uFromSeqIndex + uSeqIndex);
		msaOut.SetSeqName(uSeqIndex, ptrName);

		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
			{
			const char c = msaIn.GetChar(uFromSeqIndex + uSeqIndex, uColIndex);
			msaOut.SetChar(uSeqIndex, uColIndex, c);
			}
		}
	}

void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount,
  MSA &msaOut)
	{
	const unsigned uSeqCount = msaIn.GetSeqCount();
	const unsigned uInColCount = msaIn.GetColCount();

	if (uFromColIndex + uColCount - 1 > uInColCount)
		Quit("MSAFromColRange, out of bounds");

	msaOut.SetSize(uSeqCount, uColCount);

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const char *ptrName = msaIn.GetSeqName(uSeqIndex);
		unsigned uId = msaIn.GetSeqId(uSeqIndex);
		msaOut.SetSeqName(uSeqIndex, ptrName);
		msaOut.SetSeqId(uSeqIndex, uId);

		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
			{
			const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex);
			msaOut.SetChar(uSeqIndex, uColIndex, c);
			}
		}
	}

void SeqVectFromMSA(const MSA &msa, SeqVect &v)
	{
	v.Clear();
	const unsigned uSeqCount = msa.GetSeqCount();
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq s;
		msa.GetSeq(uSeqIndex, s);

		s.StripGaps();
		//if (0 == s.Length())
		//	continue;

		const char *ptrName = msa.GetSeqName(uSeqIndex);
		s.SetName(ptrName);

		v.AppendSeq(s);
		}
	}

void DeleteGappedCols(MSA &msa)
	{
	unsigned uColIndex = 0;
	for (;;)
		{
		if (uColIndex >= msa.GetColCount())
			break;
		if (msa.IsGapColumn(uColIndex))
			msa.DeleteCol(uColIndex);
		else
			++uColIndex;
		}
	}

void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount,
  MSA &msaOut)
	{
	const unsigned uColCount = msaIn.GetColCount();
	msaOut.SetSize(uSeqCount, uColCount);
	for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut)
		{
		unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut];
		const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);
		unsigned uId = msaIn.GetSeqId(uSeqIndexIn);
		msaOut.SetSeqName(uSeqIndexOut, ptrName);
		msaOut.SetSeqId(uSeqIndexOut, uId);
		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
			{
			const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);
			msaOut.SetChar(uSeqIndexOut, uColIndex, c);
			}
		}
	}

void AssertMSAEqIgnoreCaseAndGaps(const MSA &msa1, const MSA &msa2)
	{
	const unsigned uSeqCount1 = msa1.GetSeqCount();
	const unsigned uSeqCount2 = msa2.GetSeqCount();
	if (uSeqCount1 != uSeqCount2)
		Quit("Seq count differs");

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
		{
		Seq seq1;
		msa1.GetSeq(uSeqIndex, seq1);

		unsigned uId = msa1.GetSeqId(uSeqIndex);
		unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);

		Seq seq2;
		msa2.GetSeq(uSeqIndex2, seq2);

		if (!seq1.EqIgnoreCaseAndGaps(seq2))
			{
			Log("Input:\n");
			seq1.LogMe();
			Log("Output:\n");
			seq2.LogMe();
			Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
			}
		}
	}

void AssertMSAEq(const MSA &msa1, const MSA &msa2)
	{
	const unsigned uSeqCount1 = msa1.GetSeqCount();
	const unsigned uSeqCount2 = msa2.GetSeqCount();
	if (uSeqCount1 != uSeqCount2)
		Quit("Seq count differs");

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex)
		{
		Seq seq1;
		msa1.GetSeq(uSeqIndex, seq1);

		unsigned uId = msa1.GetSeqId(uSeqIndex);
		unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);

		Seq seq2;
		msa2.GetSeq(uSeqIndex2, seq2);

		if (!seq1.Eq(seq2))
			{
			Log("Input:\n");
			seq1.LogMe();
			Log("Output:\n");
			seq2.LogMe();
			Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex));
			}
		}
	}

void SetMSAWeightsMuscle(MSA &msa)
	{
	SEQWEIGHT Method = GetSeqWeightMethod();
	switch (Method)
		{
	case SEQWEIGHT_None:
		msa.SetUniformWeights();
		return;

	case SEQWEIGHT_Henikoff:
		msa.SetHenikoffWeights();
		return;

	case SEQWEIGHT_HenikoffPB:
		msa.SetHenikoffWeightsPB();
		return;

	case SEQWEIGHT_GSC:
		msa.SetGSCWeights();
		return;

	case SEQWEIGHT_ClustalW:
		SetClustalWWeightsMuscle(msa);
		return;
	
	case SEQWEIGHT_ThreeWay:
		SetThreeWayWeightsMuscle(msa);
		return;
		}
	Quit("SetMSAWeightsMuscle, Invalid method=%d", Method);
	}

static WEIGHT *g_MuscleWeights;
static unsigned g_uMuscleIdCount;

WEIGHT GetMuscleSeqWeightById(unsigned uId)
	{
	if (0 == g_MuscleWeights)
		Quit("g_MuscleWeights = 0");
	if (uId >= g_uMuscleIdCount)
		Quit("GetMuscleSeqWeightById(%u): count=%u",
		  uId, g_uMuscleIdCount);

	return g_MuscleWeights[uId];
	}

void SetMuscleTree(const Tree &tree)
	{
	g_ptrMuscleTree = &tree;

	if (SEQWEIGHT_ClustalW != GetSeqWeightMethod())
		return;

	delete[] g_MuscleWeights;

	const unsigned uLeafCount = tree.GetLeafCount();
	g_uMuscleIdCount = uLeafCount;
	g_MuscleWeights = new WEIGHT[uLeafCount];
	CalcClustalWWeights(tree, g_MuscleWeights);
	}

void SetClustalWWeightsMuscle(MSA &msa)
	{
	if (0 == g_MuscleWeights)
		Quit("g_MuscleWeights = 0");
	const unsigned uSeqCount = msa.GetSeqCount();
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const unsigned uId = msa.GetSeqId(uSeqIndex);
		if (uId >= g_uMuscleIdCount)
			Quit("SetClustalWWeightsMuscle: id out of range");
		msa.SetSeqWeight(uSeqIndex, g_MuscleWeights[uId]);
		}
	msa.NormalizeWeights((WEIGHT) 1.0);
	}

#define	LOCAL_VERBOSE	0

void SetThreeWayWeightsMuscle(MSA &msa)
	{
	if (NULL_NEIGHBOR == g_uTreeSplitNode1 || NULL_NEIGHBOR == g_uTreeSplitNode2)
		{
		msa.SetHenikoffWeightsPB();
		return;
		}

	const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();
	WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];

	CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,
	  Weights);

	const unsigned uMSASeqCount = msa.GetSeqCount();
	for (unsigned uSeqIndex = 0; uSeqIndex < uMSASeqCount; ++uSeqIndex)
		{
		const unsigned uId = msa.GetSeqId(uSeqIndex);
		if (uId >= uMuscleSeqCount)
			Quit("SetThreeWayWeightsMuscle: id out of range");
		msa.SetSeqWeight(uSeqIndex, Weights[uId]);
		}
#if	LOCAL_VERBOSE
	{
	Log("SetThreeWayWeightsMuscle\n");
	for (unsigned n = 0; n < uMSASeqCount; ++n)
		{
		const unsigned uId = msa.GetSeqId(n);
		Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
		}
	}
#endif
	msa.NormalizeWeights((WEIGHT) 1.0);

	delete[] Weights;
	}

// Append msa2 at the end of msa1
void MSAAppend(MSA &msa1, const MSA &msa2)
	{
	const unsigned uSeqCount = msa1.GetSeqCount();

	const unsigned uColCount1 = msa1.GetColCount();
	const unsigned uColCount2 = msa2.GetColCount();
	const unsigned uColCountCat = uColCount1 + uColCount2;

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned uId = msa1.GetSeqId(uSeqIndex);
		unsigned uSeqIndex2 = msa2.GetSeqIndex(uId);
		for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
			{
			const char c = msa2.GetChar(uSeqIndex2, uColIndex);
			msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
			}
		}
	}

// "Catenate" two MSAs (by bad analogy with UNIX cat command).
// msa1 and msa2 must have same sequence names, but possibly
// in a different order.
// msaCat is the combined alignment produce by appending
// sequences in msa2 to sequences in msa1.
void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat)
	{
	const unsigned uSeqCount = msa1.GetSeqCount();

	const unsigned uColCount1 = msa1.GetColCount();
	const unsigned uColCount2 = msa2.GetColCount();
	const unsigned uColCountCat = uColCount1 + uColCount2;

	msaCat.SetSize(uSeqCount, uColCountCat);

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex)
			{
			const char c = msa1.GetChar(uSeqIndex, uColIndex);
			msaCat.SetChar(uSeqIndex, uColIndex, c);
			}

		const char *ptrSeqName = msa1.GetSeqName(uSeqIndex);
		unsigned uSeqIndex2;
		msaCat.SetSeqName(uSeqIndex, ptrSeqName);
		bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2);
		if (bFound)
			{
			for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
				{
				const char c = msa2.GetChar(uSeqIndex2, uColIndex);
				msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c);
				}
			}
		else
			{
			for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex)
				msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-');
			}
		}
	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -