⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 subfam.cpp

📁 unix,linux下编译。用于蛋白质
💻 CPP
字号:
#include "muscle.h"
#include "tree.h"
#include "textfile.h"	// for test only
#include "msa.h"
#include "seqvect.h"
#include "profile.h"
#ifndef _MSC_VER
#include <unistd.h>	//	for unlink
#endif

#define TRACE	0

/***
Find subfamilies from tree by following criteria:

(a) number of leaves <= max,
(b) is monophyletic, i.e. most recent common ancestor is parent
of no more than one subfamily.
***/

static unsigned SubFamRecurse(const Tree &tree, unsigned uNodeIndex, unsigned uMaxLeafCount,
  unsigned SubFams[], unsigned &uSubFamCount)
	{
	if (tree.IsLeaf(uNodeIndex))
		return 1;

	unsigned uLeft = tree.GetLeft(uNodeIndex);
	unsigned uRight = tree.GetRight(uNodeIndex);
	unsigned uLeftCount = SubFamRecurse(tree, uLeft, uMaxLeafCount, SubFams, uSubFamCount);
	unsigned uRightCount = SubFamRecurse(tree, uRight, uMaxLeafCount, SubFams, uSubFamCount);

	unsigned uLeafCount = uLeftCount + uRightCount;
	if (uLeftCount + uRightCount > uMaxLeafCount)
		{
		if (uLeftCount <= uMaxLeafCount)
			SubFams[uSubFamCount++] = uLeft;
		if (uRightCount <= uMaxLeafCount)
			SubFams[uSubFamCount++] = uRight;
		}
	else if (tree.IsRoot(uNodeIndex))
		{
		if (uSubFamCount != 0)
			Quit("Error in SubFamRecurse");
		SubFams[uSubFamCount++] = uNodeIndex;
		}

	return uLeafCount;
	}

void SubFam(const Tree &tree, unsigned uMaxLeafCount, unsigned SubFams[], unsigned *ptruSubFamCount)
	{
	*ptruSubFamCount = 0;
	SubFamRecurse(tree, tree.GetRootNodeIndex(), uMaxLeafCount, SubFams, *ptruSubFamCount);

#if	TRACE
	{
	Log("\n");
	Log("Tree:\n");
	tree.LogMe();
	//void DrawTree(const Tree &tree);
	//DrawTree(tree);
	Log("\n");
	Log("%d subfams:\n", *ptruSubFamCount);
	for (unsigned i = 0; i < *ptruSubFamCount; ++i)
		Log("  %d=%d", i, SubFams[i]);
	Log("\n");
	}
#endif
	}

//unsigned SubFams[9999];
//unsigned uSubFamCount;
//
//static unsigned DistFromRoot(const Tree &tree, unsigned uNodeIndex)
//	{
//	const unsigned uRoot = tree.GetRootNodeIndex();
//	unsigned uDist = 0;
//	while (uNodeIndex != uRoot)
//		{
//		++uDist;
//		uNodeIndex = tree.GetParent(uNodeIndex);
//		}
//	return uDist;
//	}
//
//static void DrawNode(const Tree &tree, unsigned uNodeIndex)
//	{
//	if (!tree.IsLeaf(uNodeIndex))
//		DrawNode(tree, tree.GetLeft(uNodeIndex));
//
//	unsigned uDist = DistFromRoot(tree, uNodeIndex);
//	for (unsigned i = 0; i < 5*uDist; ++i)
//		Log(" ");
//	Log("%d", uNodeIndex);
//	for (unsigned i = 0; i < uSubFamCount; ++i)
//		if (uNodeIndex == SubFams[i])
//			{
//			Log("*");
//			break;
//			}
//	Log("\n");
//
//	if (!tree.IsLeaf(uNodeIndex))
//		DrawNode(tree, tree.GetRight(uNodeIndex));
//	}
//
//static void DrawTree(const Tree &tree)
//	{
//	unsigned uRoot = tree.GetRootNodeIndex();
//	DrawNode(tree, uRoot);
//	}
//
//void TestSubFams(const char *FileName)
//	{
//	Tree tree;
//	TextFile f(FileName);
//	tree.FromFile(f);
//	SubFam(tree, 5, SubFams, &uSubFamCount);
//	DrawTree(tree);
//	}

static void SetInFam(const Tree &tree, unsigned uNodeIndex, bool NodeInSubFam[])
	{
	if (tree.IsLeaf(uNodeIndex))
		return;
	unsigned uLeft = tree.GetLeft(uNodeIndex);
	unsigned uRight = tree.GetRight(uNodeIndex);
	NodeInSubFam[uLeft] = true;
	NodeInSubFam[uRight] = true;

	SetInFam(tree, uLeft, NodeInSubFam);
	SetInFam(tree, uRight, NodeInSubFam);
	}

void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex,
  MSA &msaOut)
	{
	const unsigned uSeqCount = vAll.GetSeqCount();

	const char *InTmp = "asf_in.tmp";
	const char *OutTmp = "asf_out.tmp";

	unsigned *Leaves = new unsigned[uSeqCount];
	unsigned uLeafCount;
	GetLeaves(GuideTree, uNodeIndex, Leaves, &uLeafCount);

	SeqVect v;
	for (unsigned i = 0; i < uLeafCount; ++i)
		{
		unsigned uLeafNodeIndex = Leaves[i];
		unsigned uId = GuideTree.GetLeafId(uLeafNodeIndex);
		Seq &s = vAll.GetSeqById(uId);
		v.AppendSeq(s);
		}

#if	TRACE
	{
	Log("Align subfam[node=%d, size=%d] ", uNodeIndex, uLeafCount);
	for (unsigned i = 0; i < uLeafCount; ++i)
		Log(" %s", v.GetSeqName(i));
	Log("\n");
	}
#endif

	TextFile fIn(InTmp, true);

	v.ToFASTAFile(fIn);
	fIn.Close();

	char CmdLine[4096];
	sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp);
//	sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp);
	system(CmdLine);

	TextFile fOut(OutTmp);
	msaOut.FromFile(fOut);

	for (unsigned uSeqIndex = 0; uSeqIndex < uLeafCount; ++uSeqIndex)
		{
		const char *Name = msaOut.GetSeqName(uSeqIndex);
		unsigned uId = vAll.GetSeqIdFromName(Name);
		msaOut.SetSeqId(uSeqIndex, uId);
		}

	unlink(InTmp);
	unlink(OutTmp);

	delete[] Leaves;
	}

void ProgAlignSubFams()
	{
	MSA msaOut;

	SetOutputFileName(g_pstrOutFileName);
	SetInputFileName(g_pstrInFileName);

	SetMaxIters(g_uMaxIters);
	SetSeqWeightMethod(g_SeqWeight1);

	TextFile fileIn(g_pstrInFileName);
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType)
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

	PTR_SCOREMATRIX UserMatrix = 0;
	if (0 != g_pstrMatrixFileName)
		{
		const char *FileName = g_pstrMatrixFileName;
		const char *Path = getenv("MUSCLE_MXPATH");
		if (Path != 0)
			{
			size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
			char *NewFileName = new char[n];
			sprintf(NewFileName, "%s/%s", Path, FileName);
			FileName = NewFileName;
			}
		TextFile File(FileName);
		UserMatrix = ReadMx(File);
		g_Alpha = ALPHA_Amino;
		g_PPScore = PPSCORE_SP;
		}

	SetPPScore();

	if (0 != UserMatrix)
		g_ptrScoreMatrix = UserMatrix;

	if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
		{
		SetPPScore(PPSCORE_SPN);
		g_Distance1 = DISTANCE_Kmer4_6;
		}

	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags = g_bDiags1;
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (uSeqCount > 1)
		MHackStart(v);

	if (0 == uSeqCount)
		{
		msaOut.Clear();
		return;
		}

	if (1 == uSeqCount && ALPHA_Amino == Alpha)
		{
		const Seq &s = v.GetSeq(0);
		msaOut.FromSeq(s);
		return;
		}

	Tree GuideTree;
	TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1);
	SetMuscleTree(GuideTree);

	MSA msa;
	if (g_bLow)
		{
		ProgNode *ProgNodes = 0;
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
		delete[] ProgNodes;
		}
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);
	TreeFromMSA(msa, GuideTree, g_Cluster2, g_Distance2, g_Root2);
	SetMuscleTree(GuideTree);

	unsigned *SubFams = new unsigned[uSeqCount];
	unsigned uSubFamCount;
	SubFam(GuideTree, g_uMaxSubFamCount, SubFams, &uSubFamCount);

	SetProgressDesc("Align node");
	const unsigned uNodeCount = 2*uSeqCount - 1;

	ProgNode *ProgNodes = new ProgNode[uNodeCount];
	bool *NodeIsSubFam = new bool[uNodeCount];
	bool *NodeInSubFam = new bool[uNodeCount];

	for (unsigned i = 0; i < uNodeCount; ++i)
		{
		NodeIsSubFam[i] = false;
		NodeInSubFam[i] = false;
		}

	for (unsigned i = 0; i < uSubFamCount; ++i)
		{
		unsigned uNodeIndex = SubFams[i];
		assert(uNodeIndex < uNodeCount);
		NodeIsSubFam[uNodeIndex] = true;
		SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
		}

	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	do
		{
		if (NodeIsSubFam[uTreeNodeIndex])
			{
#if	TRACE
			Log("Node %d: align subfam\n", uTreeNodeIndex);
#endif
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
			Node.m_uLength = Node.m_MSA.GetColCount();
			}
		else if (!NodeInSubFam[uTreeNodeIndex])
			{
#if	TRACE
			Log("Node %d: align two subfams\n", uTreeNodeIndex);
#endif
			Progress(uJoin, uSubFamCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

			PWPath Path;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
			Parent.m_uLength = Parent.m_MSA.GetColCount();

			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();
			}
		else
			{
#if	TRACE
			Log("Node %d: in subfam\n", uTreeNodeIndex);
#endif
			;
			}
		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);
	ProgressStepsDone();

	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];

	TextFile fOut(g_pstrOutFileName, true);
	MHackEnd(RootProgNode.m_MSA);
	RootProgNode.m_MSA.ToFile(fOut);

	delete[] NodeInSubFam;
	delete[] NodeIsSubFam;
	delete[] ProgNodes;
	delete[] SubFams;

	ProgNodes = 0;
	NodeInSubFam = 0;
	NodeIsSubFam = 0;
	SubFams = 0;
	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -