📄 upgma2.cpp
字号:
#include "muscle.h"
#include "tree.h"
#include "distcalc.h"
// UPGMA clustering in O(N^2) time and space.
#define TRACE 0
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define AVG(x, y) (((x) + (y))/2)
static unsigned g_uLeafCount;
static unsigned g_uTriangleSize;
static unsigned g_uInternalNodeCount;
static unsigned g_uInternalNodeIndex;
// Triangular distance matrix is g_Dist, which is allocated
// as a one-dimensional vector of length g_uTriangleSize.
// TriangleSubscript(i,j) maps row,column=i,j to the subscript
// into this vector.
// Row / column coordinates are a bit messy.
// Initially they are leaf indexes 0..N-1.
// But each time we create a new node (=new cluster, new subtree),
// we re-use one of the two rows that become available (the children
// of the new node). This saves memory.
// We keep track of this through the g_uNodeIndex vector.
static dist_t *g_Dist;
// Distance to nearest neighbor in row i of distance matrix.
// Subscript is distance matrix row.
static dist_t *g_MinDist;
// Nearest neighbor to row i of distance matrix.
// Subscript is distance matrix row.
static unsigned *g_uNearestNeighbor;
// Node index of row i in distance matrix.
// Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
// Subscript is distance matrix row.
static unsigned *g_uNodeIndex;
// The following vectors are defined on internal nodes,
// subscripts are internal node index 0..N-2.
// For g_uLeft/Right, value is the node index 0 .. 2N-2
// because a child can be internal or leaf.
static unsigned *g_uLeft;
static unsigned *g_uRight;
static dist_t *g_Height;
static dist_t *g_LeftLength;
static dist_t *g_RightLength;
static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2)
{
#if DEBUG
if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
#endif
unsigned v;
if (uIndex1 >= uIndex2)
v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
else
v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
return v;
}
static void ListState()
{
Log("Dist matrix\n");
Log(" ");
for (unsigned i = 0; i < g_uLeafCount; ++i)
{
if (uInsane == g_uNodeIndex[i])
continue;
Log(" %5u", g_uNodeIndex[i]);
}
Log("\n");
for (unsigned i = 0; i < g_uLeafCount; ++i)
{
if (uInsane == g_uNodeIndex[i])
continue;
Log("%5u ", g_uNodeIndex[i]);
for (unsigned j = 0; j < g_uLeafCount; ++j)
{
if (uInsane == g_uNodeIndex[j])
continue;
if (i == j)
Log(" ");
else
{
unsigned v = TriangleSubscript(i, j);
Log("%5.2g ", g_Dist[v]);
}
}
Log("\n");
}
Log("\n");
Log(" i Node NrNb Dist\n");
Log("----- ----- ----- --------\n");
for (unsigned i = 0; i < g_uLeafCount; ++i)
{
if (uInsane == g_uNodeIndex[i])
continue;
Log("%5u %5u %5u %8.3f\n",
i,
g_uNodeIndex[i],
g_uNearestNeighbor[i],
g_MinDist[i]);
}
Log("\n");
Log(" Node L R Height LLength RLength\n");
Log("----- ----- ----- ------ ------- -------\n");
for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i)
Log("%5u %5u %5u %6.2g %6.2g %6.2g\n",
i,
g_uLeft[i],
g_uRight[i],
g_Height[i],
g_LeftLength[i],
g_RightLength[i]);
}
void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage)
{
g_uLeafCount = DC.GetCount();
g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
g_uInternalNodeCount = g_uLeafCount - 1;
g_Dist = new dist_t[g_uTriangleSize];
g_uNodeIndex = new unsigned[g_uLeafCount];
g_uNearestNeighbor = new unsigned[g_uLeafCount];
g_MinDist = new dist_t[g_uLeafCount];
unsigned *Ids = new unsigned [g_uLeafCount];
char **Names = new char *[g_uLeafCount];
g_uLeft = new unsigned[g_uInternalNodeCount];
g_uRight = new unsigned[g_uInternalNodeCount];
g_Height = new dist_t[g_uInternalNodeCount];
g_LeftLength = new dist_t[g_uInternalNodeCount];
g_RightLength = new dist_t[g_uInternalNodeCount];
for (unsigned i = 0; i < g_uLeafCount; ++i)
{
g_MinDist[i] = BIG_DIST;
g_uNodeIndex[i] = i;
g_uNearestNeighbor[i] = uInsane;
Ids[i] = DC.GetId(i);
Names[i] = strsave(DC.GetName(i));
}
for (unsigned i = 0; i < g_uInternalNodeCount; ++i)
{
g_uLeft[i] = uInsane;
g_uRight[i] = uInsane;
g_LeftLength[i] = BIG_DIST;
g_RightLength[i] = BIG_DIST;
g_Height[i] = BIG_DIST;
}
// Compute initial NxN triangular distance matrix.
// Store minimum distance for each full (not triangular) row.
// Loop from 1, not 0, because "row" is 0, 1 ... i-1,
// so nothing to do when i=0.
for (unsigned i = 1; i < g_uLeafCount; ++i)
{
dist_t *Row = g_Dist + TriangleSubscript(i, 0);
DC.CalcDistRange(i, Row);
for (unsigned j = 0; j < i; ++j)
{
const dist_t d = Row[j];
if (d < g_MinDist[i])
{
g_MinDist[i] = d;
g_uNearestNeighbor[i] = j;
}
if (d < g_MinDist[j])
{
g_MinDist[j] = d;
g_uNearestNeighbor[j] = i;
}
}
}
#if TRACE
Log("Initial state:\n");
ListState();
#endif
for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1;
++g_uInternalNodeIndex)
{
#if TRACE
Log("\n");
Log("Internal node index %5u\n", g_uInternalNodeIndex);
Log("-------------------------\n");
#endif
// Find nearest neighbors
unsigned Lmin = uInsane;
unsigned Rmin = uInsane;
dist_t dtMinDist = BIG_DIST;
for (unsigned j = 0; j < g_uLeafCount; ++j)
{
if (uInsane == g_uNodeIndex[j])
continue;
dist_t d = g_MinDist[j];
if (d < dtMinDist)
{
dtMinDist = d;
Lmin = j;
Rmin = g_uNearestNeighbor[j];
assert(uInsane != Rmin);
assert(uInsane != g_uNodeIndex[Rmin]);
}
}
assert(Lmin != uInsane);
assert(Rmin != uInsane);
assert(dtMinDist != BIG_DIST);
#if TRACE
Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
Lmin,
g_uNodeIndex[Lmin],
Rmin,
g_uNodeIndex[Rmin],
dtMinDist);
#endif
// Compute distances to new node
// New node overwrites row currently assigned to Lmin
dist_t dtNewMinDist = BIG_DIST;
unsigned uNewNearestNeighbor = uInsane;
for (unsigned j = 0; j < g_uLeafCount; ++j)
{
if (j == Lmin || j == Rmin)
continue;
if (uInsane == g_uNodeIndex[j])
continue;
const unsigned vL = TriangleSubscript(Lmin, j);
const unsigned vR = TriangleSubscript(Rmin, j);
const dist_t dL = g_Dist[vL];
const dist_t dR = g_Dist[vR];
dist_t dtNewDist;
switch (Linkage)
{
case LINKAGE_Avg:
dtNewDist = AVG(dL, dR);
break;
case LINKAGE_Min:
dtNewDist = MIN(dL, dR);
break;
case LINKAGE_Max:
dtNewDist = MAX(dL, dR);
break;
case LINKAGE_Biased:
dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
break;
default:
Quit("UPGMA2: Invalid LINKAGE_%u", Linkage);
}
// Nasty special case.
// If nearest neighbor of j is Lmin or Rmin, then make the new
// node (which overwrites the row currently occupied by Lmin)
// the nearest neighbor. This situation can occur when there are
// equal distances in the matrix. If we don't make this fix,
// the nearest neighbor pointer for j would become invalid.
// (We don't need to test for == Lmin, because in that case
// the net change needed is zero due to the change in row
// numbering).
if (g_uNearestNeighbor[j] == Rmin)
g_uNearestNeighbor[j] = Lmin;
#if TRACE
Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
j, Lmin, dL, Rmin, dR, dtNewDist);
#endif
g_Dist[vL] = dtNewDist;
if (dtNewDist < dtNewMinDist)
{
dtNewMinDist = dtNewDist;
uNewNearestNeighbor = j;
}
}
assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
const unsigned v = TriangleSubscript(Lmin, Rmin);
const dist_t dLR = g_Dist[v];
const dist_t dHeightNew = dLR/2;
const unsigned uLeft = g_uNodeIndex[Lmin];
const unsigned uRight = g_uNodeIndex[Rmin];
const dist_t HeightLeft =
uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
const dist_t HeightRight =
uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
g_uLeft[g_uInternalNodeIndex] = uLeft;
g_uRight[g_uInternalNodeIndex] = uRight;
g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
g_Height[g_uInternalNodeIndex] = dHeightNew;
// Row for left child overwritten by row for new node
g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
g_MinDist[Lmin] = dtNewMinDist;
// Delete row for right child
g_uNodeIndex[Rmin] = uInsane;
#if TRACE
Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
g_uInternalNodeIndex, Lmin, Rmin);
ListState();
#endif
}
unsigned uRoot = g_uLeafCount - 2;
tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength,
Ids, Names);
#if TRACE
tree.LogMe();
#endif
delete[] g_Dist;
delete[] g_uNodeIndex;
delete[] g_uNearestNeighbor;
delete[] g_MinDist;
delete[] g_Height;
delete[] g_uLeft;
delete[] g_uRight;
delete[] g_LeftLength;
delete[] g_RightLength;
for (unsigned i = 0; i < g_uLeafCount; ++i)
free(Names[i]);
delete[] Names;
delete[] Ids;
}
class DistCalcTest : public DistCalc
{
virtual void CalcDistRange(unsigned i, dist_t Dist[]) const
{
static dist_t TestDist[5][5] =
{
0, 2, 14, 14, 20,
2, 0, 14, 14, 20,
14, 14, 0, 4, 20,
14, 14, 4, 0, 20,
20, 20, 20, 20, 0,
};
for (unsigned j = 0; j < i; ++j)
Dist[j] = TestDist[i][j];
}
virtual unsigned GetCount() const
{
return 5;
}
virtual unsigned GetId(unsigned i) const
{
return i;
}
virtual const char *GetName(unsigned i) const
{
return "name";
}
};
void Test()
{
SetListFileName("c:\\tmp\\lobster.log", false);
DistCalcTest DC;
Tree tree;
UPGMA2(DC, tree, LINKAGE_Avg);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -