📄 fastdistkmer.cpp
字号:
#include "muscle.h"
#include "msa.h"
#include "seqvect.h"
#include "seq.h"
#include "distfunc.h"
#include <math.h>
#define TRACE 0
/***
Some candidate alphabets considered because they
have high correlations and small table sizes.
Correlation coefficent is between k-mer distance
and %id D measured from a CLUSTALW alignment.
Table size is N^k where N is size of alphabet.
A is standard (uncompressed) amino alphabet.
Correlation
Alpha N k Table Size all 25-50%
----- -- - ---------- ---- ------
A 20 3 8,000 0.943 0.575
A 20 4 160,000 0.962 0.685 <<
LiA 14 4 38,416 0.966 0.645
SEB 14 4 38,416 0.964 0.634
LiA 13 4 28,561 0.965 0.640
LiA 12 4 20,736 0.963 0.620
LiA 10 5 100,000 0.964 0.652
We select A with k=4 because it has the best
correlations. The only drawback is a large table
size, but space is readily available and the only
additional time cost is in resetting the table to
zero, which can be done quickly with memset or by
keeping a list of the k-mers that were found (should
test to see which is faster, and may vary by compiler
and processor type). It also has the minor advantage
that we don't need to convert the alphabet.
Fractional identity d is estimated as follows.
F = fractional k-mer count
if F is 0: F = 0.01
Y = log(0.02 + F)
d = -4.1 + 4.12*Y
The constant 0.02 was chosen to make the relationship
between Y and D linear. The constants -4.1 and 4.12
were chosen to fit a straight line to the scatterplot
of Y vs D.
***/
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
const unsigned K = 4;
const unsigned N = 20;
const unsigned N_2 = 20*20;
const unsigned N_3 = 20*20*20;
const unsigned N_4 = 20*20*20*20;
const unsigned TABLE_SIZE = N_4;
// For debug output
const char *KmerToStr(unsigned Kmer)
{
static char s[5];
unsigned c3 = (Kmer/N_3)%N;
unsigned c2 = (Kmer/N_2)%N;
unsigned c1 = (Kmer/N)%N;
unsigned c0 = Kmer%N;
s[0] = LetterToChar(c3);
s[1] = LetterToChar(c2);
s[2] = LetterToChar(c1);
s[3] = LetterToChar(c0);
return s;
}
void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
{
#if TRACE
Log("CountKmers\n");
#endif
memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));
const byte *ptrKmerStart = s;
const byte *ptrKmerEnd = s + 4;
const byte *ptrSeqEnd = s + uSeqLength;
unsigned c3 = s[0]*N_3;
unsigned c2 = s[1]*N_2;
unsigned c1 = s[2]*N;
unsigned c0 = s[3];
unsigned Kmer = c3 + c2 + c1 + c0;
for (;;)
{
assert(Kmer < TABLE_SIZE);
#if TRACE
Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
#endif
++(KmerCounts[Kmer]);
if (ptrKmerEnd == ptrSeqEnd)
break;
// Compute k-mer as function of previous k-mer:
// 1. Subtract first letter from previous k-mer.
// 2. Multiply by N.
// 3. Add next letter.
c3 = (*ptrKmerStart++) * N_3;
Kmer = (Kmer - c3)*N;
Kmer += *ptrKmerEnd++;
}
}
unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
{
byte KmerCounts2[TABLE_SIZE];
CountKmers(Seq2, uSeqLength2, KmerCounts2);
const byte *ptrKmerStart = Seq;
const byte *ptrKmerEnd = Seq + 4;
const byte *ptrSeqEnd = Seq + uSeqLength;
unsigned c3 = Seq[0]*N_3;
unsigned c2 = Seq[1]*N_2;
unsigned c1 = Seq[2]*N;
unsigned c0 = Seq[3];
unsigned Kmer = c3 + c2 + c1 + c0;
unsigned uCommonCount = 0;
for (;;)
{
assert(Kmer < TABLE_SIZE);
const byte Count1 = KmerCounts1[Kmer];
const byte Count2 = KmerCounts2[Kmer];
uCommonCount += MIN(Count1, Count2);
// Hack so we don't double-count
KmerCounts2[Kmer] = 0;
if (ptrKmerEnd == ptrSeqEnd)
break;
// Compute k-mer as function of previous k-mer:
// 1. Subtract first letter from previous k-mer.
// 2. Multiply by N.
// 3. Add next letter.
c3 = (*ptrKmerStart++) * N_3;
Kmer = (Kmer - c3)*N;
Kmer += *ptrKmerEnd++;
}
return uCommonCount;
}
static void SeqToLetters(const Seq &s, byte Letters[])
{
const unsigned uSeqLength = s.Length();
for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)
{
char c = s.GetChar(uCol);
// Ugly hack. My k-mer counting code isn't wild-card
// aware. Arbitrarily replace wildcards by a specific
// amino acid.
if (IsWildcardChar(c))
c = 'A';
*Letters++ = CharToLetter(c);
}
}
void FastDistKmer(const SeqVect &v, DistFunc &DF)
{
byte KmerCounts[TABLE_SIZE];
const unsigned uSeqCount = v.GetSeqCount();
DF.SetCount(uSeqCount);
if (0 == uSeqCount)
return;
// Initialize distance matrix to zero
for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
{
DF.SetDist(uSeq1, uSeq1, 0);
for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
DF.SetDist(uSeq1, uSeq2, 0);
}
unsigned uMaxLength = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
const Seq &s = v.GetSeq(uSeqIndex);
unsigned uSeqLength = s.Length();
if (uSeqLength > uMaxLength)
uMaxLength = uSeqLength;
}
if (0 == uMaxLength)
return;
byte *Seq1Letters = new byte[uMaxLength];
byte *Seq2Letters = new byte[uMaxLength];
for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
{
const Seq &s1 = v.GetSeq(uSeqIndex1);
const unsigned uSeqLength1 = s1.Length();
SeqToLetters(s1, Seq1Letters);
CountKmers(Seq1Letters, uSeqLength1, KmerCounts);
for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
++uSeqIndex2)
{
const Seq &s2 = v.GetSeq(uSeqIndex2);
const unsigned uSeqLength2 = s2.Length();
SeqToLetters(s2, Seq2Letters);
unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
KmerCounts, Seq2Letters, uSeqLength2);
unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
double F = (double) uCommonKmerCount / (uMinLength - K + 1);
if (0.0 == F)
F = 0.01;
double Y = log(0.02 + F);
double EstimatedPctId = Y/4.12 + 0.995;
double KD = KimuraDist(EstimatedPctId);
// DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
#if TRACE
Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
#endif
}
}
delete[] Seq1Letters;
delete[] Seq2Letters;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -