📄 msa.cpp
字号:
unsigned uSeqLength = s.Length();
SetSize(1, uSeqLength);
SetSeqName(0, s.GetName());
if (0 != m_SeqIndexToId)
SetSeqId(0, s.GetId());
for (unsigned n = 0; n < uSeqLength; ++n)
SetChar(0, n, s[n]);
}
unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const
{
assert(uSeqIndex < GetSeqCount());
assert(uColIndex < GetColCount());
unsigned uCol = 0;
for (unsigned n = 0; n <= uColIndex; ++n)
if (!IsGap(uSeqIndex, n))
++uCol;
return uCol;
}
void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex)
{
assert(uToSeqIndex < m_uSeqCount);
const unsigned uColCount = msaFrom.GetColCount();
assert(m_uColCount == uColCount ||
(0 == m_uColCount && uColCount <= m_uCacheSeqLength));
memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount);
SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex));
if (0 == m_uColCount)
m_uColCount = uColCount;
}
const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const
{
assert(uSeqIndex < m_uSeqCount);
return m_szSeqs[uSeqIndex];
}
void MSA::DeleteSeq(unsigned uSeqIndex)
{
assert(uSeqIndex < m_uSeqCount);
delete m_szSeqs[uSeqIndex];
delete m_szNames[uSeqIndex];
const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *);
if (uBytesToMove > 0)
{
memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove);
memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove);
}
--m_uSeqCount;
delete[] m_Weights;
m_Weights = 0;
}
bool MSA::IsEmptyCol(unsigned uColIndex) const
{
const unsigned uSeqCount = GetSeqCount();
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
if (!IsGap(uSeqIndex, uColIndex))
return false;
return true;
}
//void MSA::DeleteEmptyCols(bool bProgress)
// {
// unsigned uColCount = GetColCount();
// for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
// {
// if (IsEmptyCol(uColIndex))
// {
// if (bProgress)
// {
// Log("Deleting col %u of %u\n", uColIndex, uColCount);
// printf("Deleting col %u of %u\n", uColIndex, uColCount);
// }
// DeleteCol(uColIndex);
// --uColCount;
// }
// }
// }
unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const
{
Quit("MSA::AlignedColIndexToColIndex not implemented");
return 0;
}
WEIGHT MSA::GetTotalSeqWeight() const
{
WEIGHT wTotal = 0;
const unsigned uSeqCount = GetSeqCount();
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
wTotal += m_Weights[uSeqIndex];
return wTotal;
}
bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2,
unsigned uSeqIndex2)
{
Seq s1;
Seq s2;
a1.GetSeq(uSeqIndex1, s1);
a2.GetSeq(uSeqIndex2, s2);
s1.StripGaps();
s2.StripGaps();
return s1.EqIgnoreCase(s2);
}
unsigned MSA::GetSeqLength(unsigned uSeqIndex) const
{
assert(uSeqIndex < GetSeqCount());
const unsigned uColCount = GetColCount();
unsigned uLength = 0;
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
if (!IsGap(uSeqIndex, uColIndex))
++uLength;
return uLength;
}
void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID,
unsigned *ptruPosCount) const
{
assert(uSeqIndex1 < GetSeqCount());
assert(uSeqIndex2 < GetSeqCount());
unsigned uSameCount = 0;
unsigned uPosCount = 0;
const unsigned uColCount = GetColCount();
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
{
char c1 = GetChar(uSeqIndex1, uColIndex);
if (IsGapChar(c1))
continue;
char c2 = GetChar(uSeqIndex2, uColIndex);
if (IsGapChar(c2))
continue;
++uPosCount;
if (c1 == c2)
++uSameCount;
}
*ptruPosCount = uPosCount;
if (uPosCount > 0)
*ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount;
else
*ptrPWID = 0;
}
void MSA::UnWeight()
{
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
m_Weights[uSeqIndex] = BTInsane;
}
unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const
{
assert(uColIndex < GetColCount());
unsigned Counts[MAX_ALPHA];
memset(Counts, 0, sizeof(Counts));
const unsigned uSeqCount = GetSeqCount();
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
continue;
const unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
++(Counts[uLetter]);
}
unsigned uUniqueCount = 0;
for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
if (Counts[uLetter] > 0)
++uUniqueCount;
return uUniqueCount;
}
double MSA::GetOcc(unsigned uColIndex) const
{
unsigned uGapCount = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
if (IsGap(uSeqIndex, uColIndex))
++uGapCount;
unsigned uSeqCount = GetSeqCount();
return (double) (uSeqCount - uGapCount) / (double) uSeqCount;
}
void MSA::ToFile(TextFile &File) const
{
if (g_bMSF)
ToMSFFile(File);
else if (g_bAln)
ToAlnFile(File);
else if (g_bHTML)
ToHTMLFile(File);
else if (g_bPHYS)
ToPhySequentialFile(File);
else if (g_bPHYI)
ToPhyInterleavedFile(File);
else
ToFASTAFile(File);
if (0 != g_pstrScoreFileName)
WriteScoreFile(*this);
}
bool MSA::ColumnHasGap(unsigned uColIndex) const
{
const unsigned uSeqCount = GetSeqCount();
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
if (IsGap(uSeqIndex, uColIndex))
return true;
return false;
}
void MSA::SetIdCount(unsigned uIdCount)
{
//if (m_uIdCount != 0)
// Quit("MSA::SetIdCount: may only be called once");
if (m_uIdCount > 0)
{
if (uIdCount > m_uIdCount)
Quit("MSA::SetIdCount: cannot increase count");
return;
}
m_uIdCount = uIdCount;
}
void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId)
{
assert(uSeqIndex < m_uSeqCount);
assert(uId < m_uIdCount);
if (0 == m_SeqIndexToId)
{
if (0 == m_uIdCount)
Quit("MSA::SetSeqId, SetIdCount has not been called");
m_IdToSeqIndex = new unsigned[m_uIdCount];
m_SeqIndexToId = new unsigned[m_uSeqCount];
memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned));
memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned));
}
m_SeqIndexToId[uSeqIndex] = uId;
m_IdToSeqIndex[uId] = uSeqIndex;
}
unsigned MSA::GetSeqIndex(unsigned uId) const
{
assert(uId < m_uIdCount);
assert(0 != m_IdToSeqIndex);
unsigned uSeqIndex = m_IdToSeqIndex[uId];
assert(uSeqIndex < m_uSeqCount);
return uSeqIndex;
}
bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const
{
for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)
{
if (uId == m_SeqIndexToId[uSeqIndex])
{
*ptruIndex = uSeqIndex;
return true;
}
}
return false;
}
unsigned MSA::GetSeqId(unsigned uSeqIndex) const
{
assert(uSeqIndex < m_uSeqCount);
unsigned uId = m_SeqIndexToId[uSeqIndex];
assert(uId < m_uIdCount);
return uId;
}
bool MSA::WeightsSet() const
{
return BTInsane != m_Weights[0];
}
void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount,
MSA &msaOut)
{
const unsigned uColCount = msaIn.GetColCount();
msaOut.SetSize(uIdCount, uColCount);
for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut)
{
const unsigned uId = Ids[uSeqIndexOut];
const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId);
const char *ptrName = msaIn.GetSeqName(uSeqIndexIn);
msaOut.SetSeqId(uSeqIndexOut, uId);
msaOut.SetSeqName(uSeqIndexOut, ptrName);
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
{
const char c = msaIn.GetChar(uSeqIndexIn, uColIndex);
msaOut.SetChar(uSeqIndexOut, uColIndex, c);
}
}
}
// Caller must allocate ptrSeq and ptrLabel as new char[n].
void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel)
{
if (m_uSeqCount > m_uCacheSeqCount)
Quit("Internal error MSA::AppendSeq");
if (m_uSeqCount == m_uCacheSeqCount)
ExpandCache(m_uSeqCount + 4, uSeqLength);
m_szSeqs[m_uSeqCount] = ptrSeq;
m_szNames[m_uSeqCount] = ptrLabel;
++m_uSeqCount;
}
void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount)
{
if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount)
Quit("Internal error MSA::ExpandCache");
if (m_uSeqCount > 0 && uColCount != m_uColCount)
Quit("Internal error MSA::ExpandCache, ColCount changed");
char **NewSeqs = new char *[uSeqCount];
char **NewNames = new char *[uSeqCount];
WEIGHT *NewWeights = new WEIGHT[uSeqCount];
for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)
{
NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex];
NewNames[uSeqIndex] = m_szNames[uSeqIndex];
NewWeights[uSeqIndex] = m_Weights[uSeqIndex];
}
for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex)
{
char *Seq = new char[uColCount];
NewSeqs[uSeqIndex] = Seq;
#if DEBUG
memset(Seq, '?', uColCount);
#endif
}
delete[] m_szSeqs;
delete[] m_szNames;
delete[] m_Weights;
m_szSeqs = NewSeqs;
m_szNames = NewNames;
m_Weights = NewWeights;
m_uCacheSeqCount = uSeqCount;
m_uCacheSeqLength = uColCount;
m_uColCount = uColCount;
}
void MSA::FixAlpha()
{
ClearInvalidLetterWarning();
for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex)
{
for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex)
{
char c = GetChar(uSeqIndex, uColIndex);
if (!IsResidueChar(c) && !IsGapChar(c))
{
char w = GetWildcardChar();
// Warning("Invalid letter '%c', replaced by '%c'", c, w);
InvalidLetterWarning(c, w);
SetChar(uSeqIndex, uColIndex, w);
}
}
}
ReportInvalidLetters();
}
ALPHA MSA::GuessAlpha() const
{
// If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap
// letters belong to the nucleotide alphabet, guess nucleo.
// Otherwise amino.
const unsigned CHAR_COUNT = 100;
const unsigned MIN_NUCLEO_PCT = 95;
const unsigned uSeqCount = GetSeqCount();
const unsigned uColCount = GetColCount();
if (0 == uSeqCount)
return ALPHA_Amino;
unsigned uDNACount = 0;
unsigned uRNACount = 0;
unsigned uTotal = 0;
unsigned i = 0;
for (;;)
{
unsigned uSeqIndex = i/uColCount;
if (uSeqIndex >= uSeqCount)
break;
unsigned uColIndex = i%uColCount;
++i;
char c = GetChar(uSeqIndex, uColIndex);
if (IsGapChar(c))
continue;
if (IsDNA(c))
++uDNACount;
if (IsRNA(c))
++uRNACount;
++uTotal;
if (uTotal >= CHAR_COUNT)
break;
}
if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT)
return ALPHA_RNA;
if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT)
return ALPHA_DNA;
return ALPHA_Amino;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -