⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pe.cpp

📁 a N-gram and Fast Pattern Extraction Algorithms~
💻 CPP
字号:
// PE.cpp: implementation of the CPatternAlaysis class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "pe.h"

// Builds an analyser with no working buffer and with the pattern
// dictionary's NoRepeat flag switched on.
CPatternAlaysis::CPatternAlaysis()
{
	// configure the dictionary first; the flag is independent of the buffer
	m_alpDic.NoRepeat = true;
	// no text has been processed yet, so there is nothing to own
	m_pDes = NULL;
}

// Releases the working buffer held in m_pDes.
CPatternAlaysis::~CPatternAlaysis()
{
	// m_pDes is allocated in ConstructPatterns with new BYTE[...], so it must
	// be released with the array form of delete; using the scalar form on
	// array storage is undefined behaviour.  delete[] on a null pointer is a
	// no-op, so no explicit check is required.
	delete [] m_pDes;
	m_pDes = NULL;
}

// Measures how many bytes must be appended to a pattern to take in the next
// word found at lps: any run of spaces at lps followed by the run of bytes
// up to (not including) the next space or NUL.
// lps must be NUL-terminated.  The result is never smaller than 2, even when
// lps already points at the terminator.
int AddWordToPattern(BYTE* lps)
{
	const BYTE* pCursor = lps;

	// step over the separator(s) in front of the word
	for(; *pCursor == ' '; ++pCursor)
	{
	}
	// step over the word itself
	for(; *pCursor != 0 && *pCursor != ' '; ++pCursor)
	{
	}

	const int nSpan = static_cast<int>(pCursor - lps);
	return nSpan < 2 ? 2 : nSpan;
}

// Measures the pattern that starts at lps and spans nMinPatternWords
// space-separated words.
//
// lps              - NUL-terminated text; scanning stops at the terminator.
// nPrevLength      - [out] offset from lps at which the next entity starts,
//                    i.e. the start of the second word.  It is now written on
//                    every call; previously it was left untouched when fewer
//                    than two words were requested, so callers advanced by an
//                    uninitialised value.
// nMinPatternWords - number of words to include; values below 1 are treated
//                    as 1 (a negative value used to make the scan loop run
//                    away).
//
// Returns the byte length of the pattern, never smaller than 2.
int GetPatternLength(BYTE* lps, int& nPrevLength, int nMinPatternWords = 2)
{
	const int nWords = nMinPatternWords < 1 ? 1 : nMinPatternWords;
	int nLength = 0;
	int nNextStart = -1;	// offset of the second word, -1 until it is known

	for(int nWord = 0; nWord < nWords; nWord++)
	{
		// skip the separator(s) in front of this word
		while(lps[nLength] == ' ')
			nLength++;
		if(nWord == 1)
			nNextStart = nLength;
		// consume the word itself
		while(lps[nLength] && lps[nLength] != ' ')
			nLength++;
	}

	if(nNextStart < 0)
	{	// only one word was scanned: the next entity begins after the
		// spaces that follow it
		nNextStart = nLength;
		while(lps[nNextStart] == ' ')
			nNextStart++;
	}
	nPrevLength = nNextStart;

	return nLength < 2 ? 2 : nLength;
}
//////////////////////////////////////////////////////////////////////
void CPatternAlaysis::ConstructPatterns(BYTE *pSrc, int nSrcLen, LPCSTR lpcsDelimiters /*= NULL*/, int nMinPatternWords /*= 2*/, bool bFixedNGram /*= false*/)
{
	// discard initial spaces
	while(*pSrc == ' ')
		pSrc++, nSrcLen--;
	
	if(m_pDes)
		delete m_pDes;
	m_pDes = new BYTE[nSrcLen+1];
	int nIndex[2] = { 0, 0 }, nDesLen = 0;
	if(lpcsDelimiters == NULL)
		// copy source buffer
		memcpy(m_pDes, pSrc, nIndex[1] = nSrcLen);
	else	// discard delimiters
		while (nIndex[0] < nSrcLen)
		{
			if(strchr(lpcsDelimiters, pSrc[nIndex[0]]) == NULL)
				m_pDes[nIndex[1]++] = pSrc[nIndex[0]];
			nIndex[0]++;
		}
	
	// discard repeated spaces
	nIndex[0] = 0;
	while (nIndex[0] < nIndex[1])
	{
		// discard sequenced spaces
		while(m_pDes[nIndex[0]] == ' ' && m_pDes[nIndex[0]+1] == ' ')
			nIndex[0]++;
		m_pDes[nDesLen++] = m_pDes[nIndex[0]++];
	}
	m_pDes[nDesLen] = 0;

	m_alpDic.RemoveAll();

	// tree node to keep last success search to start with
	CBinaryTreeNode<CPattern, int>* pNode = m_alpDic.Root;
	// left m_alpDic Samples points to the source buffer
	int nPrevLength;
	CPattern node(m_pDes, GetPatternLength(m_pDes, nPrevLength, nMinPatternWords));
	// scan the input buffer
	while(node.m_pBuffer < m_pDes+nDesLen)
	{
		pNode = m_alpDic.Insert(&node, -1, pNode);
		pNode->Key.m_nFrequency = pNode->Count;
		if(bFixedNGram == false && pNode->Count > 1)
			// (repeated pattern), increment node length by a new word length
			node.m_nLength += AddWordToPattern(node.m_pBuffer+node.m_nLength);
		else
		{	// initialize node to next entity
			node.m_pBuffer += nPrevLength;
			node.m_nLength = GetPatternLength(node.m_pBuffer, nPrevLength, nMinPatternWords);
			// initialize binary tree search root
			pNode = m_alpDic.Root;
		}
	}
}

// Reports the Count member of the pattern dictionary (presumably the number
// of patterns it currently stores).
int CPatternAlaysis::GetPatternCount()
{
	const int nPatterns = m_alpDic.Count;
	return nPatterns;
}

// Fills vPatterns with pointers to the patterns held in the dictionary.
//
// nSortType             - 0: dictionary (in-order) traversal order;
//                         1: descending node Count (frequency);
//                         2: descending pattern length (m_nLength);
//                         any other value leaves vPatterns empty.
// bIgnoreUniquePatterns - when true, entries whose Count is not above 1 are
//                         left out.
// vPatterns             - cleared first, then filled.  The pointers refer to
//                         keys stored inside m_alpDic.
void CPatternAlaysis::GetPatterns(IN int nSortType, IN bool bIgnoreUniquePatterns, OUT vector<CPattern*>& vPatterns)
{
	vPatterns.clear();

	// unsupported sort types produce an empty result
	if(nSortType < 0 || nSortType > 2)
		return;

	if(nSortType == 0)
	{	// alphabetical: emit the entries straight from the in-order walk
		for(CBinaryTreeNode<CPattern, int>* pEntry = m_alpDic.Min(m_alpDic.Root); pEntry != NULL; pEntry = m_alpDic.Successor(pEntry))
		{
			if(bIgnoreUniquePatterns && pEntry->Count <= 1)
				continue;	// unique pattern, filtered out
			vPatterns.push_back(&pEntry->Key);
		}
		return;
	}

	// frequency / pattern length: group the entries in a second tree keyed by
	// the ranking value, then read that tree from its largest key down
	const bool bByFrequency = (nSortType == 1);
	CBinaryTree<CValue<int>, int, vector<CPattern*>, vector<CPattern*>* > rankedDic;
	for(CBinaryTreeNode<CPattern, int>* pItem = m_alpDic.Min(m_alpDic.Root); pItem != NULL; pItem = m_alpDic.Successor(pItem))
	{
		if(bIgnoreUniquePatterns && pItem->Count <= 1)
			continue;	// unique pattern, filtered out
		int nRank = bByFrequency ? pItem->Count/*frequency*/ : pItem->Key.m_nLength/*length*/;
		rankedDic.Insert(nRank)->Data.push_back(&pItem->Key);
	}

	for(CBinaryTreeNode<CValue<int>, vector<CPattern*> >* pGroup = rankedDic.Max(rankedDic.Root); pGroup != NULL; pGroup = rankedDic.Predecessor(pGroup))
		vPatterns.insert(vPatterns.end(), pGroup->Data.begin(), pGroup->Data.end());
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -