freqcounter.cpp

来自「汉字字频统计工具。功能很简单。」· C++ 代码 · 共 323 行

CPP
323
字号
// FreqCounter.cpp: implementation of the CFreqCounter class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "CPT.h"
#include "FreqCounter.h"
#include <math.h>

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

ChChar CFreqCounter::chars[MAXNUMCHINESECHAR];

UINT CFreqCounter::retvalue=CFreqCounter::InitChineseChars();

CFreqCounter::CFreqCounter(UINT nDepth)
{
	m_nSizeArray = 0;
	m_aucArray = NULL;
	m_tucRoot=NULL;
	InitCounterTree(&m_tucRoot, nDepth, 0, MAXNUMCHINESECHAR-1);
}

CFreqCounter::~CFreqCounter()
{
	if (m_tucRoot)
		ReleaseCounterTree(m_tucRoot);
	for (UINT i=0; i<m_nSizeArray; i++)
		if (m_aucArray[i].next)
			delete m_aucArray[i].next;
	if (m_nSizeArray)
		delete m_aucArray;
}

// Construct the lookup table for Chinese Chars
UINT CFreqCounter::InitChineseChars()
{
	unsigned char i, j;
	int k=0;

	for (i=0xb0; i<=0xf7; i++)
		for (j=0xa1; j<=0xfe; j++)
		{
			ASSERT(k<MAXNUMCHINESECHAR);
			chars[k][0]=i;
			chars[k++][1]=j;
		}
	return 0;
}

void CFreqCounter::InitCounterTree(TUCounter ** tucTree, UINT nDepth, int nLeft, int nRight)
{
	TUCounter *p;
	int i=(nLeft+nRight)/2;

	if (nLeft>nRight || nDepth==0)
		return;
	p=new TUCounter;
	*tucTree = p;
	(p->ch)[0]=chars[i][0];
	(p->ch)[1]=chars[i][1];
	p->counter=0;
	p->left=p->right=NULL;
	InitCounterTree(&(p->left), nDepth-1, nLeft, i-1);
	InitCounterTree(&(p->right), nDepth-1, i+1, nRight);
}

void CFreqCounter::ReleaseCounterTree(TUCounter *root)
{
	if (root==NULL)
		return;
	ReleaseCounterTree(root->left);
	ReleaseCounterTree(root->right);
	delete root;
}

void CFreqCounter::AddGram(ChChar first)
{
	TUCounter * p=m_tucRoot;
	TUCounter **q=&m_tucRoot;

	while (p)
	{
		register int i = CompareChineseChar(first, p->ch);
		if (i==0)
		{
			(p->counter)++;
			return;
		}
		else if (i<0)
		{
			q=&(p->left);
			p=p->left;
		}
		else
		{
			q=&(p->right);
			p=p->right;
		}
	}
	p=new TUCounter;
	p->ch[0]=first[0];
	p->ch[1]=first[1];
	p->counter=1;
	p->left=p->right=NULL;
	*q=p;
}

void CFreqCounter::AddGram(ChChar first, ChChar second)
{
	int l=0, r=m_nSizeArray-1, j, k;

	do 
	{
		k=(l+r)/2;
		j=CompareChineseChar(first, m_aucArray[k].ch);
		if (j<0)
			r=k-1;
		else if (j>0)
			l=k+1;
		else
			break;
	}
	while (1);
	// must found
	
	if (CFreqCounter * p=m_aucArray[k].next)
		p->AddGram(second);
}

void CFreqCounter::AddGram(ChChar first, ChChar second, ChChar third)
{
	int l=0, r=m_nSizeArray-1, j, k;

	do 
	{
		k=(l+r)/2;
		j=CompareChineseChar(first, m_aucArray[k].ch);
		if (j<0)
			r=k-1;
		else if (j>0)
			l=k+1;
		else
			break;
	}
	while (1);
	// must found
	
	if (CFreqCounter * p=m_aucArray[k].next)
		p->AddGram(second, third);
}

void CFreqCounter::PostFirstScan()
{
	m_nSizeArray=CountUniqueGrams(m_tucRoot);
	if (m_nSizeArray)
	{
		m_aucArray = new AUCounter[m_nSizeArray];
		UINT i=OutputTreeToArray(0, m_tucRoot);
	}
	ReleaseCounterTree(m_tucRoot);
	m_tucRoot=NULL;
}

void CFreqCounter::InitSecondScan()
{
	for (UINT i=0; i<m_nSizeArray; i++)
		m_aucArray[i].next=new CFreqCounter((UINT)(log((double)m_aucArray[i].counter)/2));
}

void CFreqCounter::PostSecondScan()
{
	for (UINT i=0; i<m_nSizeArray; i++)
		m_aucArray[i].next->PostFirstScan();
}

void CFreqCounter::InitThirdScan()
{
	for (UINT i=0; i<m_nSizeArray; i++)
		m_aucArray[i].next->InitSecondScan();
}

void CFreqCounter::PostThirdScan()
{
	for (UINT i=0; i<m_nSizeArray; i++)
		m_aucArray[i].next->PostSecondScan();
}

UINT CFreqCounter::CountUniqueGrams(TUCounter * tucTree)
{
	if (tucTree == NULL)
		return 0;
	UINT l=CountUniqueGrams(tucTree->left), \
		r=CountUniqueGrams(tucTree->right);

	if (tucTree->counter)
		return l+r+1;
	else return l+r;
}

UINT CFreqCounter::OutputTreeToArray(UINT left, TUCounter * tuc)
{
	if (tuc == NULL)
		return 0;
	UINT i=OutputTreeToArray(left, tuc->left);
	if (tuc->counter)
	{
		m_aucArray[left+i].ch[0]=tuc->ch[0];
		m_aucArray[left+i].ch[1]=tuc->ch[1];
		m_aucArray[left+i].counter=tuc->counter;
		m_aucArray[left+i].next= NULL; 
		i++;
	}
	i += OutputTreeToArray(left+i, tuc->right);
	return i;
}

void CFreqCounter::OutputUnigramFrequency(CFile &fout, const void * buf, size_t len)
{
	AUCounter *p=new AUCounter[m_nSizeArray];

	memmove(p, m_aucArray, m_nSizeArray*sizeof(AUCounter));
	qsort(p, m_nSizeArray, sizeof(AUCounter), AUCounter::CompareCounterUnit);
	for (UINT i=0; i<m_nSizeArray; i++)
	{
		char buf1[50];
		if (buf)
			fout.Write(buf, len);
		fout.Write(&(p[i].ch), sizeof(ChChar));
		sprintf(buf1, " %u\n", p[i].counter);
		fout.Write(buf1, strlen(buf1));
	}
	delete p;
}


void CFreqCounter::OutputBigramFrequency(CFile &fout, const void * buf, size_t len)
{
	char* buf1=new char[len+sizeof(ChChar)];
	if (buf)
		memmove(buf1, buf, len);

	for (UINT i=0; i<m_nSizeArray; i++)
	{
		memmove(buf1+len, &(m_aucArray[i].ch), sizeof(ChChar));
		m_aucArray[i].next->OutputUnigramFrequency(fout, buf1, len+2);
	}
	delete buf1;
}

void CFreqCounter::OutputTrigramFrequency(CFile &fout, const void * buf, size_t len)
{
	char* buf1=new char[len+sizeof(ChChar)];
	if (buf)
		memmove(buf1, buf, len);

	for (UINT i=0; i<m_nSizeArray; i++)
	{
		memmove(buf1+len, &(m_aucArray[i].ch), sizeof(ChChar));
		m_aucArray[i].next->OutputBigramFrequency(fout, buf1, len+2);
	}
	delete buf1;
}

double AUMutual::moffset=0;

void CFreqCounter::OutputAllMutual(CFile &fout)
{
	UINT n=0, i, j, k;
	int l, r, m, ret;
	ChChar cc;

	for (i=0; i<m_nSizeArray; i++)
		n+=m_aucArray[i].next->m_nSizeArray;
	AUMutual *p=new AUMutual[n];
	for (i=0, k=0; i<m_nSizeArray; i++)
		for (j=0; j<m_aucArray[i].next->m_nSizeArray; j++, k++)
		{
			p[k].ch[0][0]=m_aucArray[i].ch[0];
			p[k].ch[0][1]=m_aucArray[i].ch[1];
			p[k].ch[1][0]=m_aucArray[i].next->m_aucArray[j].ch[0];
			p[k].ch[1][1]=m_aucArray[i].next->m_aucArray[j].ch[1];
			cc[0]=p[k].ch[1][0];
			cc[1]=p[k].ch[1][1];
			p[k].c[0]=m_aucArray[i].counter;
			p[k].counter=m_aucArray[i].next->m_aucArray[j].counter;
			l=0; r=m_nSizeArray-1;
			do 
			{
				m=(l+r)/2;
				ret = CompareChineseChar(cc, m_aucArray[m].ch);
				if (ret < 0)
					r = m-1;
				else if (ret > 0)
					l = m+1;
				else 
					break;
			}
			while (1);
			// must found
			p[k].c[1]=m_aucArray[m].counter;
			p[k].CalculateMutual();
		}
		qsort(p, n, sizeof(AUMutual), AUMutual::CompareMutual);
		for (i=0; i<n; i++)
		{
			char buf[100];
			fout.Write(p[i].ch, 2*sizeof(ChChar));
			sprintf(buf, " %u %u %u %g\n", p[i].c[0], \
				p[i].c[1], p[i].counter, p[i].mutual);
			fout.Write(buf, strlen(buf));
		}
		delete p;
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?