sentseg.cpp

来自「汉字字频统计工具。功能很简单。」· C++ 代码 · 共 133 行

CPP
133
字号
// SentSeg.cpp: implementation of the CSentSeg class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "CPT.h"
#include "SentSeg.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Creates a segmenter bound to the document pDoc and the directory dirpath.
// The output file pointer starts out null and the sentence buffer index
// starts at zero.
CSentSeg::CSentSeg(CCPTDoc * pDoc, CString dirpath)
	: m_pDoc(pDoc)
	, m_sDir(dirpath)
	, m_fOut(NULL)
	, m_nIdx(0)
{
}

// Frees the output file object if one is still allocated.
CSentSeg::~CSentSeg()
{
	// Deleting a null pointer is a no-op, so no explicit check is required.
	delete m_fOut;
}

void CSentSeg::DoSentSeg()
{
	m_pDoc->ClearAllCounters();
	m_saFiles.RemoveAll();
	FindAllFiles(m_sDir, "*.htm;*.html;*.txt;");

	m_fOut = new CFile;
	if (!(m_fOut->Open("sentence.txt", CFile::modeCreate | CFile::modeWrite)))
	{
		delete m_fOut;
		m_fOut = NULL;
		return;
	}
		
	for (int i=0; i<m_saFiles.GetSize(); i++,m_pDoc->IncNumFiles())
	{
		try
		{
			ChChar cc;
			UINT n;

			CFile f;
			f.Open(m_saFiles[i], CFile::modeRead);
			while ((n=f.Read(cc, 1))==1)
			{
				if (cc[0] >= 128)
				{
					n=f.Read(cc+1, 1);
					if (n==1 && IsChineseChar(cc))
					{
						if (m_nIdx == 1023)
						{
							delete m_fOut;
							m_fOut = NULL;
							return;
						}
						m_ccBuf[m_nIdx][0]=cc[0];
						m_ccBuf[m_nIdx++][1]=cc[1];
						m_pDoc->IncNumChars();
						continue;
					}
				}
				OutputSentence();
			}
			OutputSentence();
			f.Close();
		}
		catch (CFileException *e) {}
	}
	delete m_fOut;
	m_fOut = NULL;
}

// Writes the buffered run of characters to the output file as one line.
//
// Does nothing when the buffer is empty. Otherwise it increments the
// document's sentence counter, zeroes the first byte of the slot just past
// the last buffered character, writes the buffered characters (two bytes
// each) followed by "\n", and empties the buffer.
void CSentSeg::OutputSentence()
{
	ASSERT(m_fOut);
	ASSERT(m_nIdx < 1024);

	if (m_nIdx != 0)
	{
		m_pDoc->IncNumSent();

		// Mark the end of the run inside the buffer itself.
		unsigned char *pEnd = (unsigned char *)(m_ccBuf + m_nIdx);
		*pEnd = (unsigned char)0;

		m_fOut->Write(m_ccBuf, m_nIdx * 2);
		m_fOut->Write("\n", 1);
		m_nIdx = 0;
	}
}

// Appends to m_saFiles the full path of every file under sDir whose name
// matches one of the wildcard patterns in sPattern, descending into all
// subdirectories.
//
// sDir     - directory to search; joined to each pattern with a backslash.
// sPattern - one or more wildcard patterns separated by ',' or ';'
//            (for example "*.htm;*.txt"). Empty entries are ignored.
void CSentSeg::FindAllFiles(CString sDir, CString sPattern)
{
	CFileFind finder;
	BOOL bWorking;
	CString sPatterns=sPattern;
	CStringArray sPat;
	int i;

	// Split the pattern list on ',' / ';'.
	while ((i=sPatterns.FindOneOf(",;"))!=-1)
	{
		// Skip empty entries such as the one produced by ";;".
		if (i > 0)
			sPat.Add(sPatterns.Left(i));
		sPatterns = sPatterns.Right(sPatterns.GetLength() - i - 1);
	}
	if (sPatterns != "")
		sPat.Add(sPatterns);

	// Collect the files in this directory that match each pattern.
	for (i=0; i<sPat.GetSize(); i++)
	{
		bWorking = finder.FindFile(sDir + "\\" + sPat[i]);
		while (bWorking)
		{
			bWorking = finder.FindNextFile();
			// A directory whose name happens to match the pattern is not an
			// input file; it is visited by the recursion below instead.
			if (!finder.IsDirectory())
				m_saFiles.Add(finder.GetFilePath());
		}
	}

	// Recurse into every real subdirectory ("." and ".." excluded).
	bWorking = finder.FindFile(sDir+"\\"+"*");
	while (bWorking)
	{
		bWorking = finder.FindNextFile();
		if (finder.IsDirectory() && !finder.IsDots())
			FindAllFiles(finder.GetFilePath(), sPattern);
	}
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?