⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 saxfilehandlers.cpp

📁 基于SVM的文本分类算法
💻 CPP
字号:
// SAXFileHandlers.cpp: implementation of the CSAXFileHandlers class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "SAXFileHandlers.h"
#include "Message.h"
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/sax2/SAX2XMLReader.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>
#include <xercesc/util/XMLString.hpp>

#include <xercesc/sax2/Attributes.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax/SAXException.hpp>
#include <direct.h>

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

// File-scope handler instance. Convert() registers the address of this object
// with the SAX2 reader as both its content handler and its error handler.
CSAXFileHandlers theSaxFileHandler;
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Puts the parser state machine and both format selectors at 0 and fills
// m_astrTop10Category with the ten category names used by IsInTop10Category().
CSAXFileHandlers::CSAXFileHandlers()
{
	// Stored in lower case: IsInTop10Category() lower-cases its argument
	// before comparing against these entries.
	static const char* const s_apszTop10[]=
	{
		"acq",
		"corn",
		"crude",
		"earn",
		"grain",
		"interest",
		"money-fx",
		"ship",
		"trade",
		"wheat"
	};
	const int nNames=sizeof(s_apszTop10)/sizeof(s_apszTop10[0]);

	m_nSourceFormat=0;
	m_nTargetFormat=0;
	m_nStatus=0;
	for(int idx=0;idx<nNames;idx++)
		m_astrTop10Category.Add(s_apszTop10[idx]);
}

// Nothing is released here; the destructor body is intentionally empty.
CSAXFileHandlers::~CSAXFileHandlers()
{
}

// ---------------------------------------------------------------------------
//  SAXCountHandlers: Implementation of the SAX DocumentHandler interface
// ---------------------------------------------------------------------------
// SAX2 start-tag callback: advances the m_nStatus state machine.
//
// States as used by this file:
//   0  idle / current document rejected
//   1  inside <REUTERS> whose TOPICS attribute is "YES"
//   2  inside <TOPICS>
//   3  inside a <D> child of <TOPICS> (characters() collects topic names)
//   4  </TOPICS> seen (set by endElement())
//   5  inside <TEXT> (characters() collects title/body text)
//
// uri, localname : unused.
// qname          : element name, compared case-insensitively.
// attrs          : attributes of the element; NEWID, TOPICS, LEWISSPLIT and
//                  CGISPLIT are read for <REUTERS>.
void CSAXFileHandlers::startElement(const XMLCh* const uri
                                   , const XMLCh* const localname
                                   , const XMLCh* const qname
                                   , const Attributes& attrs)
{
	// Any start tag cancels the "inside AUTHOR/DATELINE/TITLE" flags; the
	// matching branch below sets at most one of them again.
	m_bInAuthor=false;
	m_bInDateLine=false;
	m_bInTitle=false;
	if(XMLString::compareIString(qname,L"REUTERS")==0)
	{
		// Discard whatever was collected for the previous document.
		clear();
		m_nStatus=1;
		// transcode() returns a buffer owned by the caller; it is released at
		// the end of this branch. It is NULL when NEWID is absent.
		char *id=XMLString::transcode(attrs.getValue(L"NEWID"));
		char info[50]="正在解析文档";
		// Bounded append so an unexpectedly long id cannot overrun info[].
		if(id!=NULL)
			strncat(info,id,sizeof(info)-strlen(info)-1);
		CMessage::PrintStatusInfo(info);
		if(XMLString::compareIString(attrs.getValue(L"TOPICS"),L"YES")==0)
		{
			// Keep the id only when the document belongs to the requested
			// split (m_nSplitType) and set (m_nDocsSetType); writeContent()
			// skips documents whose m_pDocID is empty.
			if(id!=NULL&&
				((m_nSplitType==0&&m_nDocsSetType==0&&XMLString::compareIString(attrs.getValue(L"LEWISSPLIT"),L"TRAIN")==0)
				||(m_nSplitType==0&&m_nDocsSetType==1&&XMLString::compareIString(attrs.getValue(L"LEWISSPLIT"),L"TEST")==0)
				||(m_nSplitType==1&&m_nDocsSetType==0&&XMLString::compareIString(attrs.getValue(L"CGISPLIT"),L"TRAINING-SET")==0)
				||(m_nSplitType==1&&m_nDocsSetType==1&&XMLString::compareIString(attrs.getValue(L"CGISPLIT"),L"PUBLISHED-TESTSET")==0)))
				strcpy(m_pDocID,id);
		}
		else
			m_nStatus=0;
		if(id!=NULL)
			XMLString::release(&id);
	}
	else if(XMLString::compareIString(qname,L"TOPICS")==0)
	{
		if(m_nStatus==1) m_nStatus=2;
		else m_nStatus=0;
		m_astrTopics.RemoveAll();
	}
	else if(XMLString::compareIString(qname,L"D")==0)
	{
		if(m_nStatus==2) m_nStatus=3;
	}
	else if(XMLString::compareIString(qname,L"TEXT")==0)
	{
		if(m_nStatus==4) m_nStatus=5;
		else m_nStatus=0;
	}
	else if(XMLString::compareIString(qname,L"AUTHOR")==0)
	{
		if(m_nStatus==5) m_bInAuthor=true;
		else m_nStatus=0;
	}
	else if(XMLString::compareIString(qname,L"DATELINE")==0)
	{
		if(m_nStatus==5) m_bInDateLine=true;
		else m_nStatus=0;
	}
	else if(XMLString::compareIString(qname,L"TITLE")==0)
	{
		if(m_nStatus==5) m_bInTitle=true;
		else m_nStatus=0;
	}
}

// SAX2 end-tag callback.
//   </TOPICS> while in state 2 or 3 moves the state machine to 4.
//   </TEXT>   while in state 5 writes the collected document, clears the
//             buffers and returns to state 0.
// Every other end tag leaves the state untouched. uri and localname are unused.
void CSAXFileHandlers::endElement(const XMLCh *const uri, 
				const XMLCh* const localname, 
				const XMLCh* const qname)
{
	const bool bCollectingTopics=(m_nStatus==2||m_nStatus==3);
	if(bCollectingTopics&&XMLString::compareIString(qname,L"TOPICS")==0)
	{
		// State is now 4, so the </TEXT> handling below cannot apply.
		m_nStatus=4;
		return;
	}

	if(m_nStatus!=5)
		return;
	if(XMLString::compareIString(qname,L"TEXT")!=0)
		return;

	writeContent();
	clear();
	m_nStatus=0;
}

void CSAXFileHandlers::characters(  const   XMLCh* const    chars
								    , const unsigned int    length)
{
	if(m_nStatus==3)
	{
		char *str=XMLString::transcode(chars);
		if(str[0]!='\0'&&str[0]!='\t'&&str[0]!='\n') m_astrTopics.Add(str);
	}
	else if(m_nStatus==5&&m_bInTitle)
	{
		char *str=XMLString::transcode(chars);
		if(str[0]!='\0'&&
			((str[0]!='\t')&&(str[1]!='\0'))&&
			((str[0]!='\n')&&(str[1]!='\0')))
			strcat(m_pTitle,str);	
	}
	else if(m_nStatus==5&&m_pStream!=NULL&&
		!m_bInAuthor&&!m_bInDateLine)
	{
		char *str=XMLString::transcode(chars);
		if(str[0]!='\0'&&
			((str[0]!='\t')&&(str[1]!='\0'))&&
			((str[0]!='\n')&&(str[1]!='\0')))
			strcat(m_pContent,str);
	}
}

// ---------------------------------------------------------------------------
// Formats one parser diagnostic and sends it to CMessage::PrintInfo().
//
// pszSeverity : label placed at the start of the message.
// e           : exception carrying system id, position and message text.
//
// The system id and message are length-limited by the format precisions so
// the text always fits in the local buffer; both transcoded strings are
// released before returning.
static void ReportParseProblem(const char* pszSeverity,const SAXParseException& e)
{
	char *pszFile=XMLString::transcode(e.getSystemId());
	char *pszText=XMLString::transcode(e.getMessage());

	// Worst case: 2+11+9+260+8+20+7+20+13+512+1 = 863 bytes.
	char info[1024];
	sprintf(info,"\r\n%.11s at file %.260s , line %ld, char %ld\r\n  Message: %.512s",
		pszSeverity,
		pszFile!=NULL?pszFile:"",
		static_cast<long>(e.getLineNumber()),
		static_cast<long>(e.getColumnNumber()),
		pszText!=NULL?pszText:"");
	CMessage::PrintInfo(info);

	if(pszFile!=NULL) XMLString::release(&pszFile);
	if(pszText!=NULL) XMLString::release(&pszText);
}

// SAX ErrorHandler callback for recoverable errors; prints only when
// m_bShowErrors is set.
void CSAXFileHandlers::error(const SAXParseException& e)
{
	if(m_bShowErrors)
		ReportParseProblem("Error",e);
}

// SAX ErrorHandler callback for non-recoverable errors; prints only when
// m_bShowErrors is set.
void CSAXFileHandlers::fatalError(const SAXParseException& e)
{
	if(m_bShowErrors)
		ReportParseProblem("Fatal Error",e);
}

// SAX ErrorHandler callback for warnings; prints only when m_bShowErrors is set.
void CSAXFileHandlers::warning(const SAXParseException& e)
{
	if(m_bShowErrors)
		ReportParseProblem("Warning",e);
}

// ErrorHandler callback; this handler keeps no error state, so there is
// nothing to reset.
void CSAXFileHandlers::resetErrors()
{
}

bool CSAXFileHandlers::Convert()
{
	CString		strTmp;
	FILE *stream;
	if(m_nTargetFormat!=0)
	{
		if( (stream = fopen(m_pTarget, "w+" )) == NULL )
		{
			strTmp= "无法创建文件";
			strTmp+=m_pTarget;
			strTmp+="!";
			AfxMessageBox(strTmp);
			return false;
		}
	}
	
    try {
        XMLPlatformUtils::Initialize();
    }
    catch (const XMLException&) {
		CMessage::PrintError("XML解析器初始化失败!");
        return false;
    }

    SAX2XMLReader* parser = XMLReaderFactory::createXMLReader();

    parser->setContentHandler(&theSaxFileHandler);
    parser->setErrorHandler(&theSaxFileHandler);
	parser->setFeature(XMLUni::fgSAX2CoreValidation, false);
	parser->setExitOnFirstFatalError(false);
	if(m_nTargetFormat!=0) m_pStream=stream;
	if(m_nSourceFormat==0)
	{
		HANDLE hFinder;
		LPWIN32_FIND_DATA lpFindFileData;
		lpFindFileData  = new WIN32_FIND_DATA;
		strTmp=m_pSource;
		strTmp+="\\*.sgm";
		hFinder = ::FindFirstFile(strTmp,lpFindFileData );
		do
		{
			if(lpFindFileData==NULL || !strcmp(lpFindFileData->cFileName,".") ||
				!strcmp(lpFindFileData->cFileName,"..") )
				continue;

			if(!(lpFindFileData->dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
			{
				strTmp=m_pSource;
				strTmp+="\\";
				strTmp+=lpFindFileData->cFileName;			
				try
				{
					// 在使用前将字符串转换成为宽字符,这样才能使用中文路径
					wchar_t xmlFile[MAX_PATH];
					int len=MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,strTmp.GetBuffer(0),strTmp.GetLength(),xmlFile,MAX_PATH);
					xmlFile[len]=0;
					parser->parse(xmlFile);
				}
				catch (const XMLException&)
				{
					AfxMessageBox("error!");
				}
			}
		}while(::FindNextFile(hFinder,lpFindFileData));  // process the catalog dir;
		delete	lpFindFileData;
	}
	else
	{
		try
		{
			// 在使用前将字符串转换成为宽字符,这样才能使用中文路径
			wchar_t xmlFile[MAX_PATH];
			int len=MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,m_pSource,
				strlen(m_pSource),xmlFile,MAX_PATH);
			xmlFile[len]=0;
			parser->parse(xmlFile);
		}
		catch (const XMLException&)
		{
			AfxMessageBox("error!");
		}
	}
	delete parser;
	XMLPlatformUtils::Terminate();
	if(m_nTargetFormat!=0) fclose(stream);
	return true;
}

// Resets the per-document buffers: the topic list is emptied and the id,
// title and content strings are truncated to zero length.
void CSAXFileHandlers::clear()
{
	m_astrTopics.RemoveAll();
	m_pDocID[0]='\0';
	m_pTitle[0]='\0';
	m_pContent[0]='\0';
}

void CSAXFileHandlers::writeContent()
{
	if(m_astrTopics.GetSize()<=0||m_pDocID[0]=='\0') return;
	if(m_nTargetFormat==0)
	{
		if(m_pTitle[0]=='\0'&&m_pContent[0]=='\0') return;
		//创建文件
		char fileName[MAX_PATH];
		int i=0,nFirstTopic=0;
		if(m_nCategoryNum==0)
		{
			nFirstTopic=-1;
			for(i=0;i<m_astrTopics.GetSize();i++)
			{
				if(IsInTop10Category(m_astrTopics[0].GetBuffer(0)))
				{
					nFirstTopic=i;	
					break;
				}
			}
			if(nFirstTopic<0||nFirstTopic>=m_astrTopics.GetSize()) return;
		}
		sprintf(fileName,"%s\\%s",m_pTarget,m_astrTopics[nFirstTopic]);
		if(_chdir(fileName)<0) _mkdir(fileName);
		strcat(fileName,"\\");
		strcat(fileName,m_pDocID);
		strcat(fileName,".txt");
		if((m_pStream=fopen(fileName,"w+"))==NULL) return;
		//写文件的TITLE
		fprintf(m_pStream,"%s\n",m_pTitle);
		//写文件的内容
		fprintf(m_pStream,"%s\n",m_pContent);		
		fclose(m_pStream);

		char copyFile[MAX_PATH];
		for(i=nFirstTopic;i<m_astrTopics.GetSize();i++)
		{
			if(m_nCategoryNum==0&&!IsInTop10Category(m_astrTopics[i].GetBuffer(0))) continue;
			sprintf(copyFile,"%s\\%s",m_pTarget,m_astrTopics[i]);
			if(_chdir(copyFile)<0) _mkdir(copyFile);
			strcat(copyFile,"\\");
			strcat(copyFile,m_pDocID);
			strcat(copyFile,".txt");
			CopyFile(fileName,copyFile,false);
		}
	}
	else
	{
		if(m_pStream!=NULL)
		{
			CString strTopics;
			if(m_nCategoryNum==0)
				GetTopicString(m_astrTopics,strTopics,true);
			else
				GetTopicString(m_astrTopics,strTopics,false);
			if(!strTopics.IsEmpty())
			{
				//写文件的ID
				fprintf(m_pStream,".I %s\n",m_pDocID);
				//写文件的TOPICS
				fprintf(m_pStream,".C\n");
				fprintf(m_pStream,"%s\n",strTopics);
				//写文件的TITLE
				fprintf(m_pStream,".T\n%s\n",m_pTitle);
				//写文件的内容
				fprintf(m_pStream,".W\n%s\n",m_pContent);
			}
		}
	}	
}

bool CSAXFileHandlers::IsInTop10Category(char * name)
{
	bool bResults=false;
	CString strName=name;
	strName.MakeLower();
	for(int i=0;i<m_astrTop10Category.GetSize();i++)
	{
		if(m_astrTop10Category[i]==strName)
		{
			bResults=true;
			break;
		}
	}
	return bResults;
}

// Builds "<topic> 1; <topic> 1; ... <topic> 1" from the given topic list.
//
// astrToipics : topics to render.
// strTopic    : receives the result; emptied first and left empty when no
//               topic is kept.
// bTop10      : when true, only topics accepted by IsInTop10Category() are kept.
//
// Returns the number of topics written.
short CSAXFileHandlers::GetTopicString(CStringArray &astrToipics, CString &strTopic, bool bTop10)
{
	short nKept=0;
	strTopic.Empty();

	for(int idx=0;idx<astrToipics.GetSize();idx++)
	{
		// With the filter on, skip anything outside the top-10 list.
		if(bTop10&&!IsInTop10Category(astrToipics[idx].GetBuffer(0)))
			continue;

		strTopic+=(astrToipics[idx]+" 1; ");
		++nKept;
	}

	// Drop the "; " that follows the last entry.
	if(nKept>0)
		strTopic=strTopic.Left(strTopic.GetLength()-2);
	return nKept;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -