📄 saxfilehandlers.cpp
字号:
// SAXFileHandlers.cpp: implementation of the CSAXFileHandlers class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "SAXFileHandlers.h"
#include "Message.h"
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/sax2/SAX2XMLReader.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/sax2/Attributes.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax/SAXException.hpp>
#include <direct.h>
// Debug-build boilerplate: when _DEBUG is defined, THIS_FILE holds this
// file's name and `new` is redirected to DEBUG_NEW.
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
// File-level handler instance. Convert() below registers this object with the
// parser as both its content handler and its error handler.
CSAXFileHandlers theSaxFileHandler;
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Puts the state machine in its idle state (0), sets both format selectors
// to 0 and fills the list of category names checked by IsInTop10Category().
CSAXFileHandlers::CSAXFileHandlers()
{
    m_nSourceFormat = 0;
    m_nTargetFormat = 0;
    m_nStatus = 0;

    // Names are stored in lower case; IsInTop10Category() lower-cases the
    // candidate before comparing against them.
    static const char* const kTop10Names[] =
    {
        "acq",
        "corn",
        "crude",
        "earn",
        "grain",
        "interest",
        "money-fx",
        "ship",
        "trade",
        "wheat"
    };
    const int nNames = sizeof(kTop10Names) / sizeof(kTop10Names[0]);
    for (int idx = 0; idx < nNames; ++idx)
        m_astrTop10Category.Add(kTop10Names[idx]);
}
// Destructor. The body is empty: nothing is released explicitly here.
CSAXFileHandlers::~CSAXFileHandlers()
{
}
// ---------------------------------------------------------------------------
// CSAXFileHandlers: implementation of the SAX2 content-handler callbacks
// ---------------------------------------------------------------------------
// SAX2 callback for an opening tag; advances the per-document state machine.
//
// m_nStatus values as used in this file:
//   0 - idle, or the current document is being ignored
//   1 - inside a REUTERS element whose TOPICS attribute is "YES"
//   2 - inside the TOPICS element
//   3 - a D element has been opened inside TOPICS
//   4 - TOPICS has been closed (set by endElement)
//   5 - inside the TEXT element
//
// Parameters:
//   uri, localname - unused here.
//   qname          - element name, compared case-insensitively.
//   attrs          - element attributes; only read for REUTERS.
void CSAXFileHandlers::startElement(const XMLCh* const uri
                                    , const XMLCh* const localname
                                    , const XMLCh* const qname
                                    , const Attributes& attrs)
{
    // The TEXT sub-element flags are reset on every opening tag, so at most
    // one of them is set: the one for the element that was opened last.
    m_bInAuthor=false;
    m_bInDateLine=false;
    m_bInTitle=false;

    if(XMLString::compareIString(qname,L"REUTERS")==0)
    {
        // Start a new document: drop everything collected for the previous one.
        clear();
        m_nStatus=1;

        // transcode() returns a buffer the caller owns; it is released at the
        // end of this branch. A missing NEWID attribute is treated as "".
        char *id=XMLString::transcode(attrs.getValue(L"NEWID"));
        const char *idText=(id!=NULL)?id:"";

        char info[50]="正在解析文档";
        // Bounded append so an unexpectedly long id cannot overrun info[].
        strncat(info,idText,sizeof(info)-strlen(info)-1);
        CMessage::PrintStatusInfo(info);

        if(XMLString::compareIString(attrs.getValue(L"TOPICS"),L"YES")==0)
        {
            // The document id is recorded only when the document belongs to
            // the selected split/set combination; writeContent() skips
            // documents whose id is empty.
            if((m_nSplitType==0&&m_nDocsSetType==0&&XMLString::compareIString(attrs.getValue(L"LEWISSPLIT"),L"TRAIN")==0)
                ||(m_nSplitType==0&&m_nDocsSetType==1&&XMLString::compareIString(attrs.getValue(L"LEWISSPLIT"),L"TEST")==0)
                ||(m_nSplitType==1&&m_nDocsSetType==0&&XMLString::compareIString(attrs.getValue(L"CGISPLIT"),L"TRAINING-SET")==0)
                ||(m_nSplitType==1&&m_nDocsSetType==1&&XMLString::compareIString(attrs.getValue(L"CGISPLIT"),L"PUBLISHED-TESTSET")==0))
            {
                // NOTE(review): unbounded copy; the capacity of m_pDocID is
                // declared outside this file - confirm it fits any NEWID.
                strcpy(m_pDocID,idText);
            }
        }
        else
            m_nStatus=0;

        if(id!=NULL) XMLString::release(&id);
    }
    else if(XMLString::compareIString(qname,L"TOPICS")==0)
    {
        if(m_nStatus==1) m_nStatus=2;
        else m_nStatus=0;
        m_astrTopics.RemoveAll();
    }
    else if(XMLString::compareIString(qname,L"D")==0)
    {
        if(m_nStatus==2) m_nStatus=3;
    }
    else if(XMLString::compareIString(qname,L"TEXT")==0)
    {
        if(m_nStatus==4) m_nStatus=5;
        else m_nStatus=0;
    }
    else if(XMLString::compareIString(qname,L"AUTHOR")==0)
    {
        if(m_nStatus==5) m_bInAuthor=true;
        else m_nStatus=0;
    }
    else if(XMLString::compareIString(qname,L"DATELINE")==0)
    {
        if(m_nStatus==5) m_bInDateLine=true;
        else m_nStatus=0;
    }
    else if(XMLString::compareIString(qname,L"TITLE")==0)
    {
        if(m_nStatus==5) m_bInTitle=true;
        else m_nStatus=0;
    }
}
// SAX2 callback for a closing tag.
//   - </TOPICS> while in state 2 or 3 moves the state machine to 4.
//   - </TEXT> while in state 5 writes the collected document, clears the
//     buffers and returns to state 0.
// uri and localname are unused; qname is compared case-insensitively.
void CSAXFileHandlers::endElement(const XMLCh *const uri,
                                  const XMLCh* const localname,
                                  const XMLCh* const qname)
{
    switch (m_nStatus)
    {
    case 2:
    case 3:
        if (XMLString::compareIString(qname, L"TOPICS") == 0)
            m_nStatus = 4;
        break;

    case 5:
        if (XMLString::compareIString(qname, L"TEXT") == 0)
        {
            writeContent();
            clear();
            m_nStatus = 0;
        }
        break;

    default:
        break;
    }
}
void CSAXFileHandlers::characters( const XMLCh* const chars
, const unsigned int length)
{
if(m_nStatus==3)
{
char *str=XMLString::transcode(chars);
if(str[0]!='\0'&&str[0]!='\t'&&str[0]!='\n') m_astrTopics.Add(str);
}
else if(m_nStatus==5&&m_bInTitle)
{
char *str=XMLString::transcode(chars);
if(str[0]!='\0'&&
((str[0]!='\t')&&(str[1]!='\0'))&&
((str[0]!='\n')&&(str[1]!='\0')))
strcat(m_pTitle,str);
}
else if(m_nStatus==5&&m_pStream!=NULL&&
!m_bInAuthor&&!m_bInDateLine)
{
char *str=XMLString::transcode(chars);
if(str[0]!='\0'&&
((str[0]!='\t')&&(str[1]!='\0'))&&
((str[0]!='\n')&&(str[1]!='\0')))
strcat(m_pContent,str);
}
}
// ---------------------------------------------------------------------------
void CSAXFileHandlers::error(const SAXParseException& e)
{
if(m_bShowErrors)
{
char info[MAX_PATH];
sprintf(info,"\r\nFatal Error at file %s , line %d, char %d\r\n Message: %s",
XMLString::transcode(e.getSystemId()),
e.getLineNumber(),
e.getColumnNumber(),
XMLString::transcode(e.getMessage()));
CMessage::PrintInfo(info);
}
}
void CSAXFileHandlers::fatalError(const SAXParseException& e)
{
if(m_bShowErrors)
{
char info[MAX_PATH];
sprintf(info,"\r\nError at file %s , line %d, char %d\r\n Message: %s",
XMLString::transcode(e.getSystemId()),
e.getLineNumber(),
e.getColumnNumber(),
XMLString::transcode(e.getMessage()));
CMessage::PrintInfo(info);
}
}
void CSAXFileHandlers::warning(const SAXParseException& e)
{
if(m_bShowErrors)
{
char info[MAX_PATH];
sprintf(info,"\r\nWarning at file %s , line %d, char %d\r\n Message: %s",
XMLString::transcode(e.getSystemId()),
e.getLineNumber(),
e.getColumnNumber(),
XMLString::transcode(e.getMessage()));
CMessage::PrintInfo(info);
}
}
// Error-handler callback; a no-op here (the body is empty).
void CSAXFileHandlers::resetErrors()
{
}
bool CSAXFileHandlers::Convert()
{
CString strTmp;
FILE *stream;
if(m_nTargetFormat!=0)
{
if( (stream = fopen(m_pTarget, "w+" )) == NULL )
{
strTmp= "无法创建文件";
strTmp+=m_pTarget;
strTmp+="!";
AfxMessageBox(strTmp);
return false;
}
}
try {
XMLPlatformUtils::Initialize();
}
catch (const XMLException&) {
CMessage::PrintError("XML解析器初始化失败!");
return false;
}
SAX2XMLReader* parser = XMLReaderFactory::createXMLReader();
parser->setContentHandler(&theSaxFileHandler);
parser->setErrorHandler(&theSaxFileHandler);
parser->setFeature(XMLUni::fgSAX2CoreValidation, false);
parser->setExitOnFirstFatalError(false);
if(m_nTargetFormat!=0) m_pStream=stream;
if(m_nSourceFormat==0)
{
HANDLE hFinder;
LPWIN32_FIND_DATA lpFindFileData;
lpFindFileData = new WIN32_FIND_DATA;
strTmp=m_pSource;
strTmp+="\\*.sgm";
hFinder = ::FindFirstFile(strTmp,lpFindFileData );
do
{
if(lpFindFileData==NULL || !strcmp(lpFindFileData->cFileName,".") ||
!strcmp(lpFindFileData->cFileName,"..") )
continue;
if(!(lpFindFileData->dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
{
strTmp=m_pSource;
strTmp+="\\";
strTmp+=lpFindFileData->cFileName;
try
{
// 在使用前将字符串转换成为宽字符,这样才能使用中文路径
wchar_t xmlFile[MAX_PATH];
int len=MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,strTmp.GetBuffer(0),strTmp.GetLength(),xmlFile,MAX_PATH);
xmlFile[len]=0;
parser->parse(xmlFile);
}
catch (const XMLException&)
{
AfxMessageBox("error!");
}
}
}while(::FindNextFile(hFinder,lpFindFileData)); // process the catalog dir;
delete lpFindFileData;
}
else
{
try
{
// 在使用前将字符串转换成为宽字符,这样才能使用中文路径
wchar_t xmlFile[MAX_PATH];
int len=MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,m_pSource,
strlen(m_pSource),xmlFile,MAX_PATH);
xmlFile[len]=0;
parser->parse(xmlFile);
}
catch (const XMLException&)
{
AfxMessageBox("error!");
}
}
delete parser;
XMLPlatformUtils::Terminate();
if(m_nTargetFormat!=0) fclose(stream);
return true;
}
// Resets the per-document state: the collected topic list is emptied and the
// id, title and content buffers are each set to the empty string.
void CSAXFileHandlers::clear()
{
    m_astrTopics.RemoveAll();

    m_pDocID[0] = '\0';
    m_pTitle[0] = '\0';
    m_pContent[0] = '\0';
}
void CSAXFileHandlers::writeContent()
{
if(m_astrTopics.GetSize()<=0||m_pDocID[0]=='\0') return;
if(m_nTargetFormat==0)
{
if(m_pTitle[0]=='\0'&&m_pContent[0]=='\0') return;
//创建文件
char fileName[MAX_PATH];
int i=0,nFirstTopic=0;
if(m_nCategoryNum==0)
{
nFirstTopic=-1;
for(i=0;i<m_astrTopics.GetSize();i++)
{
if(IsInTop10Category(m_astrTopics[0].GetBuffer(0)))
{
nFirstTopic=i;
break;
}
}
if(nFirstTopic<0||nFirstTopic>=m_astrTopics.GetSize()) return;
}
sprintf(fileName,"%s\\%s",m_pTarget,m_astrTopics[nFirstTopic]);
if(_chdir(fileName)<0) _mkdir(fileName);
strcat(fileName,"\\");
strcat(fileName,m_pDocID);
strcat(fileName,".txt");
if((m_pStream=fopen(fileName,"w+"))==NULL) return;
//写文件的TITLE
fprintf(m_pStream,"%s\n",m_pTitle);
//写文件的内容
fprintf(m_pStream,"%s\n",m_pContent);
fclose(m_pStream);
char copyFile[MAX_PATH];
for(i=nFirstTopic;i<m_astrTopics.GetSize();i++)
{
if(m_nCategoryNum==0&&!IsInTop10Category(m_astrTopics[i].GetBuffer(0))) continue;
sprintf(copyFile,"%s\\%s",m_pTarget,m_astrTopics[i]);
if(_chdir(copyFile)<0) _mkdir(copyFile);
strcat(copyFile,"\\");
strcat(copyFile,m_pDocID);
strcat(copyFile,".txt");
CopyFile(fileName,copyFile,false);
}
}
else
{
if(m_pStream!=NULL)
{
CString strTopics;
if(m_nCategoryNum==0)
GetTopicString(m_astrTopics,strTopics,true);
else
GetTopicString(m_astrTopics,strTopics,false);
if(!strTopics.IsEmpty())
{
//写文件的ID
fprintf(m_pStream,".I %s\n",m_pDocID);
//写文件的TOPICS
fprintf(m_pStream,".C\n");
fprintf(m_pStream,"%s\n",strTopics);
//写文件的TITLE
fprintf(m_pStream,".T\n%s\n",m_pTitle);
//写文件的内容
fprintf(m_pStream,".W\n%s\n",m_pContent);
}
}
}
}
bool CSAXFileHandlers::IsInTop10Category(char * name)
{
bool bResults=false;
CString strName=name;
strName.MakeLower();
for(int i=0;i<m_astrTop10Category.GetSize();i++)
{
if(m_astrTop10Category[i]==strName)
{
bResults=true;
break;
}
}
return bResults;
}
// Builds the topic line for a record.
//   astrToipics - topic names to consider.
//   strTopic    - receives the result: each accepted topic followed by
//                 " 1", entries separated by "; ". Empty when none accepted.
//   bTop10      - when true, only topics accepted by IsInTop10Category()
//                 are included; when false, all topics are.
// Returns the number of topics included.
short CSAXFileHandlers::GetTopicString(CStringArray &astrToipics, CString &strTopic, bool bTop10)
{
    short nAdded = 0;
    strTopic.Empty();

    for (int idx = 0; idx < astrToipics.GetSize(); ++idx)
    {
        if (bTop10 && !IsInTop10Category(astrToipics[idx].GetBuffer(0)))
            continue;

        strTopic += astrToipics[idx];
        strTopic += " 1; ";
        ++nAdded;
    }

    // Drop the trailing "; " left after the last entry.
    if (nAdded > 0)
        strTopic = strTopic.Left(strTopic.GetLength() - 2);

    return nAdded;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -