📄 textparser.cpp
字号:
/*
* Copyright (c) 2001,2002,2003 Mike Matsnev. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* Mike Matsnev.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $Id: TextParser.cpp,v 1.53.2.4 2003/10/21 23:26:24 mike Exp $
*
*/
#include <afxtempl.h>
#include <afxcmn.h>
#include "TextParser.h"
#include "XMLParser.h"
#include "Unicode.h"
#include "FastArray.h"
#include "Image.h"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
struct Para { // paragraph reference
DWORD off; // offset into file
DWORD start; // start of parsed para
DWORD rlen; // raw length
};
typedef FastArray<Para> PArray;
struct SimpleFormat {
const TCHAR *name;
DWORD mask,cmp;
};
#define MAXPLEN 4096
// soft hyphen
#define SHY 0x00AD
// Moshkov's formatting
#define STARTBOLD 20
#define ENDBOLD 21
// formats list
static SimpleFormat g_simple_formats[]={
{ _T("Line per paragraph"), 0xff, 0x0a },
{ _T("Indented first line"), 0xffff, 0x0a20 },
{ _T("MAC Line per paragraph"), 0xff, 0x0d },
{ _T("MAC Indented first line"), 0xffff, 0x0d20 }
};
#define NUM_SIMPLE_FORMATS (sizeof(g_simple_formats)/sizeof(g_simple_formats[0]))
static const TCHAR *g_ext_formats[]={
_T("XML"),
_T("PNG"),
_T("JPEG")
};
#define NUM_EXT_FORMATS (sizeof(g_ext_formats)/sizeof(g_ext_formats[0]))
#define XML_FORMAT (NUM_SIMPLE_FORMATS)
#define PNG_FORMAT (NUM_SIMPLE_FORMATS+1)
#define JPEG_FORMAT (NUM_SIMPLE_FORMATS+2)
TextParser::~TextParser() {
if (m_heap)
HeapDestroy(m_heap);
}
class SimpleTextParser : public TextParser {
protected:
PArray m_pp; // paragraph list
SimpleFormat *m_sf;
void GenericFileParse(CBufFile *fp,PArray& pp,DWORD mask,DWORD cmp);
Paragraph GenericBufParse(CBufFile *fp,const Para& p,int len);
public:
SimpleTextParser(Meter *m,CBufFile *fp,HANDLE heap,int format,int encoding,Bookmarks *bmk) :
TextParser(m,fp,heap,bmk), m_sf(&g_simple_formats[format]), m_pp(heap)
{
m_format=format;
m_encoding=encoding;
if (m_encoding<0) {
char tmp[1024];
DWORD sp=m_fp->pos();
int nb=m_fp->read(tmp,sizeof(tmp));
m_encoding=Unicode::DetectCodePage(tmp,nb);
m_fp->seek(sp);
}
GenericFileParse(m_fp,m_pp,m_sf->mask,m_sf->cmp);
if (bmk) {
for (int i=0;i<Length(0)-1;++i)
if ((i==0 || GetPLength(0,i-1)==0) && GetPLength(0,i+1)==0 &&
GetPLength(0,i)>0 && GetPLength(0,i)<90)
bmk->AddTocEnt(i,1,0,FilePos(i,0,0),0);
}
}
// paragraphs
virtual int Length(int docid) { return m_pp.GetSize()-1; } // in paragraphs
virtual Paragraph GetParagraph(int docid,int para) {
if (para>=0 && para<m_pp.GetSize()-1)
return GenericBufParse(m_fp,m_pp[para],m_pp[para+1].start-m_pp[para].start);
return Paragraph();
}
virtual int GetPLength(int docid,int para) {
if (para<0 || para>=m_pp.GetSize()-1)
return 0;
return m_pp[para+1].start-m_pp[para].start;
}
virtual int GetPStart(int docid,int para) {
if (para<0)
return 0;
if (para>=m_pp.GetSize()-1)
return m_pp[m_pp.GetSize()-1].start;
return m_pp[para].start;
}
virtual int GetTotalLength(int docid) {
return m_pp[m_pp.GetSize()-1].start;
}
virtual int LookupParagraph(int docid,int charpos);
};
int SimpleTextParser::LookupParagraph(int docid,int charpos) {
if (charpos<0 || charpos>(int)m_pp[m_pp.GetSize()-1].start)
return 0;
if (charpos==(int)m_pp[m_pp.GetSize()-1].start)
return m_pp.GetSize()-1;
int i=0,j=m_pp.GetSize()-1;
while (i<=j) {
int m=(i+j)>>1;
if (charpos<(int)m_pp[m].start)
j=m-1;
else if (charpos>=(int)m_pp[m+1].start)
i=m+1;
else
return m;
}
return 0;
}
#define RSPACE(x) ((x)<=32)
#define SPACE(x) (RSPACE(x) || (x)==SHY)
static void Addpara(int enc,PArray& pp,Buffer<char>& b,
int& parsed_start,int len,DWORD start)
{
// convert to unicode
int wclen=Unicode::WCLength(enc,b,len);
Buffer<wchar_t> wb(wclen);
Unicode::ToWC(enc,b,len,wb,wclen);
// now count length
int i,plen=0;
// skip leading spaces
for (i=0;i<wclen && SPACE(wb[i]);++i);
// count length
while (i<wclen) {
// word
while (i<wclen && !RSPACE(wb[i])) {
if (wb[i]!=SHY)
++plen;
++i;
}
// spaces
while (i<wclen && SPACE(wb[i]))
++i;
if (i<wclen) // this was not trailing space
++plen;
}
Para p;
p.start=parsed_start;
p.rlen=len;
p.off=start;
pp.Add(p);
parsed_start+=plen;
}
void SimpleTextParser::GenericFileParse(CBufFile *fp,PArray& pp,DWORD mask,DWORD cmp) {
int ch;
DWORD hist=0;
Buffer<char> b(MAXPLEN);
int rlen=0;
DWORD start=fp->pos();
int parsed_start=0;
for (;;) {
if ((ch=fp->ch())==BEOF) {
Addpara(m_encoding,pp,b,parsed_start,rlen,start);
break;
}
hist=hist<<8|ch;
if ((hist&0xffff)==0x0a0a || hist==0x0d0a0d0a || (hist&0xffff)==0x0d0d ||
(hist&mask)==cmp || rlen>=MAXPLEN)
{
bool f=rlen>=MAXPLEN;
ProgSetCur(fp->pos());
Addpara(m_encoding,pp,b,parsed_start,rlen,start);
rlen=0;
start=m_fp->pos();
if (f)
b[rlen++]=ch;
} else
b[rlen++]=ch;
}
Para p;
p.off=0;
p.rlen=0;
p.start=parsed_start;
pp.Add(p);
}
// generic buffer parser for all simple formats
Paragraph SimpleTextParser::GenericBufParse(CBufFile *fp,const Para& p,int len) {
if (!len)
return Paragraph();
// read entire buffer
Buffer<char> mbbuf(p.rlen);
fp->seek(p.off);
int nread=fp->read(mbbuf,p.rlen);
ASSERT(nread==(int)p.rlen);
// convert to unicode here
int wclen=Unicode::WCLength(m_encoding,mbbuf,nread);
Buffer<wchar_t> wcbuf(wclen);
Unicode::ToWC(m_encoding,mbbuf,nread,wcbuf,wclen);
// strip whitespace and soft hyphens
Paragraph ret(len);
wchar_t *bp=ret.str;
Attr *cfp=ret.cflags;
Attr fmt;
int count=0;
int i;
ret.cflags.Zero();
fmt.wa=0;
// skip leading spaces
for (i=0;i<wclen && SPACE(wcbuf[i]);++i);
// copy text
while (i<wclen && count<len) {
// copy word
while (i<wclen && count<len && !RSPACE(wcbuf[i])) {
if (wcbuf[i]!=SHY)
bp[count++]=wcbuf[i];
else { // handle hyphenation
// XXX
}
++i;
}
// skip spaces
while (i<wclen && SPACE(wcbuf[i]))
++i;
if (i<wclen && count<len) // not a trailing space
bp[count++]=' ';
}
ASSERT(len==count);
//p.len=count; // update paragraph length
ret.str.setsize(len);
ret.len=len;
ret.findent=3; // XXX
return ret;
}
// XXX depends on the order of records in g_simple_formats
int TextParser::DetectFormat(CBufFile *fp) {
int lines,ws,chars,check=_T('\n'),base=0;
Buffer<BYTE> buf(2048);
int nb;
fp->seek(0);
nb=fp->read(buf,2048);
/* check if this is some sort of xml */
if (nb>8 && (memcmp("<?xml",buf,5)==0 || memcmp("\xef\xbb\xbf<?xml",buf,8)==0))
return XML_FORMAT;
/* check for png */
if (nb>=4 && memcmp("\x89PNG",buf,4)==0)
return PNG_FORMAT;
/* check for jpeg */
if (nb>=10 && buf[0]==0xff && buf[1]==0xd8 &&
(memcmp("JFIF",buf+6,4)==0 || memcmp("Exif",buf+6,4)==0))
return JPEG_FORMAT;
/* check if this is macintosh crap with their CR madness */
for (chars=0;chars<nb;++chars) {
if (buf[chars]=='\n')
goto ok;
}
check='\r';
base=2;
ok:
/* we read first 50 lines and if more than 3 of them start with spaces,
and there are no lines longer than 80 chars, then this a spaced format */
int n=0;
for (lines=ws=0;lines<50;++lines) {
for (chars=0;;++chars) {
if (n>=nb) {
++lines;
goto done;
}
int ch=buf[n++];
if (chars==0 && ch==_T(' '))
++ws;
if (ch==check)
break;
}
if (chars>80 && lines>5) /* got a long line, but after the first five */
return base;
}
done:
if (lines>10 && ws>3)
return base+1;
return base;
}
int TextParser::GetNumFormats() {
return NUM_SIMPLE_FORMATS+1;
}
const TCHAR *TextParser::GetFormatName(int format) {
if (format<0 || format>=NUM_SIMPLE_FORMATS+NUM_EXT_FORMATS)
return _T("Invalid format ID");
if (format>=NUM_SIMPLE_FORMATS)
return g_ext_formats[format-NUM_SIMPLE_FORMATS];
return g_simple_formats[format].name;
}
/* Image parser */
class ImageParser : public TextParser, public ImageLoader::BinReader {
protected:
Image m_cache;
int m_crot,m_cmaxw,m_cmaxh;
public:
ImageParser(Meter *m,CBufFile *fp,HANDLE heap,int format,int encoding,Bookmarks *bmk) :
TextParser(m,fp,heap,bmk)
{
m_format=format;
m_encoding=encoding;
m_cache.hBmp=0;
}
~ImageParser() {
if (m_cache.hBmp)
DeleteObject(m_cache.hBmp);
}
// paragraphs
virtual int Length(int docid) { return 1; } // in paragraphs
virtual Paragraph GetParagraph(int docid,int para);
virtual int GetPLength(int docid,int para) { return para==0 ? 32 : 0; }
virtual int GetPStart(int docid,int para) { return 0; }
virtual int GetTotalLength(int docid) { return 32; }
virtual int LookupParagraph(int docid,int charpos) { return 0; }
// images
virtual bool GetImage(const wchar_t *name,HDC hDC,
int maxwidth,int maxheight,int rotation,Image& img);
virtual void InvalidateImageCache() {
if (m_cache.hBmp) {
DeleteObject(m_cache.hBmp);
m_cache.hBmp=0;
}
}
virtual bool IsImage() { return true; }
// BinReader interface
virtual int Read(void *buffer,int count) {
return m_fp->read(buffer,count);
}
};
Paragraph ImageParser::GetParagraph(int docid,int para) {
if (para!=0)
return Paragraph();
Paragraph p(ImageLoader::IMAGE_VSIZE);
p.lindent=p.rindent=p.findent=0;
for (int i=0;i<ImageLoader::IMAGE_VSIZE;++i) {
p.str[i]=L' ';
p.cflags[i].wa=0;
}
// abuse links for image href
p.links=Buffer<Paragraph::Link>(1);
p.links[0].off=0;
p.links[0].len=ImageLoader::IMAGE_VSIZE;
p.links[0].target=L"1";
p.flags=Paragraph::image;
return p;
}
bool ImageParser::GetImage(const wchar_t *name,HDC hDC,
int maxwidth,int maxheight,int rotation,Image& img)
{
if (!name || wcscmp(name,L"1"))
return false;
if (m_cache.hBmp &&
(m_cmaxw==maxwidth || (m_cache.width<m_cmaxw && m_cache.width<maxwidth)) &&
(m_cmaxh==maxheight || (m_cache.height<m_cmaxh && m_cache.height<maxheight)) &&
m_crot==rotation)
{
img=m_cache;
return true;
}
if (m_cache.hBmp) {
DeleteObject(m_cache.hBmp);
m_cache.hBmp=0;
}
m_fp->seek(0);
bool ret=ImageLoader::Load(hDC,
m_format==PNG_FORMAT ? L"image/png" : L"image/jpeg",
this,maxwidth,maxheight,rotation,
m_cache.hBmp,m_cache.width,m_cache.height);
if (ret) {
img=m_cache;
m_crot=rotation;
m_cmaxh=maxheight;
m_cmaxw=maxwidth;
} else
m_cache.hBmp=0;
return ret;
}
TextParser *TextParser::Create(Meter *m,CBufFile *fp,int format,int encoding,Bookmarks *bmk) {
if (format<0)
return NULL;
HANDLE heap;
heap=HeapCreate(HEAP_NO_SERIALIZE,1048576*4,0); // reserve up to 4 megs of ram
if (!heap)
return NULL;
TRY {
if (format<NUM_SIMPLE_FORMATS) {
fp->seek(0);
return new SimpleTextParser(m,fp,heap,format,encoding,bmk);
} else if (format==XML_FORMAT) { /* XML */
XMLParser *p=XMLParser::MakeParser(m,fp,bmk,heap);
p->m_format=NUM_SIMPLE_FORMATS;
if (p->ParseFile(encoding))
return p;
delete p;
} else if (format==PNG_FORMAT || format==JPEG_FORMAT) { /* Images */
return new ImageParser(m,fp,heap,format,encoding,bmk);
}
} CATCH_ALL(e) {
HeapDestroy(heap);
THROW_LAST();
}
END_CATCH_ALL
return NULL;
}
// hyphenation code by Mark Lipsman, modified my Mike
static BYTE vlist[0x92]={
//0 1 2 3 4 5 6 7 8 9 a b c d e f
0,1,0,0,1,0,1,1,0,0,0,0,0,0,1,0, // 0x400-0x40f
1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0, // 0x410-0x41f
0,0,0,1,0,0,0,0,0,0,2,1,3,1,1,1, // 0x420-0x42f
1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0, // 0x430-0x43f
0,0,0,1,0,0,0,0,0,0,2,1,3,1,1,1, // 0x440-0x44f
0,1,0,0,1,0,1,1,0,0,0,0,0,0,1,0, // 0x450-0x45f
};
#define isLetter(ch) ((ch)>=0x0401 && (ch)<=0x0491)
#define isVowel(ch) (vlist[(ch)-0x400]==1)
#define isHardSign(ch) (vlist[(ch)-0x400]==2)
#define isSoftSign(ch) (vlist[(ch)-0x400]==3)
#define isConsonant(ch) (vlist[(ch)-0x400]==0)
void Paragraph::Hyphenate() {
if (flags&hypdone)
return;
flags|=hypdone;
DWORD len=str.size();
if (!len)
return;
const wchar_t *s=str;
Attr *a=cflags;
DWORD start,end,i,j;
for (start=0;start<len;) {
// find start of word
while (start<len && !isLetter(s[start]))
++start;
// find end of word
for (end=start+1;end<len && isLetter(s[end]);++end) ;
// now look over word, placing hyphens
if (end-start>3) // word must be long enough
for (i=start;i<end-3;++i)
if (isVowel(s[i]))
for (j=i+1;j<end;++j)
if (isVowel(s[j])) {
if (isConsonant(s[i+1]) && isConsonant(s[i+2]))
++i;
else if (isConsonant(s[i+1]) &&
(isHardSign(s[i+2]) || isSoftSign(s[i+2])))
i+=2;
if (i-start>1 && end-i>2)
a[i+1].hyphen=true;
break;
}
start=end;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -