📄 xmlparser.cpp
字号:
/**
 ****************************************************************************
 * <P> XML.c - implementation file for basic XML parser written in ANSI C++
 * for portability. It works by using recursion and a node tree for breaking
 * down the elements of an XML document. </P>
 *
 * @version     V1.11
 *
 * @author      Frank Vanden Berghen
 * based on original implementation by Martyn C Brown
 *
 * NOTE:
 *
 *   If you add "#define STRICT_PARSING", on the first line of this file
 *   the parser will see the following XML-stream:
 *      <a><b>some text</b><b>other text    </a>
 *   as an error. Otherwise, this string will be equivalent to:
 *      <a><b>some text</b><b>other text</b></a>
 *
 * NOTE:
 *
 *   If you add "#define APPROXIMATE_PARSING", on the first line of this file
 *   the parser will see the following XML-stream:
 *     <data name="n1">
 *     <data name="n2">
 *     <data name="n3" />
 *   as equivalent to the following XML-stream:
 *     <data name="n1" />
 *     <data name="n2" />
 *     <data name="n3" />
 *   This can be useful for badly-formed XML-streams but prevents the use
 *   of the following XML-stream:
 *     <data name="n1">
 *        <data name="n2">
 *            <data name="n3" />
 *        </data>
 *     </data>
 *
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1 as published by the Free Software Foundation
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 ****************************************************************************
 */#ifdef WIN32#define WIN32_LEAN_AND_MEAN
#include <Windows.h> // to have IsTextUnicode, MultiByteToWideChar, WideCharToMultiByte // to handle unicode files#endif#include <memory.h>#include <assert.h>#include <stdio.h>#include <string.h>#include <stdlib.h>#include "xmlParser.h"//#ifdef WIN32
//#ifdef _DEBUG
//#define _CRTDBG_MAP_ALLOC
//#include <crtdbg.h>
//#endif
//#endif
XMLNode XMLNode::emptyXMLNode;XMLClear XMLNode::emptyXMLClear={ NULL, NULL, NULL};XMLAttribute XMLNode::emptyXMLAttribute={ NULL, NULL};#ifndef WIN32int _tcslen(const char *c) { return strlen(c); }int _tcsnicmp(const char *c1, const char *c2, int l) { return strncasecmp(c1,c2,l); }int _tcsicmp(const char *c1, const char *c2) { return strcasecmp(c1,c2); }char *_tcsstr(const char *c1, const char *c2) { return (char*)strstr(c1,c2); }char *_tcschr(const char *c1, int c2) { return (char*)strchr(c1,c2); }char *_tcscpy(char *c1, const char *c2) { return (char*)strcpy(c1,c2); }#endifinline int mmin( const int t1, const int t2 ) { return t1 < t2 ? t1 : t2; }// Enumeration used to decipher what type a token istypedef enum TokenTypeTag{ eTokenText = 0, eTokenQuotedText, eTokenTagStart, /* "<" */ eTokenTagEnd, /* "</" */ eTokenCloseTag, /* ">" */ eTokenEquals, /* "=" */ eTokenDeclaration, /* "<?" */ eTokenShortHandClose, /* "/>" */ eTokenClear, eTokenError};#define INDENTCHAR _T('\t')typedef struct ClearTag{ LPCTSTR lpszOpen; LPCTSTR lpszClose;} ClearTag;// Main structure used for parsing XMLtypedef struct XML{ LPCTSTR lpXML; int nIndex; enum XMLError error; LPCTSTR lpEndTag; int cbEndTag; LPCTSTR lpNewElement; int cbNewElement; int nFirst; ClearTag *pClrTags;} XML;typedef struct{ ClearTag *pClr; LPCTSTR pStr;} NextToken;// Enumeration used when parsing attributestypedef enum Attrib{ eAttribName = 0, eAttribEquals, eAttribValue} Attrib;// Enumeration used when parsing elements to dictate whether we are currently// inside a tagtypedef enum Status{ eInsideTag = 0, eOutsideTag} Status;// private:LPTSTR toXMLString(LPTSTR dest,LPCTSTR source){ LPTSTR dd=dest; while (*source) { switch (*source) { case '<' : _tcscpy(dest,_T("<" )); dest+=4; break; case '>' : _tcscpy(dest,_T(">" )); dest+=4; break; case '&' : _tcscpy(dest,_T("&" )); dest+=5; break; case '\'': _tcscpy(dest,_T("'")); dest+=6; break; case '"' : _tcscpy(dest,_T(""")); dest+=6; break; default: *dest=*source; dest++; 
break; } source++; } *dest=0; return dd;}// private:int lengthXMLString(LPCTSTR source){ int r=0; while (*source) { switch (*source) { case '<': r+=3; break; case '>' : r+=3; break; case '&' : r+=4; break; case '\'': r+=5; break; case '"' : r+=5; break; } source++; r++; } return r;}LPTSTR toXMLString(LPCTSTR source){ LPTSTR dest=(LPTSTR)malloc((lengthXMLString(source)+1)*sizeof(TCHAR)); return toXMLString(dest,source);}LPTSTR toXMLStringFast(LPTSTR *dest,int *destSz, LPCTSTR source){ int l=lengthXMLString(source)+1; if (l>*destSz) { *destSz=l; *dest=(LPTSTR)realloc(*dest,l*sizeof(TCHAR)); } return toXMLString(*dest,source);}// private:LPTSTR fromXMLString(LPCTSTR s, int lo){ // This function is the opposite of the function "toXMLString". It decodes the escape // sequences &, ", ', <, > and replace them by the characters // &,",',<,>. This function is used internally by the XML Parser. All the calls to // the XML library will always gives you back "decoded" strings. // // in: string (s) and length (lo) of string // out: new allocated string converted from xml if (!s) return NULL; int ll=0; LPTSTR d; LPCTSTR ss=s; while (((lo--)>0)&&(*s)) { if (*s==_T('&')) { s++; if (_tcsnicmp(s,_T("lt;" ),3)==0) { s+=2; lo-=3; } else if (_tcsnicmp(s,_T("gt;" ),3)==0) { s+=2; lo-=3; } else if (_tcsnicmp(s,_T("amp;" ),4)==0) { s+=3; lo-=4; } else if (_tcsnicmp(s,_T("apos;"),5)==0) { s+=4; lo-=5; } else if (_tcsnicmp(s,_T("quot;"),5)==0) { s+=4; lo-=5; } else { ll=0; while (s[ll]&&(s[ll]!=_T(';'))&&(ll<10)) ll++; ll++; d=(LPTSTR)malloc((ll+1)*sizeof(TCHAR)); d[ll]=0; while(ll--) d[ll]=s[ll];#ifdef _UNICODE printf("unknown escape character: '&%S'",d);#else printf("unknown escape character: '&%s'",d);#endif free(d); exit(255); } }; ll++; s++; } d=(LPTSTR)malloc((ll+1)*sizeof(TCHAR)); s=d; while (ll--) { if (*ss==_T('&')) { ss++; if (_tcsnicmp(ss,_T("lt;" ),3)==0) { *(d++)=_T('<' ); ss+=3; } else if (_tcsnicmp(ss,_T("gt;" ),3)==0) { *(d++)=_T('>' ); ss+=3; } else if 
(_tcsnicmp(ss,_T("amp;" ),4)==0) { *(d++)=_T('&' ); ss+=4; } else if (_tcsnicmp(ss,_T("apos;"),5)==0) { *(d++)=_T('\''); ss+=5; } else { *(d++)=_T('"' ); ss+=5; } } else { *(d++)=*ss; ss++; } } *d=0; return (LPTSTR)s;}// private:char myTagCompare(LPCTSTR cclose, LPCTSTR copen)// !!!! WARNING strange convention&:// return 0 if equals// return 1 if different{ if (!cclose) return 1; int l=(int)_tcslen(cclose); if (_tcsnicmp(cclose, copen, l)!=0) return 1; const TCHAR c=copen[l]; if ((c==_T('\n'))|| (c==_T(' ' ))|| (c==_T('\t'))|| (c==_T('\r'))|| (c==_T('/' ))|| (c==_T('<' ))|| (c==_T('>' ))|| (c==_T('=' ))) return 0; return 1;}// private:// update "order" information when deleting a content of a XMLNodevoid XMLNode::removeOrderElement(XMLNodeData *d, XMLElementType t, int index){ int j=(int)((index<<2)+t),i=0,n=nElement(d)+1, *o=d->pOrder; while ((o[i]!=j)&&(i<n)) i++; n--; memmove(o+i, o+i+1, (n-i)*sizeof(int)); for (;i<n;i++) if ((o[i]&3)==(int)t) o[i]-=4;// We should normally do:// d->pOrder=(int)realloc(d->pOrder,n*sizeof(int));// but we skip reallocation because it's too time consuming.// Anyway, at the end, it will be free'd completely at once.}// Obtain the next character from the string.static inline TCHAR getNextChar(XML *pXML){ TCHAR ch = pXML->lpXML[pXML->nIndex]; if (ch!=0) pXML->nIndex++; return ch;}// Find next non-white space character.static TCHAR FindNonWhiteSpace(XML *pXML){ TCHAR ch; int nExit = FALSE; assert(pXML); // Iterate through characters in the string until we find a NULL or a // non-white space character while((nExit == FALSE) && (ch = getNextChar(pXML))) { switch(ch) { // Ignore white space case _T('\n'): case _T(' '): case _T('\t'): case _T('\r'): continue; default: nExit = TRUE; } } return ch;}// Find the next token in a string.// pcbToken contains the number of characters that have been read.static NextToken GetNextToken(XML *pXML, int *pcbToken, enum TokenTypeTag *pType){ NextToken result; LPCTSTR lpXML; TCHAR ch; TCHAR chTemp; int 
nSize; int nFoundMatch; int nExit; int n; LPCTSTR lpszOpen; int cbOpen; int nIsText = FALSE; // Find next non-white space character ch = FindNonWhiteSpace(pXML); if (ch) { // Cache the current string pointer lpXML = pXML->lpXML; result.pStr = &lpXML[pXML->nIndex-1]; // First check whether the token is in the clear tag list (meaning it // does not need formatting). n = 0; while(TRUE) { // Obtain the name of the open part of the clear tag lpszOpen = pXML->pClrTags[n].lpszOpen; if (lpszOpen) { // Compare the open tag with the current token cbOpen = (int)_tcslen(lpszOpen); // if (myTagCompare(lpszOpen, result.pStr) == 0) if (_tcsnicmp(lpszOpen, result.pStr, cbOpen)==0) { result.pClr = &pXML->pClrTags[n]; pXML->nIndex += (int)(_tcslen(lpszOpen)-1); *pType = eTokenClear; return result; } n++; } else break; } // If we didn't find a clear tag then check for standard tokens chTemp = 0; lpXML = pXML->lpXML; switch(ch) { // Check for quotes case _T('\''): case _T('\"'): // Type of token *pType = eTokenQuotedText; chTemp = ch; n=pXML->nIndex; // Set the size nSize = 1; nFoundMatch = FALSE; // Search through the string to find a matching quote while((ch = getNextChar(pXML))) { nSize++; if (ch==chTemp) { nFoundMatch = TRUE; break; } if (ch==_T('<')) break; } // If we failed to find a matching quote if (nFoundMatch == FALSE) { pXML->nIndex=n-1; ch=getNextChar(pXML); nIsText=TRUE; break; } // 4.02.2002 if (FindNonWhiteSpace(pXML)) { pXML->nIndex--; } break; // Equals (used with attribute values) case _T('='): nSize = 1; *pType = eTokenEquals; break; // Close tag case _T('>'): nSize = 1; *pType = eTokenCloseTag; break; // Check for tag start and tag end case _T('<'): // Peek at the next character to see if we have an end tag '</', // or an xml declaration '<?' chTemp = pXML->lpXML[pXML->nIndex]; // If we have a tag end... 
if (chTemp == _T('/')) { // Set the type and ensure we point at the next character getNextChar(pXML); *pType = eTokenTagEnd; nSize = 2; } // If we have an XML declaration tag else if (chTemp == _T('?')) { // Set the type and ensure we point at the next character getNextChar(pXML); *pType = eTokenDeclaration; nSize = 2; } // Otherwise we must have a start tag else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -