📄 ezxml.c
字号:
if (! root->pi[0]) *(root->pi = malloc(sizeof(char **))) = NULL; //first pi
while (root->pi[i] && strcmp(target, root->pi[i][0])) i++; // find target
if (! root->pi[i]) { // new target
root->pi = realloc(root->pi, sizeof(char **) * (i + 2));
root->pi[i] = malloc(sizeof(char *) * 3);
root->pi[i][0] = target;
root->pi[i][1] = (char *)(root->pi[i + 1] = NULL); // terminate pi list
root->pi[i][2] = strdup(""); // empty document position list
}
while (root->pi[i][j]) j++; // find end of instruction list for this target
root->pi[i] = realloc(root->pi[i], sizeof(char *) * (j + 3));
root->pi[i][j + 2] = realloc(root->pi[i][j + 1], j + 1);
strcpy(root->pi[i][j + 2] + j - 1, (root->xml.name) ? ">" : "<");
root->pi[i][j + 1] = NULL; // null terminate pi list for this target
root->pi[i][j] = s; // set instruction
}
// Called when the parser finds an internal doctype subset.
//
// root - parser state; entity definitions are appended to root->ent and
//        default attributes to root->attr
// s    - start of the subset text; it is modified in place (names and values
//        are NUL terminated inside the buffer and s[len] is overwritten)
// len  - number of bytes in the subset
//
// Handles <!ENTITY ...>, <!ATTLIST ...>, comments and processing instructions;
// every other declaration is skipped. Returns non-zero when root->err is still
// empty after parsing, zero once an error has been recorded.
short ezxml_internal_dtd(ezxml_root_t root, char *s, size_t len)
{
    char q, *c, *t, *n = NULL, *v, **ent, **pe;
    int i, j;

    // private, growable list of parameter entities; starts as a copy of the
    // empty list
    pe = memcpy(malloc(sizeof(EZXML_NIL)), EZXML_NIL, sizeof(EZXML_NIL));

    for (s[len] = '\0'; s; ) {
        while (*s && *s != '<' && *s != '%') s++; // find next declaration

        if (! *s) break;
        else if (! strncmp(s, "<!ENTITY", 8)) { // parse entity definitions
            c = s += strspn(s + 8, EZXML_WS) + 8; // skip white space separator
            n = s + strspn(s, EZXML_WS "%"); // find name
            s = n + strcspn(n, EZXML_WS); // first character after the name
            if (! *s) { // name runs into the terminator: nothing may be
                        // written here without destroying the end marker
                ezxml_err(root, n, "unclosed <!ENTITY");
                break;
            }
            *s = ';'; // append ; to name

            v = s + strspn(s + 1, EZXML_WS) + 1; // find value
            if ((q = *(v++)) != '"' && q != '\'') { // skip externals
                s = strchr(s, '>');
                continue;
            }

            // pick the list this entity belongs to and find its end
            for (i = 0, ent = (*c == '%') ? pe : root->ent; ent[i]; i++);
            ent = realloc(ent, (i + 3) * sizeof(char *)); // space for next ent
            if (*c == '%') pe = ent;
            else root->ent = ent;

            *(++s) = '\0'; // null terminate name
            if ((s = strchr(v, q))) *(s++) = '\0'; // null terminate value
            ent[i + 1] = ezxml_decode(v, pe, '%'); // set value
            ent[i + 2] = NULL; // null terminate entity list
            if (! ezxml_ent_ok(n, ent[i + 1], ent)) { // circular reference
                if (ent[i + 1] != v) free(ent[i + 1]);
                ezxml_err(root, v, "circular entity declaration &%s", n);
                break;
            }
            else ent[i] = n; // set entity name
        }
        else if (! strncmp(s, "<!ATTLIST", 9)) { // parse default attributes
            t = s + strspn(s + 9, EZXML_WS) + 9; // skip whitespace separator
            if (! *t) { ezxml_err(root, t, "unclosed <!ATTLIST"); break; }
            if (*(s = t + strcspn(t, EZXML_WS ">")) == '>') continue;
            else *s = '\0'; // null terminate tag name

            // Look the tag up by its own name. (Comparing against n, the last
            // entity/attribute name, dereferenced NULL when no entity had been
            // declared yet and never matched the stored tag names.)
            for (i = 0; root->attr[i] && strcmp(t, root->attr[i][0]); i++);

            while (*(n = ++s + strspn(s, EZXML_WS)) && *n != '>') {
                if (*(s = n + strcspn(n, EZXML_WS))) *s = '\0'; // attr name
                else { ezxml_err(root, t, "malformed <!ATTLIST"); break; }

                s += strspn(s + 1, EZXML_WS) + 1; // find next token
                c = (strncmp(s, "CDATA", 5)) ? "*" : " "; // is it cdata?
                if (! strncmp(s, "NOTATION", 8))
                    s += strspn(s + 8, EZXML_WS) + 8;
                s = (*s == '(') ? strchr(s, ')') : s + strcspn(s, EZXML_WS);
                if (! s) { ezxml_err(root, t, "malformed <!ATTLIST"); break; }

                s += strspn(s, EZXML_WS ")"); // skip white space separator
                if (! strncmp(s, "#FIXED", 6))
                    s += strspn(s + 6, EZXML_WS) + 6;
                if (*s == '#') { // no default value
                    s += strcspn(s, EZXML_WS ">") - 1;
                    if (*c == ' ') continue; // cdata is default, nothing to do
                    v = NULL;
                }
                else if ((*s == '"' || *s == '\'') && // default value
                         (s = strchr(v = s + 1, *s))) *s = '\0';
                else { ezxml_err(root, t, "malformed <!ATTLIST"); break; }

                if (! root->attr[i]) { // new tag name
                    root->attr = (! i) ? malloc(2 * sizeof(char **))
                                       : realloc(root->attr,
                                                 (i + 2) * sizeof(char **));
                    root->attr[i] = malloc(2 * sizeof(char *));
                    root->attr[i][0] = t; // set tag name
                    root->attr[i][1] = (char *)(root->attr[i + 1] = NULL);
                }

                // entries are stored as (name, default value, cdata flag)
                // triples following the tag name
                for (j = 1; root->attr[i][j]; j += 3); // find end of list
                root->attr[i] = realloc(root->attr[i],
                                        (j + 4) * sizeof(char *));

                root->attr[i][j + 3] = NULL; // null terminate list
                root->attr[i][j + 2] = c; // is it cdata?
                root->attr[i][j + 1] = (v) ? ezxml_decode(v, root->ent, *c)
                                           : NULL;
                root->attr[i][j] = n; // attribute name
            }
        }
        else if (! strncmp(s, "<!--", 4)) s = strstr(s + 4, "-->"); // comments
        else if (! strncmp(s, "<?", 2)) { // processing instructions
            if ((s = strstr(c = s + 2, "?>")))
                ezxml_proc_inst(root, c, s++ - c);
        }
        else if (*s == '<') s = strchr(s, '>'); // skip other declarations
        else if (*(s++) == '%' && ! root->standalone) break;
    }

    free(pe);
    return ! *root->err;
}
// Converts a UTF-16 string to UTF-8.
//
// s   - in: address of the input buffer; out: address of the converted buffer
// len - in: input length in bytes;       out: length of the converted data
//
// The byte order is chosen from the first input byte (0xFE selects big endian,
// 0xFF little endian); any other first byte means the input is not UTF-16.
//
// Returns a new buffer that must be freed (also stored in *s, with *len
// updated; the result is NOT NUL terminated), or NULL if no conversion was
// done. NULL is returned, with *s and *len left untouched, when the input is
// not UTF-16, when it holds no complete code unit after the byte order mark,
// or when memory cannot be allocated.
char *ezxml_str2utf8(char **s, size_t *len)
{
    char *u, *tmp;
    size_t l = 0, sl, max = *len;
    long c, d;
    int b, be = (**s == '\xFE') ? 1 : (**s == '\xFF') ? 0 : -1;

    if (be == -1) return NULL; // not UTF-16

    // Fewer than 4 bytes leaves nothing after the byte order mark, so the
    // result would be empty. Handing a zero-length buffer (or the NULL that
    // realloc(u, 0) may return) back to the caller is unsafe, so report
    // "nothing converted" instead.
    if (max < 4) return NULL;

    if (! (u = malloc(max))) return NULL;

    for (sl = 2; sl < *len - 1; sl += 2) {
        c = (be) ? (((*s)[sl] & 0xFF) << 8) | ((*s)[sl + 1] & 0xFF)  //UTF-16BE
                 : (((*s)[sl + 1] & 0xFF) << 8) | ((*s)[sl] & 0xFF); //UTF-16LE
        if (c >= 0xD800 && c <= 0xDFFF && (sl += 2) < *len - 1) { // high-half
            d = (be) ? (((*s)[sl] & 0xFF) << 8) | ((*s)[sl + 1] & 0xFF)
                     : (((*s)[sl + 1] & 0xFF) << 8) | ((*s)[sl] & 0xFF);
            c = (((c & 0x3FF) << 10) | (d & 0x3FF)) + 0x10000;
        }

        while (l + 6 > max) { // keep room for the longest possible sequence
            if (! (tmp = realloc(u, max + EZXML_BUFSIZE))) {
                free(u); // the old block is still ours on failure
                return NULL;
            }
            u = tmp;
            max += EZXML_BUFSIZE;
        }

        if (c < 0x80) u[l++] = (char)c; // US-ASCII subset
        else { // multi-byte UTF-8 sequence
            for (b = 0, d = c; d; d /= 2) b++; // bits in c
            b = (b - 2) / 5; // bytes in payload
            u[l++] = (char)((0xFF << (7 - b)) | (c >> (6 * b))); // head
            while (b) u[l++] = (char)(0x80 | ((c >> (6 * --b)) & 0x3F)); // payload
        }
    }

    // Shrink to fit; l >= 1 here. If shrinking fails the larger block is
    // still valid, so keep using it.
    if ((tmp = realloc(u, l))) u = tmp;
    *len = l;
    return *s = u;
}
// Releases an attribute list built by the parser.
//
// attr holds name/value pairs followed by a NULL; the slot right after that
// NULL points at a string with one flag byte per pair telling which names
// (EZXML_NAMEM) and values (EZXML_TXTM) were malloced. A NULL list and the
// shared empty list EZXML_NIL are left alone.
void ezxml_free_attr(char **attr) {
    char **end;
    char *flags;
    int pair;

    if (attr == NULL || attr == EZXML_NIL) return; // nothing to free

    end = attr;
    while (*end != NULL) end += 2; // step over every name/value pair
    flags = end[1]; // per-pair allocation flags live behind the terminator

    for (pair = 0; flags[pair] != '\0'; pair++) {
        char **entry = attr + (pair * 2);

        if (flags[pair] & EZXML_NAMEM) free(entry[0]); // malloced name
        if (flags[pair] & EZXML_TXTM) free(entry[1]);  // malloced value
    }

    free(flags);
    free(attr);
}
// Parse the given xml string and return an ezxml structure.
//
// s   - buffer holding the document; it is modified in place (tag names,
//       attribute names and values are NUL terminated inside it and its last
//       byte is replaced by a terminator)
// len - number of bytes in s
//
// UTF-16 input is converted first via ezxml_str2utf8(); the converted copy is
// remembered in root->u and the caller's buffer in root->m. Always returns a
// structure: either &root->xml or the value of ezxml_err(), which records the
// error message on the root.
ezxml_t ezxml_parse_str(char *s, size_t len)
{
    ezxml_root_t root = (ezxml_root_t)ezxml_new(NULL);
    char q, e, *d, *dtd_end, **attr, **a = NULL; // initialize a to avoid compile warning
    int l, i, j;

    root->m = s;
    if (! len) return ezxml_err(root, s, "root tag missing");
    root->u = ezxml_str2utf8(&s, &len); // convert utf-16 to utf-8
    root->s = s; // record start of work area
    // the conversion may hand back nothing usable; s[len - 1] below would
    // then dereference NULL or index before the buffer
    if (! s || ! len) return ezxml_err(root, s, "root tag missing");
    root->e = s + len; // record end of work area

    e = s[len - 1]; // save end char
    s[len - 1] = '\0'; // turn end char into null terminator

    while (*s && *s != '<') s++; // find first tag
    if (! *s) return ezxml_err(root, s, "root tag missing");

    for (; ; ) {
        attr = (char **)EZXML_NIL;
        d = ++s;

        // ctype functions need an unsigned char value; a plain char may be
        // negative for bytes >= 0x80
        if (isalpha((unsigned char)*s) || *s == '_' || *s == ':') { // new tag
            if (! root->cur)
                return ezxml_err(root, d, "markup outside of root element");

            s += strcspn(s, EZXML_WS "/>");
            while (isspace((unsigned char)*s)) *(s++) = '\0'; // null terminate tag name

            if (*s && *s != '/' && *s != '>') // find tag in default attr list
                for (i = 0; (a = root->attr[i]) && strcmp(a[0], d); i++);

            for (l = 0; *s && *s != '/' && *s != '>'; l += 2) { // new attrib
                attr = (l) ? realloc(attr, (l + 4) * sizeof(char *))
                           : malloc(4 * sizeof(char *)); // allocate space
                attr[l + 3] = (l) ? realloc(attr[l + 1], (l / 2) + 2)
                                  : malloc(2); // mem for list of maloced vals
                strcpy(attr[l + 3] + (l / 2), " "); // value is not malloced
                attr[l + 2] = NULL; // null terminate list
                attr[l + 1] = ""; // temporary attribute value
                attr[l] = s; // set attribute name

                s += strcspn(s, EZXML_WS "=/>");
                if (*s == '=' || isspace((unsigned char)*s)) {
                    *(s++) = '\0'; // null terminate tag attribute name
                    q = *(s += strspn(s, EZXML_WS "="));
                    if (q == '"' || q == '\'') { // attribute value
                        attr[l + 1] = ++s;
                        while (*s && *s != q) s++;
                        if (*s) *(s++) = '\0'; // null terminate attribute val
                        else {
                            ezxml_free_attr(attr);
                            return ezxml_err(root, d, "missing %c", q);
                        }

                        // look for a default-attribute entry of the same name
                        // to learn whether the value is cdata
                        for (j = 1; a && a[j] && strcmp(a[j], attr[l]); j +=3);
                        attr[l + 1] = ezxml_decode(attr[l + 1], root->ent, (a
                                                   && a[j]) ? *a[j + 2] : ' ');
                        // a result outside the tag text means decode returned
                        // a new allocation
                        if (attr[l + 1] < d || attr[l + 1] > s)
                            attr[l + 3][l / 2] = EZXML_TXTM; // value malloced
                    }
                }
                while (isspace((unsigned char)*s)) s++;
            }

            if (*s == '/') { // self closing tag
                *(s++) = '\0';
                if ((*s && *s != '>') || (! *s && e != '>')) {
                    if (l) ezxml_free_attr(attr);
                    return ezxml_err(root, d, "missing >");
                }
                ezxml_open_tag(root, d, attr);
                ezxml_close_tag(root, d, s);
            }
            else if ((q = *s) == '>' || (! *s && e == '>')) { // open tag
                *s = '\0'; // temporarily null terminate tag name
                ezxml_open_tag(root, d, attr);
                *s = q;
            }
            else {
                if (l) ezxml_free_attr(attr);
                return ezxml_err(root, d, "missing >");
            }
        }
        else if (*s == '/') { // close tag
            s += strcspn(d = s + 1, EZXML_WS ">") + 1;
            if (! (q = *s) && e != '>') return ezxml_err(root, d, "missing >");
            *s = '\0'; // temporarily null terminate tag name
            if (ezxml_close_tag(root, d, s)) return &root->xml;
            if (isspace((unsigned char)(*s = q))) s += strspn(s, EZXML_WS);
        }
        else if (! strncmp(s, "!--", 3)) { // comment
            if (! (s = strstr(s + 3, "--")) || (*(s += 2) != '>' && *s) ||
                (! *s && e != '>')) return ezxml_err(root, d, "unclosed <!--");
        }
        else if (! strncmp(s, "![CDATA[", 8)) { // cdata
            if ((s = strstr(s, "]]>")))
                ezxml_char_content(root, d + 8, (s += 2) - d - 10, 'c');
            else return ezxml_err(root, d, "unclosed <![CDATA[");
        }
        else if (! strncmp(s, "!DOCTYPE", 8)) { // dtd
            // l becomes 1 once an internal subset '[' has been seen; the scan
            // stops at the closing '>' (or "]" + optional space + ">")
            for (l = 0; *s && ((! l && *s != '>') || (l && (*s != ']' ||
                 *(s + strspn(s + 1, EZXML_WS) + 1) != '>')));
                 l = (*s == '[') ? 1 : l) s += strcspn(s + 1, "[]>") + 1;
            if (! *s && e != '>')
                return ezxml_err(root, d, "unclosed <!DOCTYPE");
            d = (l) ? strchr(d, '[') + 1 : d;
            if (l) {
                dtd_end = s; // the subset ends here; the callee overwrites it
                // step past the end of the subset, but never past the buffer
                // terminator: the checks after this branch read and write *s
                if (*s) s++;
                if (! ezxml_internal_dtd(root, d, dtd_end - d))
                    return &root->xml;
            }
        }
        else if (*s == '?') { // <?...?> processing instructions
            do { s = strchr(s, '?'); } while (s && *(++s) && *s != '>');
            if (! s || (! *s && e != '>'))
                return ezxml_err(root, d, "unclosed <?");
            else ezxml_proc_inst(root, d + 1, s - d - 2);
        }
        else return ezxml_err(root, d, "unexpected <");

        if (! s || ! *s) break;
        *s = '\0';
        d = ++s;
        if (*s && *s != '<') { // tag character content
            while (*s && *s != '<') s++;
            if (*s) ezxml_char_content(root, d, s - d, '&');
            else break;
        }
        else if (! *s) break;
    }

    if (! root->cur) return &root->xml;
    else if (! root->cur->name) return ezxml_err(root, d, "root tag missing");
    else return ezxml_err(root, d, "unclosed tag <%s>", root->cur->name);
}
#ifdef CYGPKG_IO_FILEIO
// Wrapper for ezxml_parse_str() that accepts a file stream. Reads the entire
// stream into memory and then parses it. For xml files, use ezxml_parse_file()
// or ezxml_parse_fd()
//
// The stream is read in EZXML_BUFSIZE sized pieces until a short read occurs.
// Returns NULL if memory for the buffer cannot be obtained; otherwise returns
// the structure produced by ezxml_parse_str(), with root->len set to -1 to
// mark the buffer as heap allocated.
ezxml_t ezxml_parse_fp(FILE *fp)
{
    ezxml_root_t root;
    size_t l, len = 0;
    char *s, *grown;

    if (! (s = malloc(EZXML_BUFSIZE))) return NULL;
    do {
        len += (l = fread((s + len), 1, EZXML_BUFSIZE, fp));
        if (l == EZXML_BUFSIZE) { // buffer is full, make room for more
            grown = realloc(s, len + EZXML_BUFSIZE);
            if (! grown) {
                free(s); // realloc leaves the old block allocated on failure
                return NULL;
            }
            s = grown;
        }
    } while (l == EZXML_BUFSIZE);

    root = (ezxml_root_t)ezxml_parse_str(s, len);
    root->len = -1; // so we know to free s in ezxml_free()
    return &root->xml;
}
// A wrapper for ezxml_parse_str() that accepts a file descriptor. First
// attempts to mem map the file. Failing that, reads the file into memory.
// Returns NULL on failure.
ezxml_t ezxml_parse_fd(int fd)
{
ezxml_root_t root;
struct stat st;
size_t l;
void *m;
if (fd < 0) return NULL;
fstat(fd, &st);
#ifndef EZXML_NOMMAP
l = (st.st_size + sysconf(_SC_PAGESIZE) - 1) & ~(sysconf(_SC_PAGESIZE) -1);
if ((m = mmap(NULL, l, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0)) !=
MAP_FAILED) {
madvise(m, l, MADV_SEQUENTIAL); // optimize for sequential access
root = (ezxml_root_t)ezxml_parse_str(m, st.st_size);
madvise(m, root->len = l, MADV_NORMAL); // put it back to normal
}
else { // mmap failed, read file into memory
#endif // EZXML_NOMMAP
l = read(fd, m = malloc(st.st_size), st.st_size);
root = (ezxml_root_t)ezxml_parse_str(m, l);
root->len = -1; // so we know to free s in ezxml_free()
#ifndef EZXML_NOMMAP
}
#endif // EZXML_NOMMAP
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -