📄 parsersax.tcc
字号:
#include <cctype>//#define DEBUG//#include "debug.h"staticstruct literal_tag { int len; char* str; int is_cdata;} literal_mode_elem[] ={ {6, "script", 1}, {5, "style", 1}, {3, "xmp", 1}, {9, "plaintext", 1}, {8, "textarea", 0}, {0, 0, 0}};template <typename _Iterator>void htmlcxx::HTML::ParserSax::parse(_Iterator begin, _Iterator end){// std::cerr << "Parsing iterator" << std::endl; parse(begin, end, typename std::iterator_traits<_Iterator>::iterator_category());}template <typename _Iterator>void htmlcxx::HTML::ParserSax::parse(_Iterator &begin, _Iterator &end, std::forward_iterator_tag){ typedef _Iterator iterator;// std::cerr << "Parsing forward_iterator" << std::endl; mCdata = false; mpLiteral = 0; mCurrentOffset = 0; this->beginParsing();// DEBUGP("Parsed text\n"); while (begin != end) { *begin; // This is for the multi_pass to release the buffer iterator c(begin); while (c != end) { // For some tags, the text inside it is considered literal and is // only closed for its </TAG> counterpart while (mpLiteral) {// DEBUGP("Treating literal %s\n", mpLiteral); while (c != end && *c != '<') ++c; if (c == end) { if (c != begin) this->parseContent(begin, c); goto DONE; } iterator end_text(c); ++c; if (*c == '/') { ++c; const char *l = mpLiteral; while (*l && ::tolower(*c) == *l) { ++c; ++l; } // FIXME: Mozilla stops when it sees a /plaintext. Check // other browsers and decide what to do if (!*l && strcmp(mpLiteral, "plaintext")) { // matched all and is not tag plaintext while (isspace(*c)) ++c; if (*c == '>') { ++c; if (begin != end_text) this->parseContent(begin, end_text); mpLiteral = 0; c = end_text; begin = c; break; } } } else if (*c == '!') { // we may find a comment and we should support it iterator e(c); ++e; if (e != end && *e == '-' && ++e != end && *e == '-') {// DEBUGP("Parsing comment\n"); ++e; c = this->skipHtmlComment(e, end); } //if (begin != end_text) //this->parseContent(begin, end_text, end); //this->parseComment(end_text, c, end); // continue from the end of the comment //begin = c; } } if (*c == '<') { iterator d(c); ++d; if (d != end) { if (isalpha(*d)) { // beginning of tag if (begin != c) this->parseContent(begin, c);// DEBUGP("Parsing beginning of tag\n"); d = this->skipHtmlTag(d, end); this->parseHtmlTag(c, d); // continue from the end of the tag c = d; begin = c; break; } if (*d == '/') { if (begin != c) this->parseContent(begin, c); iterator e(d); ++e; if (e != end && isalpha(*e)) { // end of tag// DEBUGP("Parsing end of tag\n"); d = this->skipHtmlTag(d, end); this->parseHtmlTag(c, d); } else { // not a conforming end of tag, treat as comment // as Mozilla does// DEBUGP("Non conforming end of tag\n"); d = this->skipHtmlTag(d, end); this->parseComment(c, d); } // continue from the end of the tag c = d; begin = c; break; } if (*d == '!') { // comment if (begin != c) this->parseContent(begin, c); iterator e(d); ++e; if (e != end && *e == '-' && ++e != end && *e == '-') {// DEBUGP("Parsing comment\n"); ++e; d = this->skipHtmlComment(e, end); } else { d = this->skipHtmlTag(d, end); } this->parseComment(c, d); // continue from the end of the comment c = d; begin = c; break; } if (*d == '?' || *d == '%') { // something like <?xml or <%VBSCRIPT if (begin != c) this->parseContent(begin, c); d = this->skipHtmlTag(d, end); this->parseComment(c, d); // continue from the end of the comment c = d; begin = c; break; } } } c++; } // There may be some text in the end of the document if (begin != c) { this->parseContent(begin, c); begin = c; } }DONE: this->endParsing(); return;}template <typename _Iterator>void htmlcxx::HTML::ParserSax::parseComment(_Iterator b, _Iterator c){// DEBUGP("Creating comment node %s\n", std::string(b, c).c_str()); htmlcxx::HTML::Node com_node; //FIXME: set_tagname shouldn't be needed, but first I must check //legacy code std::string comment(b, c); com_node.tagName(comment); com_node.text(comment); com_node.offset(mCurrentOffset); com_node.length((unsigned int)comment.length()); com_node.isTag(false); com_node.isComment(true); mCurrentOffset += com_node.length(); // Call callback method this->foundComment(com_node);}template <typename _Iterator>void htmlcxx::HTML::ParserSax::parseContent(_Iterator b, _Iterator c){// DEBUGP("Creating text node %s\n", (std::string(b, c)).c_str()); htmlcxx::HTML::Node txt_node; //FIXME: set_tagname shouldn't be needed, but first I must check //legacy code std::string text(b, c); txt_node.tagName(text); txt_node.text(text); txt_node.offset(mCurrentOffset); txt_node.length((unsigned int)text.length()); txt_node.isTag(false); txt_node.isComment(false); mCurrentOffset += txt_node.length(); // Call callback method this->foundText(txt_node);}template <typename _Iterator>void htmlcxx::HTML::ParserSax::parseHtmlTag(_Iterator b, _Iterator c){ _Iterator name_begin(b); ++name_begin; bool is_end_tag = (*name_begin == '/'); if (is_end_tag) ++name_begin; _Iterator name_end(name_begin); while (name_end != c && isalnum(*name_end)) { ++name_end; } std::string name(name_begin, name_end);// DEBUGP("Found %s tag %s\n", is_end_tag ? "closing" : "opening", name.c_str()); if (!is_end_tag) { std::string::size_type tag_len = name.length(); for (int i = 0; literal_mode_elem[i].len; ++i) { if (tag_len == literal_mode_elem[i].len) { #ifdef WIN32 if (!_stricmp(name.c_str(), literal_mode_elem[i].str)) #else if (!strcasecmp(name.c_str(), literal_mode_elem[i].str)) #endif { mpLiteral = literal_mode_elem[i].str; break; } } } } htmlcxx::HTML::Node tag_node; //by now, length is just the size of the tag std::string text(b, c); tag_node.length(static_cast<unsigned int>(text.length())); tag_node.tagName(name); tag_node.text(text); tag_node.offset(mCurrentOffset); tag_node.isTag(true); tag_node.isComment(false); mCurrentOffset += tag_node.length(); this->foundTag(tag_node, is_end_tag);}template <typename _Iterator>_Iteratorhtmlcxx::HTML::ParserSax::skipHtmlComment(_Iterator c, _Iterator end){ while ( c != end ) { if (*c++ == '-' && c != end && *c == '-') { _Iterator d(c); while (++c != end && isspace(*c)); if (c == end || *c++ == '>') break; c = d; } } return c;}namespace htmlcxx { namespace HTML {template <typename _Iterator>static inline_Iterator find_next_quote(_Iterator c, _Iterator end, char quote){// std::cerr << "generic find" << std::endl; while (c != end && *c != quote) ++c; return c;}template <>static inlineconst char *find_next_quote(const char *c, const char *end, char quote){// std::cerr << "fast find" << std::endl; const char *d = reinterpret_cast<const char*>(memchr(c, quote, end - c)); if (d) return d; else return end;}}}template <typename _Iterator>_Iterator htmlcxx::HTML::ParserSax::skipHtmlTag(_Iterator c, _Iterator end){ while (c != end && *c != '>') { if (*c != '=') { ++c; } else { // found an attribute ++c; while (c != end && isspace(*c)) ++c; if (c == end) break; if (*c == '\"' || *c == '\'') { _Iterator save(c); char quote = *c++; c = find_next_quote(c, end, quote);// while (c != end && *c != quote) ++c;// c = static_cast<char*>(memchr(c, quote, end - c)); if (c != end) { ++c; } else { c = save; ++c; }// DEBUGP("Quotes: %s\n", std::string(save, c).c_str()); } } } if (c != end) ++c; return c;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -