📄 sgml.c
字号:
/*			General SGML Parser code		SGML.c
**			========================
**
**	This module implements an HTStream object.  To parse an
**	SGML file, create this object which is a parser.  The object
**	is (currently) created by being passed a DTD structure,
**	and a target HTStructured object at which to throw the parsed stuff.
**
**	 6 Feb 93  Binary searches used.  Interface modified.
*/

#include <HTUtils.h>

/* Remove the following to disable the experimental HTML DTD parsing.
   Currently only used in this source file. - kw */

#ifndef NO_EXTENDED_HTMLDTD
#define EXTENDED_HTMLDTD
#endif

#include <SGML.h>
#include <HTMLDTD.h>
#include <HTCJK.h>
#include <UCMap.h>
#include <UCDefs.h>
#include <UCAux.h>

#include <HTChunk.h>

#include <LYCharSets.h>
#include <LYCharVals.h>  /* S/390 -- gil -- 0635 */
#include <LYGlobalDefs.h>
#include <LYStrings.h>
#include <LYLeaks.h>

#ifdef USE_COLOR_STYLE
# include <LYStyle.h>
#endif
#ifdef USE_PRETTYSRC
# include <LYPrettySrc.h>
#endif

#define INVALID (-1)

#ifdef USE_PRETTYSRC
char* entity_string; /* this is used for printing entity name.
    Unconditionally added since redundant assignments don't hurt much */

/*
**  A put_character-shaped function that ignores both of its arguments
**  and does nothing.
*/
PRIVATE void fake_put_character ARGS2(
	void*,		p GCC_UNUSED,
	char,		c GCC_UNUSED)
{
}

#define START TRUE
#define STOP FALSE

/*
**  Set psrc_convert_string and then emit the string through PUTS().
**  Written as a single comma expression so that the macro is one
**  statement: under an unbraced `if' both the assignment and the call
**  are governed by the condition.
*/
#define PUTS_TR(x) (psrc_convert_string = TRUE, PUTS(x))

#endif

/* my_casecomp() - optimized by the first character, NOT_ASCII ok */
/* Note: each argument is evaluated more than once; pass only
   side-effect-free expressions. */
#define my_casecomp(a,b)  ((TOUPPER(*(a)) == TOUPPER(*(b))) ? \
			AS_casecomp((a),(b)) : \
			(TOASCII(TOUPPER(*(a))) - TOASCII(TOUPPER(*(b)))))

#if ANSI_PREPRO
/* will use partially inlined version */
#define orig_HTChunkPutUtf8Char HTChunkPutUtf8Char
#undef HTChunkPutUtf8Char

/* ...used for comments and attributes value like href... */
/*
**  Fast path: when x is below 128 (after TOASCII) and the chunk still has
**  room, the byte is stored directly; otherwise the real function is
**  called.  orig_HTChunkPutUtf8Char expands to the name HTChunkPutUtf8Char,
**  which is not re-expanded inside its own expansion, so the call reaches
**  the function.  Both arguments are evaluated more than once.
**  The bare-brace form is kept as it was, so uses of this macro under an
**  unbraced if/else followed by `;' remain a hazard.
*/
#define HTChunkPutUtf8Char(ch,x) \
    { \
    if ((TOASCII(x) < 128) && ((ch)->size < (ch)->allocated)) \
	(ch)->data[(ch)->size++] = (char)(x); \
    else \
	orig_HTChunkPutUtf8Char(ch,x); \
    }

#if 0
#define orig_HTChunkPutc HTChunkPutc
#undef HTChunkPutc

#define HTChunkPutc(ch,x) \
    { \
    if (ch->size < ch->allocated) \
	ch->data[ch->size++] = x; \
    else \
	orig_HTChunkPutc(ch,x); \
    }

#undef HTChunkTerminate

#define HTChunkTerminate(ch) \
    HTChunkPutc(ch, (char)0)

#endif /* 0 */

#endif /* ANSI_PREPRO */

/*
**  Output helpers.  All three expect a variable named `context'
**  (an HTStream *) to be in scope at the point of use.
*/
#define PUTS(str) ((*context->actions->put_string)(context->target, (str)))
#define PUTC(ch)  ((*context->actions->put_character)(context->target, (ch)))
#define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)context->target, \
		      (putc_func_t*)(context->actions->put_character), (code)))

#define OPT 1

/*the following macros are used for pretty source view. */
#define IS_C(attr) ((attr).type == HTMLA_CLASS)

PUBLIC HTCJKlang HTCJK = NOCJK;		/* CJK enum value.		*/
PUBLIC BOOL HTPassEightBitRaw = FALSE;	/* Pass 161-172,174-255 raw.	*/
PUBLIC BOOL HTPassEightBitNum = FALSE;	/* Pass ^ numeric entities raw. */
PUBLIC BOOL HTPassHighCtrlRaw = FALSE;	/* Pass 127-160,173,&#127; raw. */
PUBLIC BOOL HTPassHighCtrlNum = FALSE;	/* Pass &#128;-&#159; raw.	*/

/*	The State (context) of the parser
**
**	This is passed with each call to make the parser reentrant
**
*/

#define MAX_ATTRIBUTES 36	/* Max number of attributes per element */


/*		Element Stack
**		-------------
**	This allows us to return down the stack reselecting styles.
**	As we return, attribute values will be garbage in general.
*/
typedef struct _HTElement HTElement;
struct _HTElement {
    HTElement *	next;	/* Previously nested element or 0 */
    HTTag*	tag;	/* The tag at this level  */
};

/*
**  Parser states.  S_text is explicitly 0; the remaining enumerators
**  take consecutive values in the order listed, so the order must not
**  be changed casually.
*/
typedef enum {
    S_text = 0
    ,S_attr
    ,S_attr_gap
    ,S_comment
    ,S_cro
    ,S_doctype
    ,S_dollar
    ,S_dollar_dq
    ,S_dollar_paren
    ,S_dollar_paren_dq
    ,S_dollar_paren_sq
    ,S_dollar_sq
    ,S_dquoted
    ,S_end
    ,S_entity
    ,S_equals
    ,S_ero
    ,S_esc
    ,S_esc_dq
    ,S_esc_sq
    ,S_exclamation
    ,S_in_kanji
    ,S_incro
    ,S_junk_pi
    ,S_junk_tag
    ,S_litteral
    ,S_marked
    ,S_nonascii_text
    ,S_nonascii_text_dq
    ,S_nonascii_text_sq
    ,S_paren
    ,S_paren_dq
    ,S_paren_sq
    ,S_pcdata
    ,S_script
    ,S_sgmlatt
    ,S_sgmlele
    ,S_sgmlent
    ,S_squoted
    ,S_tag
    ,S_tag_gap
    ,S_tagname_slash
    ,S_value
} sgml_state;


/*	Internal Context Data Structure
**	-------------------------------
*/
struct _HTStream {

    CONST HTStreamClass *	isa;		/* inherited from HTStream */

    CONST SGML_dtd		*dtd;
    CONST HTStructuredClass	*actions;	/* target class  */
    HTStructured		*target;	/* target object */

    HTTag			*current_tag;
    HTTag			*slashedtag;
    CONST HTTag			*unknown_tag;
    BOOL			inSELECT;
    BOOL			no_lynx_specialcodes;
    int				current_attribute_number;
    HTChunk			*string;
    int				leading_spaces;
    int				trailing_spaces;
    HTElement			*element_stack;
    sgml_state			state;
    unsigned char		kanji_buf;
#ifdef CALLERDATA
    void *			callerData;
#endif /* CALLERDATA */
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    char * value[MAX_ATTRIBUTES];	/* NULL, or strings alloc'd with StrAllocCopy_extra() */

    BOOL			lead_exclamation;
    BOOL			first_dash;
    BOOL			end_comment;
    BOOL			doctype_bracket;
    BOOL			first_bracket;
    BOOL			second_bracket;
    BOOL			isHex;

    HTParentAnchor *		node_anchor;
    LYUCcharset *		inUCI;		/* pointer to anchor UCInfo */
    int				inUCLYhndl;	/* charset we are fed	    */
    LYUCcharset *		outUCI;		/* anchor UCInfo for target */
    int				outUCLYhndl;	/* charset for target	    */
    char			utf_count;
    UCode_t			utf_char;
    char			utf_buf[8];
    char *			utf_buf_p;
    UCTransParams		T;		/* set by UCSetTransParams() */
    int				current_tag_charset; /* charset to pass attributes */

    char *			recover;
    int				recover_index;
    char *			include;
    char *			active_include;
    int				include_index;
    char *			url;
    char *			csi;
    int				csi_index;
#ifdef USE_PRETTYSRC
    BOOL			cur_attr_is_href;
    BOOL			cur_attr_is_name;
    BOOL			seen_nonwhite_in_junk_tag;
#endif
};

#ifndef NO_LYNX_TRACE
/*
**  Return the enumerator name of parser state `n' as a string.
**  Compiled only when NO_LYNX_TRACE is not defined.
**
**  Returns a pointer to a string literal (do not modify or free it);
**  "?" is returned for any value that is not an sgml_state enumerator.
*/
PRIVATE char *state_name ARGS1(sgml_state, n)
{
    char *result = "?";

    switch (n) {
    case S_attr:		result = "S_attr";		break;
    case S_attr_gap:		result = "S_attr_gap";		break;
    case S_comment:		result = "S_comment";		break;
    case S_cro:			result = "S_cro";		break;
    case S_doctype:		result = "S_doctype";		break;
    case S_dollar:		result = "S_dollar";		break;
    case S_dollar_dq:		result = "S_dollar_dq";		break;
    case S_dollar_paren:	result = "S_dollar_paren";	break;
    case S_dollar_paren_dq:	result = "S_dollar_paren_dq";	break;
    case S_dollar_paren_sq:	result = "S_dollar_paren_sq";	break;
    case S_dollar_sq:		result = "S_dollar_sq";		break;
    case S_dquoted:		result = "S_dquoted";		break;
    case S_end:			result = "S_end";		break;
    case S_entity:		result = "S_entity";		break;
    case S_equals:		result = "S_equals";		break;
    case S_ero:			result = "S_ero";		break;
    case S_esc:			result = "S_esc";		break;
    case S_esc_dq:		result = "S_esc_dq";		break;
    case S_esc_sq:		result = "S_esc_sq";		break;
    case S_exclamation:		result = "S_exclamation";	break;
    case S_in_kanji:		result = "S_in_kanji";		break;
    case S_incro:		result = "S_incro";		break;
    case S_junk_pi:		result = "S_junk_pi";		break;
    case S_junk_tag:		result = "S_junk_tag";		break;
    case S_litteral:		result = "S_litteral";		break;
    case S_marked:		result = "S_marked";		break;
    case S_nonascii_text:	result = "S_nonascii_text";	break;
    case S_nonascii_text_dq:	result = "S_nonascii_text_dq";	break;
    case S_nonascii_text_sq:	result = "S_nonascii_text_sq";	break;
    case S_paren:		result = "S_paren";		break;
    case S_paren_dq:		result = "S_paren_dq";		break;
    case S_paren_sq:		result = "S_paren_sq";		break;
    case S_pcdata:		result = "S_pcdata";		break;
    case S_script:		result = "S_script";		break;
    case S_sgmlatt:		result = "S_sgmlatt";		break;
    case S_sgmlele:		result = "S_sgmlele";		break;
    case S_sgmlent:		result = "S_sgmlent";		break;
    case S_squoted:		result = "S_squoted";		break;
    case S_tag:			result = "S_tag";		break;
    case S_tag_gap:		result = "S_tag_gap";		break;
    case S_tagname_slash:	result = "S_tagname_slash";	break;
    case S_text:		result = "S_text";		break;
    case S_value:		result = "S_value";		break;
    }
    return result;
}
#endif

/* storage for Element Stack */
/*
**  The first DEPTH elements are handed out from the static array `pool';
**  anything deeper comes from malloc().  pool_free() decides between
**  "static slot" and "heap block" purely from the current depth, so
**  elements must be released in the reverse order of their allocation.
**
**  NOTE(review): `pool' and `depth' are file-static, i.e. shared by every
**  parser instance in the process - confirm that only one element stack
**  is ever live at a time.
*/
#define DEPTH 10
static HTElement pool[DEPTH];
static int depth = 0;

/*
**  Obtain storage for one more element-stack entry.
**
**  Returns a pointer to an uninitialised HTElement, or NULL if the pool
**  is exhausted and malloc() fails.  On failure `depth' is left
**  unchanged, so the bookkeeping used by pool_free() stays consistent
**  with the number of entries actually handed out.
*/
PRIVATE HTElement* pool_alloc NOARGS
{
    HTElement *e;

    if (depth < DEPTH)
	return (pool + depth++);

    e = (HTElement*) malloc(sizeof(HTElement));
    if (e != NULL)
	depth++;
    return e;
}

/*
**  Release the most recently allocated element-stack entry.
**
**  `e' is passed to FREE() only when the current depth exceeds DEPTH;
**  otherwise it is taken to be a slot of the static pool and nothing is
**  freed.  In both cases the depth counter is decremented.
*/
PRIVATE void pool_free ARGS1(HTElement*, e)
{
    if (depth > DEPTH)
	FREE(e);
    depth--;
    return;
}

#ifdef USE_PRETTYSRC

/*
**  Emit the markup configured for one source-view lexeme.
**
**  context	the parser stream; its target receives the calls.
**  lexeme	index into the lexeme_start / lexeme_end tables.
**  start	TRUE selects the lexeme_start table, FALSE lexeme_end.
**
**  The selected table entry is a linked list of HT_tagspec; for each
**  node, in list order, the target's start_element or end_element action
**  is invoked according to the node's own `start' flag.
*/
PRIVATE void HTMLSRC_apply_markup ARGS3(
	HTStream *,	context,
	HTlexeme,	lexeme,
	BOOL,		start)
{
    HT_tagspec* ts = *( ( start ? lexeme_start : lexeme_end ) + lexeme);

    while (ts) {
#ifdef USE_COLOR_STYLE
	if (ts->start) {
	    /* Set the style and class name globals from this tagspec
	       before the start tag is issued. */
	    current_tag_style = ts->style;
	    force_current_tag_style = TRUE;
	    forced_classname = ts->class_name;
	    force_classname = TRUE;
	}
#endif
	CTRACE((tfp,ts->start ? "SRCSTART %d\n" : "SRCSTOP %d\n",(int)lexeme));
	if (ts->start)
	    (*context->actions->start_element)(
		context->target,
		ts->element,
		ts->present,
		(CONST char **)ts->value,
		context->current_tag_charset,
		(char **)&context->include);
	else
	    (*context->actions->end_element)(
		context->target,
		ts->element,
		(char **)&context->include);
	ts = ts->next;
    }
}

/*
**  PSRCSTART(x) / PSRCSTOP(x) apply the start / stop markup of lexeme
**  HTL_x.  The second pair is for preprocessors without the ## operator
**  and relies on an empty comment between the two tokens to paste them.
*/
#if ANSI_PREPRO
#  define PSRCSTART(x)	HTMLSRC_apply_markup(context,HTL_##x,START)
#  define PSRCSTOP(x)	HTMLSRC_apply_markup(context,HTL_##x,STOP)
#else
#  define PSRCSTART(x)	HTMLSRC_apply_markup(context,HTL_/**/x,START)
#  define PSRCSTOP(x)	HTMLSRC_apply_markup(context,HTL_/**/x,STOP)
#endif

#define attr_is_href context->cur_attr_is_href
#define attr_is_name context->cur_attr_is_name
#endif

/*
**  (Re)compute the character translation setup of the parser.
**
**  context	the parser stream whose T, current_tag_charset and
**		(when chndl < 0) outUCI / outUCLYhndl are updated.
**  anchor	anchor whose chartrans stages are consulted and set.
**  chndl	charset handle for the output side, or a negative value
**		if none is known yet.
**
**  When chndl is non-negative the out* fields of the context are not
**  touched here; the caller is expected to have set them.
*/
PRIVATE void set_chartrans_handling ARGS3(
	HTStream *,		context,
	HTParentAnchor *,	anchor,
	int,			chndl)
{
    if (chndl < 0) {
	/*
	**  Nothing was set for the parser in earlier stages,
	**  so the HTML parser's UCLYhndl should still be its
	**  default. - FM
	*/
	chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED);
	if (chndl < 0)
	    /*
	    **	That wasn't set either, so seek the HText default. - FM
	    */
	    chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
	if (chndl < 0)
	    /*
	    **	That wasn't set either, so assume the current display
	    **	character set. - FM
	    */
	    chndl = current_char_set;
	/*
	**  Try to set the HText and HTML stages' chartrans info
	**  with the default lock level (will not be changed if
	**  it was set previously with a higher lock level). - FM
	*/
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_HTEXT,
				UCT_SETBY_DEFAULT);
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_STRUCTURED,
				UCT_SETBY_DEFAULT);
	/*
	**  Get the chartrans info for output to the HTML parser. - FM
	**
	**  NOTE(review): outUCI is read from `anchor' but outUCLYhndl
	**  from context->node_anchor.  The two agree only when the caller
	**  passes context->node_anchor as `anchor' (as
	**  change_chartrans_handling() does) - confirm for other callers.
	*/
	context->outUCI = HTAnchor_getUCInfoStage(anchor,
						  UCT_STAGE_STRUCTURED);
	context->outUCLYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
						    UCT_STAGE_STRUCTURED);
    }
    /*
    **	Set the in->out transformation parameters. - FM
    */
    UCSetTransParams(&context->T,
		     context->inUCLYhndl, context->inUCI,
		     context->outUCLYhndl, context->outUCI);
    /*
    **	This is intended for passing the SGML parser's input
    **	charset as an argument in each call to the HTML
    **	parser's start tag function, but it would be better
    **	to call a Lynx_HTML_parser function to set an element
    **	in its HTStructured object, itself, if this were
    **	needed. - FM
    */
    if (HTCJK != NOCJK) {
	context->current_tag_charset = -1;
    } else if (context->T.transp) {
	context->current_tag_charset = context->inUCLYhndl;
    } else if (context->T.decode_utf8) {
	context->current_tag_charset = context->inUCLYhndl;
    } else if (context->T.do_8bitraw ||
	       context->T.use_raw_char_in) {
	context->current_tag_charset = context->inUCLYhndl;
    } else if (context->T.output_utf8 ||
	       context->T.trans_from_uni) {
	context->current_tag_charset = UCGetLYhndl_byMIME("utf-8");
    } else {
	context->current_tag_charset = LATIN1;
    }
}

/*
**  Pick up a change of the input charset recorded on the node anchor.
**
**  If the anchor's UCT_STAGE_PARSER handle is non-negative and differs
**  from the handle the parser is currently using, and the anchor has
**  UCInfo for that stage, the context's in* and out* fields are
**  refreshed from the anchor and set_chartrans_handling() is called to
**  recompute the translation parameters.  Otherwise nothing is changed.
*/
PRIVATE void change_chartrans_handling ARGS1(
	HTStream *,		context)
{
    int new_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
					  UCT_STAGE_PARSER);

    if (new_LYhndl != context->inUCLYhndl &&
	new_LYhndl >= 0) {
	/*
	 *  Something changed. but ignore if a META wants an unknown charset.
	 */
	LYUCcharset * new_UCI = HTAnchor_getUCInfoStage(context->node_anchor,
							UCT_STAGE_PARSER);
	if (new_UCI) {
	    LYUCcharset * next_UCI = HTAnchor_getUCInfoStage(
				    context->node_anchor, UCT_STAGE_STRUCTURED
							    );
	    int next_LYhndl = HTAnchor_getUCLYhndl(
				    context->node_anchor, UCT_STAGE_STRUCTURED
						  );

	    context->inUCI = new_UCI;
	    context->inUCLYhndl = new_LYhndl;
	    context->outUCI = next_UCI;
	    context->outUCLYhndl = next_LYhndl;
	    set_chartrans_handling(context,
				   context->node_anchor, next_LYhndl);
	}
    }
}

#ifdef USE_COLOR_STYLE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -