📄 sgml.c
字号:
/* SGML.c** GENERAL SGML PARSER CODE**** (c) COPYRIGHT MIT 1995.** Please first read the full copyright statement in the file COPYRIGH.** @(#) $Id: SGML.c,v 1.53 1999/05/18 21:38:57 frystyk Exp $**** This module implements an HTStream object. To parse an** SGML file, create this object which is a parser. The object** is (currently) created by being passed a DTD structure,** and a target HTStructured oject at which to throw the parsed stuff.** ** 6 Feb 93 Binary seraches used. Intreface modified.** 8 Jul 94 FM Insulate free() from _free structure element.** Nov 1996 msa Strip down the parser to minimal HTML tokenizer,** Stop allocating space for the attribute values,** use pointers to the string chunk instead.*/#include <assert.h>/* Library include files */#include "wwwsys.h"#include "HTUtils.h"#include "HTString.h"#include "HTChunk.h"#include "SGML.h"#define INVALID (-1)/* The State (context) of the parser**** This is passed with each call to make the parser reentrant***/typedef enum _sgml_state { S_text, S_literal, S_tag, S_tag_gap, S_attr, S_attr_gap, S_equals, S_value, S_after_open, S_nl, S_nl_tago, S_ero, S_cro,#ifdef ISO_2022_JP S_esc, S_dollar, S_paren, S_nonascii_text,#endif S_squoted, S_dquoted, S_end, S_entity, S_junk_tag, S_md, S_md_sqs, S_md_dqs, S_com_1, S_com, S_com_2, S_com_2a } sgml_state;/* Internal Context Data Structure** -------------------------------*/struct _HTStream { const HTStreamClass *isa; /* inherited from HTStream */ const SGML_dtd *dtd; HTStructuredClass *actions; /* target class */ HTStructured *target; /* target object */ HTTag *current_tag; int current_attribute_number; SGMLContent contents; /* current content mode */ HTChunk *string; int token; /* ptr into string buffer */ sgml_state state; BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */ int value[MAX_ATTRIBUTES]; /* Offset pointers to the string */ };#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))#define PUTB(b,l) ((*context->actions->put_block)(context->target, b, l))/* Find Attribute Number** ---------------------*/PRIVATE int SGMLFindAttribute (HTTag* tag, const char * s) { HTAttr* attributes = tag->attributes; int high, low, i, diff; /* Binary search for attribute name */ assert(tag->number_of_attributes <= MAX_ATTRIBUTES); for(low=0, high=tag->number_of_attributes; high > low ; diff < 0 ? (low = i+1) : (high = i) ) { i = (low + (high-low)/2); diff = strcasecomp(attributes[i].name, s); if (diff==0) return i; /* success: found it */ } return -1; }/* Handle Attribute** ----------------*//* PUBLIC const char * SGML_default = ""; ?? */PRIVATE void handle_attribute_name (HTStream * context, const char * s) { HTTag * tag = context->current_tag; /* Note: if tag==NULL, we are skipping unknown tag... */ if (tag) { int i = SGMLFindAttribute(tag, s); if (i >= 0) { context->current_attribute_number = i; context->present[i] = YES; return; } HTTRACE(SGML_TRACE, "Unknown attribute %s for tag %s\n" _ s _ context->current_tag->name); } context->current_attribute_number = INVALID; /* Invalid */ }/* Handle attribute value** ----------------------*/PRIVATE void handle_attribute_value (HTStream * context) { /* Deal with attributes only if tag is known, ignore silently otherwise */ if (context->current_tag) { if (context->current_attribute_number != INVALID) context->value[context->current_attribute_number] = context->token; else { char * data = HTChunk_data(context->string); HTTRACE(SGML_TRACE, "Attribute value %s ignored\n" _ data ? data+context->token : "<null>"); } } context->current_attribute_number = INVALID; /* can't have two assignments! */ }/* Handle entity** -------------**** On entry,** s contains the entity name zero terminated*/PRIVATE void handle_entity (HTStream * context) { const char ** entities = context->dtd->entity_names; const char *s = HTChunk_data(context->string); int high, low, i, diff; for(low=0, high = context->dtd->number_of_entities; high > low ; diff < 0 ? (low = i+1) : (high = i)) { i = (low + (high-low)/2); diff = strcmp(entities[i], s); /* Case sensitive! */ if (diff==0) { /* success: found it */ (*context->actions->put_entity)(context->target, i); return; } } /* If entity string not found */ HTTRACE(SGML_TRACE, "Unknown entity %s\n" _ s); (*context->actions->unparsed_entity) (context->target, HTChunk_data(context->string), HTChunk_size(context->string)); }/* End element** -----------*/PRIVATE void end_element (HTStream * context, HTTag *tag) { HTTRACE(SGML_TRACE, "End </%s>\n" _ tag->name); (*context->actions->end_element) (context->target, tag - context->dtd->tags); }/* Start an element** ----------------*/PRIVATE void start_element (HTStream * context) { int i; char *value[MAX_ATTRIBUTES]; HTTag *tag = context->current_tag; HTTRACE(SGML_TRACE, "Start <%s>\n" _ tag->name); context->contents = tag->contents; /* ** Build the actual pointers to the value strings stored in the ** chunk buffer. (Must use offsets while collecting the values, ** because the string chunk may get resized during the collection ** and potentially relocated). */ for (i = 0; i < MAX_ATTRIBUTES; ++i) value[i] = context->value[i] < 0 ? NULL : HTChunk_data(context->string) + context->value[i]; (*context->actions->start_element) (context->target, tag - context->dtd->tags, context->present, (const char**)value); /* coerce type for think c */ }/* Find Tag in DTD tag list** ------------------------**** On entry,** dtd points to dtd structire including valid tag list** string points to name of tag in question**** On exit,** returns:** NULL tag not found** else address of tag structure in dtd*/PRIVATE HTTag * SGMLFindTag (const SGML_dtd* dtd, const char * string) { int high, low, i, diff; for(low=0, high=dtd->number_of_tags; high > low ; diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */ i = (low + (high-low)/2); diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */ if (diff==0) /* success: found it */ return &dtd->tags[i]; } return NULL; }/*________________________________________________________________________** Public Methods*//* Could check that we are back to bottom of stack! @@ */PRIVATE int SGML_flush (HTStream * context) { return (*context->actions->flush)(context->target); }PRIVATE int SGML_free (HTStream * context) { int status; if ((status = (*context->actions->_free)(context->target)) != HT_OK) return status; HTChunk_delete(context->string); HT_FREE(context); return HT_OK; }PRIVATE int SGML_abort (HTStream * context, HTList * e) { (*context->actions->abort)(context->target, e); HTChunk_delete(context->string); HT_FREE(context); return HT_ERROR; }PRIVATE int SGML_write (HTStream * context, const char * b, int l) { const SGML_dtd *dtd = context->dtd; HTChunk *string = context->string; const char *text = b; int count = 0; while (l-- > 0) { char c = *b++; switch(context->state) { got_element_open: /* ** The label is jumped when the '>' of a the element ** start tag has been detected. This DOES NOT FALL TO ** THE CODE S_after_open, only processes the tag and ** sets the state (c should still contain the ** terminating character of the tag ('>')) */ if (context->current_tag && context->current_tag->name) start_element(context); context->state = S_after_open; break; case S_after_open: /* ** State S_after_open is entered only for single ** character after the element opening tag to test ** against newline. Strip one trainling newline only ** after opening nonempty element. - SGML: Ugh! */ text = b; count = 0; if (c == '\n' && (context->contents != SGML_EMPTY)) { context->state = S_text; break; } --text; goto S_text; S_text: context->state = S_text; case S_text:#ifdef ISO_2022_JP if (c == '\033') { context->state = S_esc; ++count; break; }#endif /* ISO_2022_JP */ if (c == '&') { if (count > 0) PUTB(text, count); count = 0; HTChunk_clear(string); context->state = S_ero; } else if (c == '<') { if (count > 0) PUTB(text, count); count = 0; HTChunk_clear(string); /* should scrap LITERAL, and use CDATA and RCDATA -- msa */ context->state = (context->contents == SGML_LITERAL) ? S_literal : S_tag; } else if (c == '\n') /* Newline - ignore if before end tag! */ context->state = S_nl; else ++count; break; case S_nl: if (c == '<') { if (count > 0) PUTB(text, count); count = 0; HTChunk_clear(string); context->state = (context->contents == SGML_LITERAL) ? S_literal : S_nl_tago; } else { ++count; goto S_text; } break; case S_nl_tago: /* Had newline and tag opener */ if (c != '/') PUTC('\n'); /* Only ignore newline before </ */ context->state = S_tag; goto handle_S_tag;#ifdef ISO_2022_JP case S_esc: if (c=='$') context->state = S_dollar; else if (c=='(') context->state = S_paren; else context->state = S_text; ++count; break; case S_dollar: if (c=='@' || c=='B') context->state = S_nonascii_text; else context->state = S_text; ++count; break; case S_paren: if (c=='B' || c=='J') context->state = S_text; else context->state = S_text; ++count; break; case S_nonascii_text: if (c == '\033') context->state = S_esc; ++count; break;#endif /* ISO_2022_JP */ /* In literal mode, waits only for specific end tag! ** Only foir compatibility with old servers. */ case S_literal: HTChunk_putc(string, c); if ( TOUPPER(c) != ((HTChunk_size(string) == 1) ? '/' : context->current_tag->name[HTChunk_size(string)-2])) { /* If complete match, end literal */ if ((c == '>') && (!context->current_tag->name[HTChunk_size(string)-2])) { end_element (context,context->current_tag); /* ...setting SGML_MIXED below is a bit of kludge, but a good guess that currently works, anything other than SGML_LITERAL would work... -- msa */ context->contents = SGML_MIXED; } else { /* If Mismatch: recover string. */ PUTC( '<'); PUTB(HTChunk_data(string), HTChunk_size(string)); } context->state = S_text; text = b; count = 0; } break; /*
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -