📄 htmlparse.c
字号:
while (*ptr != '\0') { if (*ptr == '<') { if (isalpha((int)(*(ptr + 1)))) { break; } else if (*(ptr + 1) == '/') { if (isalpha((int)(*(ptr + 2)))) { break; } } else if (*(ptr + 1) == '!') /* a comment */ { break; } } ptr++; } *endp = ptr; if (ptr == start) { return(NULL); } /* * Copy the text into its own buffer, and clean it * of escape sequences. */ tchar = *ptr; *ptr = '\0'; text = (char *)malloc(strlen(start) + 1); if (text == NULL) { errorlog("Cannot malloc space for text\n"); *ptr = tchar; return(NULL); } strcpy(text, start); *ptr = tchar; clean_text(text); return(text);}/* * Get the mark text between '<' and '>'. From the text, determine * its type, and fill in a mark_up structure to return. Also returns * endp pointing to the ttrailing '>' in the original string. */struct mark_up *get_mark(start, endp) char *start; char **endp;{ char *ptr; char *text; char tchar; struct mark_up *mark; int comment=0; /* amb - comment==1 if we are in a comment */ char *first_gt=NULL; /* keep track of ">" for old broken comments */ if (start == NULL) { return(NULL); } if (*start != '<') { return(NULL); } /* amb - check if we are in a comment, start tag is <!-- */ if (strncmp (start, "<!--", 4)==0) comment=1; start++; first_gt = NULL; mark = (struct mark_up *)malloc(sizeof(struct mark_up)); if (mark == NULL) { errorlog("Cannot malloc space for mark_up struct\n"); return(NULL); } /* * Grab the mark text */ ptr = start; /* amb - skip over the comment text */ /* end tag is --*>, where * is zero or more spaces (ugh) */ if (comment) { while (*ptr != '\0') { if ( (*ptr == '>') && (!first_gt) ) first_gt = ptr; if (strncmp (ptr, "--", 2) == 0) /* found double dash (--) */ { ptr += 2; while ((*ptr != '\0') && ((*ptr == ' ') || (*ptr == '\n') || (*ptr == '-') )) ptr++; /* skip spaces and newlines */ if (*ptr == '>') /* completed end comment */ { *endp = ptr; mark->is_end = 1; mark->type = M_COMMENT; mark->start = NULL; mark->text = NULL; mark->end = NULL; mark->next = NULL; return(mark); } } else /* if no double dash (--) found */ ptr++; } /* if we get here, this document must use the old broken comment style */ if (first_gt) { ptr = first_gt; } } /* end of: if (comment) */ while (ptr&&(*ptr != '>')&&(*ptr != '\0')) { ptr++; } if (ptr) { *endp=ptr; } else { return(NULL); /*only if EOF and no close comment -- SWP*/ } if (*ptr != '>') {#ifdef VERBOSE errorlog(stderr, "error: bad mark format\n");#endif return(NULL); } /* * Copy the mark text to its own buffer, and * clean it of escapes, and odd white space. */ tchar = *ptr; *ptr = '\0'; text = (char *)malloc(strlen(start) + 1); if (text == NULL) { errorlog("Cannot malloc space for mark\n"); *ptr = tchar; return(NULL); } strcpy(text, start); *ptr = tchar; clean_text(text);/* * No longer needed because the parsing code is now smarter * clean_white_space(text); * */ /* * Set whether this is the start or end of a mark * block, as well as determining its type. */ if (*text == '/') { mark->is_end = 1; mark->type = ParseMarkType((char *)(text + 1)); mark->start = NULL; mark->text = NULL; mark->end = text; } else { mark->is_end = 0; mark->type = ParseMarkType(text); mark->start = text; mark->text = NULL; mark->end = NULL; } mark->text = NULL; mark->next = NULL; return(mark);}/* * Special version of get_text. It reads all text up to the * end of the plain text mark, or the end of the file. */char *get_plain_text(start, endp) char *start; char **endp;{ char *ptr; char *text; char tchar; if (start == NULL) { return(NULL); } /* * Read until stopped by end plain text mark. */ ptr = start; while (*ptr != '\0') { /* * Beginning of a mark is '<' followed by any letter, * or followed by '!' for a comment, * or '</' followed by any letter. */ if ((*ptr == '<')&& ((isalpha((int)(*(ptr + 1))))|| (*(ptr + 1) == '!')|| ((*(ptr + 1) == '/')&&(isalpha((int)(*(ptr + 2))))))) { struct mark_up *mp; char *ep; /* * We think we found a mark. If it is the * end of plain text, break out */ mp = get_mark(ptr, &ep); if (mp != NULL) { if (((mp->type == M_PLAIN_TEXT)|| (mp->type == M_LISTING_TEXT))&&(mp->is_end)) { if (mp->end != NULL) { free((char *)mp->end); } free((char *)mp); break; } if (mp->start != NULL) { free((char *)mp->start); } if (mp->end != NULL) { free((char *)mp->end); } free((char *)mp); } } ptr++; } *endp = ptr; if (ptr == start) { return(NULL); } /* * Copy text to its own malloced buffer, and clean it of * HTML escapes. */ tchar = *ptr; *ptr = '\0'; text = (char *)malloc(strlen(start) + 1); if (text == NULL) { errorlog("Cannot malloc space for text\n"); *ptr = tchar; return(NULL); } strcpy(text, start); *ptr = tchar; clean_text(text); return(text);}static char *atts[]={"text","bgcolor","alink","vlink","link",NULL};/* * Main parser of HTML text. Takes raw text, and produces a linked * list of mark objects. Mark objects are either text strings, or * starting and ending mark delimiters. * The old list is passed in so it can be freed, and in the future we * may want to add code to append to the old list. */struct mark_up *HTMLParse(old_list, str, hw) struct mark_up *old_list; char *str; void *hw;{ int preformat; char *start, *end; char *text, *tptr; struct mark_up *mark; struct mark_up *list; struct mark_up *current; preformat = 0; /* * Free up the previous Object List if one exists */ FreeObjList(old_list); if (str == NULL) { return(NULL); } list = NULL; current = NULL; start = str; end = str; mark = NULL; while (*start != '\0') { /* * Get some text (if any). If our last mark was * a begin plain text we call different function * If last mark was <PLAINTEXT> we lump all the rest of * the text in. */ if ((mark != NULL)&&(mark->type == M_PLAIN_FILE)&& (!mark->is_end)) { text = start; end = text; while (*end != '\0') { end++; } /* * Copy text to its own malloced buffer, and clean it of * HTML escapes. */ tptr = (char *)malloc(strlen(text) + 1); if (tptr == NULL) { errorlog("Cannot malloc space for text\n"); return(list); } strcpy(tptr, text); text = tptr; } else if ((mark != NULL)&& ((mark->type == M_PLAIN_TEXT)|| (mark->type == M_LISTING_TEXT))&& (!mark->is_end)) { text = get_plain_text(start, &end); } else { text = get_text(start, &end); } /* * If text is OK, put it into a mark structure, and add * it to the linked list. */ if (text == NULL) { if (start != end) { errorlog("error parsing text, bailing out\n"); return(list); } } else { mark = (struct mark_up *)malloc(sizeof(struct mark_up)); if (mark == NULL) { errorlog("Cannot malloc for mark_up struct\n"); return(list); } mark->type = M_NONE; mark->is_end = 0; mark->start = NULL; mark->text = text; mark->end = NULL; mark->next = NULL; current = AddObj(&list, current, mark, preformat); } start = end; if (*start == '\0') { break; } /* * Get the next mark if any, and if it is * valid, add it to the linked list. */ mark = get_mark(start, &end); if (mark == NULL) { if (start != end) { errorlog("error parsing mark, bailing out\n"); return(list); } } else { mark->next = NULL; current = AddObj(&list, current, mark, preformat); } start = (char *)(end + 1); if ((mark != NULL)&&(mark->type == M_PLAIN_FILE)&& (!mark->is_end)) { /* * A linefeed immediately after the <PLAINTEXT> * mark is to be ignored. */ if (*start == '\n') { start++; } } else if ((mark != NULL)&&((mark->type == M_PLAIN_TEXT)|| (mark->type == M_LISTING_TEXT))&& (!mark->is_end)) { /* * A linefeed immediately after the <XMP> * or <LISTING> mark is to be ignored. */ if (*start == '\n') { start++; } } /* * If we are parsing pre-formatted text we need to set a * flag so we don't throw out needed linefeeds. */ else if ((mark != NULL)&&(mark->type == M_PREFORMAT)) { if (mark->is_end) { preformat = 0; } else { preformat = 1; /* * A linefeed immediately after the <PRE> * mark is to be ignored. */ if (*start == '\n') { start++; } } } } return(list);}/* * Determine mark type from the identifying string passed */intParseMarkType(str) char *str;{ int type; char *tptr; char tchar; if (str == NULL) { return(M_NONE); } type = M_UNKNOWN; tptr = str; while (*tptr != '\0') { if (isspace((int)*tptr)) { break; } tptr++; } tchar = *tptr; *tptr = '\0'; if (caseless_equal(str, MT_ANCHOR)) { type = M_ANCHOR; } else if (caseless_equal(str, MT_FRAME)) { type = M_FRAME; } else if (caseless_equal(str, MT_AREA)) { type = M_AREA; } else if (caseless_equal(str, MT_TITLE)) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -