htmlparser.c
来自「微型浏览器」· C语言 代码 · 共 1,021 行 · 第 1/2 页
C
1,021 行
/******************************************************************************* * * htmlparser.c - HTML parsing engine * * Original code taken from libhtmlparse by Mooneer Salem * (mooneer@translator.cx) http://msalem.translator.cx/libhtmlparse.html * * Completely butchered by Garett Spencley for the Cheetah Web Browser. * * Cheetah Web Browser * Copyright (C) 2001 Garett Spencley * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * *******************************************************************************/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <unistd.h>#include <pthread.h>#include "htmltokenizer.h"#include "htmlparser.h"#include "htmltag.h"#include "debug.h"#include "error.h"#include "entity.h"#include "html.h"#include "dw_page.h"#include "cheetah.h"#include "dw_gtk_scrolled_window.h"#include "http.h"#include "progress.h"/* * html_push_tag() - add a tag to the stack */__inline void html_push_tag(html_t *html, unsigned int tag){ int num_items; num_items = html->stack_top + 1; html->stack = (tag_info_t *)realloc(html->stack, sizeof(*html->stack) * html->stack_max + num_items); html->stack[num_items] = html->stack[num_items - 1]; html->stack[num_items].tag = tag; html->stack_top = num_items; a_Dw_style_ref (html->stack[html->stack_top].style);}/* * html_pop_tag() - remove a tack from the stack */__inline void html_pop_tag(html_t *html, unsigned int tag) { register int i; for(i = html->stack_top; i > 0; i--) { if(html->stack[i].tag == tag) { while(html->stack_top >= i) { a_Dw_style_unref(html->stack[html->stack_top].style); html->stack_top--; } } }}/* * html_seek_tag() - returns the first position of tag on the stack */__inline int html_seek_tag(html_t *html, unsigned int tag){ register int i; for(i = 0; i <= html->stack_top; i++) if(html->stack[i].tag == tag) return i; return -1;}/* * process_opening_tag() */__inline int process_opening_tag(html_t *html, char *tag, html_tag_args *args){ html_token *token; register int result = 0; token = get_tag(tag); if(!token) { debug_print("Unsupported tag: '%s'", tag); return 0; } if(token->tag_func) result = token->tag_func(html, tag, args, HTML_TAG_OPEN); return result;}/* * process_closing_tag() */__inline int process_closing_tag(html_t *html, char *tag){ html_token *token; register int result = 0; token = get_tag(tag); if(!token) { debug_print("Unsupported tag: '%s'", tag); return 0; } if(token->tag_func) result = token->tag_func(html, tag, NULL, HTML_TAG_CLOSE); return result;}/* * process_dtd() */int process_dtd(html_t *html, char *tag, html_tag_args *args){ return 0;}/* * set_page_title() - set the page title */__inline void set_page_title(html_t *html, const char *title){ char *string; size_t len; len = strlen(CHEETAH_WINDOW_TITLE) + strlen(title) + 5; string = (char *)malloc(sizeof(char) * len); if(!string) return; snprintf(string, len-1, "%s - %s", title, CHEETAH_WINDOW_TITLE); set_window_title(html->cw, string); free(string);}/* * is_empty() - returns 1 if text is just whitespace, 0 otherwise */__inline static int is_empty(const char *text){ const char *tmp = text; while(*tmp) if(!isspace(*tmp++)) return 0; return 1;}/* * add_preformatted_text() - adds text while in <PRE> tag. */__inline void add_preformatted_text(html_t *html, DwStyle *style, const char *text){ char *word; char *tab = " "; /* 8 spaces */ int i = 0; #define add_word() \ if(i > 0) { \ word[i] = 0; \ i = 0; \ a_Dw_page_add_text((DwPage *)html->dw, g_strdup(word), style); \ } word = (char *)malloc(sizeof(char) * strlen(text) + 1); if(!word) return; while(1) { switch(*text) { case 32: add_word(); a_Dw_page_add_space((DwPage *)html->dw, style); break; case 0: add_word(); free(word); return; case '\n': add_word(); if(*(text + 1) == '\n') a_Dw_page_parbreak((DwPage *)html->dw, 9); else a_Dw_page_linebreak((DwPage *)html->dw); break; case '\t': /* W3C says that tabs should be ignored. But, lot's * of pages use tabs when displaying sample code. What * to do? */ a_Dw_page_add_text((DwPage *)html->dw, g_strdup(tab), style); break; default: word[i++] = *text; break; } ++text; }}/* * add_text() - adds text word by word. */__inline void add_text(html_t *html, DwStyle *style, const char *text){ char word[32], *p; if(html->preformatted) { add_preformatted_text(html, style, text); return; } p = word; if(is_empty(text)) return; if(isspace(*text)) a_Dw_page_add_space((DwPage *)html->dw, style); while(1) { if(isspace(*text) || *text == 0) { *p = 0; p = word; a_Dw_page_add_text((DwPage *)html->dw, g_strdup(word), style); if(*text == 0) break; if(*text == 32) a_Dw_page_add_space((DwPage *)html->dw, style); ++text; } *p++ = *text++; }}/* * process_text() - process any text */int process_text(html_t *html, char *text){ int i; register gboolean parbreak = FALSE; for(i = 0; i <= html->stack_top; i++) { if(html->stack[i].tag == TITLE) { set_page_title(html, text); return 0; } switch(html->stack[i].tag) { case HEADER: parbreak = TRUE; break; case ADDRESS: parbreak = TRUE; break; } } add_text(html, html->stack[html->stack_top].style, text); /* Parbreak after headers and address */ if(parbreak) a_Dw_page_parbreak((DwPage *)html->dw, 9); return 0;}/* * parse_text() - parse text between tags */__inline const char *parse_text(html_t *ht, const char *html){ char *tmp; const char *tmp2; /* Inter-word spacings are allowed, all other whitespace is ignored. * So if the first char is a space (after a closing tag) then add * _one_ space and ignore the rest */ if(isspace(*html)) a_Dw_page_add_space((DwPage *)ht->dw, ht->stack[ht->stack_top].style); while(*html && isspace(*html)) ++html; if (*html == '<') return html; tmp2 = html; while (*html && *html != '<') ++html; tmp = (char *)malloc(sizeof(char) * (html - tmp2 + 1)); if (!tmp) return ""; strncpy(tmp, tmp2, html - tmp2); tmp[html - tmp2] = 0; if (strlen(tmp) > 0) parse_for_entities(ht, tmp); free(tmp); if (*(html + 1) == '>') html += 2; return html;}/* * parse_comment() - parse comment tags */__inline const char *parse_comment(html_t *ht, const char *html){ char *tmp; const char *tmp2; while (*html == '-' || isspace(*html)) ++html; tmp2 = html; while (*html && !(*html == '-' && *(html + 1) == '-' && *(html + 2) == '>')) ++html; tmp = (char *)malloc(sizeof(char) * (html - tmp2 + 1)); if (!tmp) return ""; strncpy(tmp, tmp2, html - tmp2); tmp[html - tmp2] = 0; if (*(html + 3)) html += 3; free(tmp); return html;}/* * parse_closing_tag() - parse closing tags */__inline const char *parse_closing_tag(html_t *ht, const char *html){ char *tmp; const char *tmp2; register int ret = 0; ++html; tmp2 = html; while (*html && *html != '>') ++html; tmp = (char *)malloc(sizeof(char) * (html - tmp2 + 1)); if (!tmp) return ""; strncpy(tmp, tmp2, html - tmp2); tmp[html - tmp2] = 0; ret = process_closing_tag(ht, tmp); if (ret) { free(tmp); return ""; } if (*html == '>') ++html; free(tmp); return html;}/* * parse_opening_tag() - parse an opening tag */__inline const char *parse_opening_tag(html_t *ht, const char *html){ char *tag, *rest; const char *tmp; register int ret = 0; html_tag_args *args; /* First extract the tag */ tmp = html; while (*html && !isspace(*html) && *html != '>') ++html; tag = (char *)malloc(sizeof(char) * (html - tmp + 1)); if(!tag) return ""; strncpy(tag, tmp, html - tmp); tag[html - tmp] = 0; if (*html == '>') { ret = process_opening_tag(ht, tag, NULL); if(*html == '>') ++html; free(tag); return ret ? "" : html; } /* Now extract the args */ while (*html && isspace(*html)) ++html; tmp = html; while (*html && *html != '>') ++html; rest = (char *)malloc(sizeof(char) * (html - tmp + 1)); if(!rest) { free(tag); return ""; } strncpy(rest, tmp, html - tmp); rest[html - tmp] = 0; args = create_tag_args(rest); if(!args) { free(tag); free(rest); return ""; } ret = process_opening_tag(ht, tag, args); if(*html == '>') ++html; free(tag); free(rest);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?