htmlparser.c

来自「微型浏览器」· C语言 代码 · 共 1,021 行 · 第 1/2 页

C
1,021
字号
/******************************************************************************* * * htmlparser.c - HTML parsing engine * * Original code taken from libhtmlparse by Mooneer Salem * (mooneer@translator.cx) http://msalem.translator.cx/libhtmlparse.html *  * Completely butchered by Garett Spencley for the Cheetah Web Browser. * * Cheetah Web Browser * Copyright (C) 2001 Garett Spencley  *  * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  * *******************************************************************************/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <unistd.h>#include <pthread.h>#include "htmltokenizer.h"#include "htmlparser.h"#include "htmltag.h"#include "debug.h"#include "error.h"#include "entity.h"#include "html.h"#include "dw_page.h"#include "cheetah.h"#include "dw_gtk_scrolled_window.h"#include "http.h"#include "progress.h"/* * html_push_tag() - add a tag to the stack */__inline void html_push_tag(html_t *html, unsigned int tag){	int num_items;	num_items = html->stack_top + 1;	html->stack = (tag_info_t *)realloc(html->stack, sizeof(*html->stack) * html->stack_max + num_items);	html->stack[num_items] = html->stack[num_items - 1];	html->stack[num_items].tag  = tag;	html->stack_top = num_items;	a_Dw_style_ref (html->stack[html->stack_top].style);}/* * html_pop_tag() - remove a tack from the stack */__inline void html_pop_tag(html_t *html, unsigned int tag) {	register int i;	for(i = html->stack_top; i > 0; i--) {		if(html->stack[i].tag == tag) {			while(html->stack_top >= i) {				a_Dw_style_unref(html->stack[html->stack_top].style);				html->stack_top--;			}		}	}}/* * html_seek_tag() - returns the first position of tag on the stack */__inline int html_seek_tag(html_t *html, unsigned int tag){	register int i;	for(i = 0; i <= html->stack_top; i++)		if(html->stack[i].tag == tag)			return i;	return -1;}/* * process_opening_tag() */__inline int process_opening_tag(html_t *html, char *tag, html_tag_args *args){	html_token *token;	register int result = 0;	token = get_tag(tag);	if(!token) {		debug_print("Unsupported tag: '%s'", tag);		return 0;	}	if(token->tag_func)		result = token->tag_func(html, tag, args, HTML_TAG_OPEN);	return result;}/* * process_closing_tag() */__inline int process_closing_tag(html_t *html, char *tag){	html_token *token;	register int result = 0;	token = get_tag(tag);	if(!token) {		debug_print("Unsupported tag: '%s'", tag);		return 0;	}	if(token->tag_func)		result = token->tag_func(html, tag, NULL, HTML_TAG_CLOSE);	return result;}/* * process_dtd() */int process_dtd(html_t *html, char *tag, html_tag_args *args){	return 0;}/* * set_page_title() - set the page title */__inline void set_page_title(html_t *html, const char *title){	char *string;	size_t len;	len = strlen(CHEETAH_WINDOW_TITLE) + strlen(title) + 5;	string = (char *)malloc(sizeof(char) * len);	if(!string)		return;	snprintf(string, len-1, "%s - %s", title, CHEETAH_WINDOW_TITLE);	set_window_title(html->cw, string);	free(string);}/* * is_empty() - returns 1 if text is just whitespace, 0 otherwise */__inline static int is_empty(const char *text){	const char *tmp = text;	while(*tmp) 		if(!isspace(*tmp++))			return 0;	return 1;}/* * add_preformatted_text() - adds text while in <PRE> tag. */__inline void add_preformatted_text(html_t *html, DwStyle *style, const char *text){	char *word;	char *tab = "        "; /* 8 spaces */	int i = 0; 	#define add_word() \	if(i > 0) { \		word[i] = 0; \		i = 0; \		a_Dw_page_add_text((DwPage *)html->dw, g_strdup(word), style); \	}	word = (char *)malloc(sizeof(char) * strlen(text) + 1);	if(!word) 		return;	while(1) {				switch(*text) {		case 32:			add_word();			a_Dw_page_add_space((DwPage *)html->dw, style);			break;		case 0:			add_word();			free(word);			return;		case '\n':			add_word();			if(*(text + 1) == '\n')				a_Dw_page_parbreak((DwPage *)html->dw, 9);			else				a_Dw_page_linebreak((DwPage *)html->dw);			break;		case '\t':			/* W3C says that tabs should be ignored. But, lot's			 * of pages use tabs when displaying sample code. What			 * to do? */			a_Dw_page_add_text((DwPage *)html->dw, g_strdup(tab), style);			break;		default:			word[i++] = *text;			break;		}		++text;	}}/* * add_text() - adds text word by word.  */__inline void add_text(html_t *html, DwStyle *style, const char *text){	char word[32], *p;		if(html->preformatted) {		add_preformatted_text(html, style, text);		return;	}	p = word;				if(is_empty(text))		return;	if(isspace(*text))		a_Dw_page_add_space((DwPage *)html->dw, style);	while(1) {			if(isspace(*text) || *text == 0) {			*p = 0;			p = word;			a_Dw_page_add_text((DwPage *)html->dw, g_strdup(word), style);			if(*text == 0) 				break;			if(*text == 32)				a_Dw_page_add_space((DwPage *)html->dw, style);			++text;		}		*p++ = *text++;	}}/* * process_text() - process any text */int process_text(html_t *html, char *text){	int i;	register gboolean parbreak = FALSE;	for(i = 0; i <= html->stack_top; i++) {		if(html->stack[i].tag == TITLE) {			set_page_title(html, text);			return 0;		}		switch(html->stack[i].tag) {		case HEADER:			parbreak = TRUE;			break;		case ADDRESS:			parbreak = TRUE;			break;		}	}	add_text(html, html->stack[html->stack_top].style, text);	/* Parbreak after headers and address */	if(parbreak) 		a_Dw_page_parbreak((DwPage *)html->dw, 9);	return 0;}/* * parse_text() - parse text between tags */__inline const char *parse_text(html_t *ht, const char *html){	char *tmp;	const char *tmp2;	/* Inter-word spacings are allowed, all other whitespace is ignored.	 * So if the first char is a space (after a closing tag) then add	 * _one_ space and ignore the rest */	if(isspace(*html))		a_Dw_page_add_space((DwPage *)ht->dw, ht->stack[ht->stack_top].style);	while(*html && isspace(*html)) 		++html;	if (*html == '<')		return html;	tmp2 = html;	while (*html && *html != '<')		++html;	tmp = (char *)malloc(sizeof(char) * (html - tmp2 + 1));	if (!tmp)		return "";	strncpy(tmp, tmp2, html - tmp2);	tmp[html - tmp2] = 0;	if (strlen(tmp) > 0) 		parse_for_entities(ht, tmp);	free(tmp);	if (*(html + 1) == '>')		html += 2;	return html;}/* * parse_comment() - parse comment tags */__inline const char *parse_comment(html_t *ht, const char *html){	char *tmp;	const char *tmp2;	while (*html == '-' || isspace(*html))		++html;	tmp2 = html;	while (*html && !(*html == '-' && *(html + 1) == '-' && *(html + 2) == '>'))		++html;	tmp = (char *)malloc(sizeof(char) * (html - tmp2 + 1));	if (!tmp)		return "";	strncpy(tmp, tmp2, html - tmp2);	tmp[html - tmp2] = 0;	if (*(html + 3))		html += 3;	free(tmp);	return html;}/* * parse_closing_tag() - parse closing tags */__inline const char *parse_closing_tag(html_t *ht, const char *html){	char *tmp;	const char *tmp2;	register int ret = 0;	++html;	tmp2 = html;	while (*html && *html != '>')		++html;	tmp = (char *)malloc(sizeof(char) * (html - tmp2 + 1));	if (!tmp)		return "";	strncpy(tmp, tmp2, html - tmp2);	tmp[html - tmp2] = 0;	ret = process_closing_tag(ht, tmp);	if (ret) {		free(tmp);		return "";	}	if (*html == '>')		++html;	free(tmp);	return html;}/* * parse_opening_tag() - parse an opening tag */__inline const char *parse_opening_tag(html_t *ht, const char *html){	char *tag, *rest;	const char *tmp;	register int ret = 0;	html_tag_args *args;	/* First extract the tag */	tmp = html;	while (*html && !isspace(*html) && *html != '>') 			++html;	tag = (char *)malloc(sizeof(char) * (html - tmp + 1));	if(!tag) 		return "";	strncpy(tag, tmp, html - tmp);	tag[html - tmp] = 0;	if (*html == '>') {		ret = process_opening_tag(ht, tag, NULL);				if(*html == '>')			++html;		free(tag);		return ret ? "" : html;	}	/* Now extract the args */	while (*html && isspace(*html))		++html;	tmp = html;	while (*html && *html != '>')			++html;	rest = (char *)malloc(sizeof(char) * (html - tmp + 1));	if(!rest) {		free(tag);		return "";	}	strncpy(rest, tmp, html - tmp);	rest[html - tmp] = 0;	args = create_tag_args(rest);	if(!args) {		free(tag);		free(rest);		return "";	} 	ret = process_opening_tag(ht, tag, args);	if(*html == '>') 		++html;		free(tag);	free(rest);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?