📄 htmlparse.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
static char rcsid[]="$Id: HTMLparse.c,v 2.4 2000/02/03 12:45:56 sxw Exp $";/*   * HTMLparse.c taken from Mosaic 2.7b5 with some alterations for Harvest * /**************************************************************************** * NCSA Mosaic for the X Window System                                      * * Software Development Group                                               * * National Center for Supercomputing Applications                          * * University of Illinois at Urbana-Champaign                               * * 605 E. Springfield, Champaign IL 61820                                   * * mosaic@ncsa.uiuc.edu                                                     * *                                                                          * * Copyright (C) 1993, Board of Trustees of the University of Illinois      * *                                                                          * * NCSA Mosaic software, both binary and source (hereafter, Software) is    * * copyrighted by The Board of Trustees of the University of Illinois       * * (UI), and ownership remains with the UI.                                 * *                                                                          * * The UI grants you (hereafter, Licensee) a license to use the Software    * * for academic, research and internal business purposes only, without a    * * fee.  Licensee may distribute the binary and source code (if released)   * * to third parties provided that the copyright notice and this statement   * * appears on all copies and that no charge is associated with such         * * copies.                                                                  * *                                                                          * * Licensee may make derivative works.  
However, if Licensee distributes    * * any derivative work based on or derived from the Software, then          * * Licensee will (1) notify NCSA regarding its distribution of the          * * derivative work, and (2) clearly notify users that such derivative       * * work is a modified version and not the original NCSA Mosaic              * * distributed by the UI.                                                   * *                                                                          * * Any Licensee wishing to make commercial use of the Software should       * * contact the UI, c/o NCSA, to negotiate an appropriate license for such   * * commercial use.  Commercial use includes (1) integration of all or       * * part of the source code into a product for sale or license by or on      * * behalf of Licensee to third parties, or (2) distribution of the binary   * * code or source code to third parties that need it to utilize a           * * commercial product sold or licensed by or on behalf of Licensee.         * *                                                                          * * UI MAKES NO REPRESENTATIONS ABOUT THE SUITABILITY OF THIS SOFTWARE FOR   * * ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED          * * WARRANTY.  THE UI SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY THE    * * USERS OF THIS SOFTWARE.                                                  * *                                                                          * * By using or copying this Software, Licensee agrees to abide by the       * * copyright law and all other applicable laws of the U.S. including, but   * * not limited to, export control laws, and the terms of this license.      * * UI shall have the right to terminate this license immediately by         * * written notice upon Licensee's breach of, or non-compliance with, any    * * of its terms.  
Licensee may be held legally responsible for any          * * copyright infringement that is caused or encouraged by Licensee's        * * failure to abide by the terms of this license.                           * *                                                                          * * Comments and questions are welcome and can be sent to                    * * mosaic-x@ncsa.uiuc.edu.                                                  * ****************************************************************************/#ifdef TIMING#include <sys/time.h>struct timeval Tv;struct timezone Tz;#endif#include <stdio.h>#include <ctype.h>#include <stdlib.h>#include <string.h>#include "HTML.h"#include "util.h"static void FreeObjList();static struct mark_up *AddObj();char *ParseMarkTag();extern int tableSupportEnabled;#ifdef NOT_ASCII#define TOLOWER(x)	(tolower(x))#else/* * A hack to speed up caseless_equal.  Thanks to Quincey Koziol for * developing it for me */unsigned char map_table[256]={    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,    24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,    45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,97,98,    99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,    116,117,118,119,120,121,122,91,92,93,94,95,96,97,98,99,100,101,102,    103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,    120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,    137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,    154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,    171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,    188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,    205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,    222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,    239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255};#define 
TOLOWER(x)	(map_table[x])#endif /* NOT_ASCII *//* * Check if two strings are equal, ignoring case. * The strings must be of the same length to be equal. * return 1 if equal, 0 otherwise. */intcaseless_equal(str1, str2)	char *str1;	char *str2;{	if ((str1 == NULL)||(str2 == NULL))	{		return(0);	}	while ((*str1 != '\0')&&(*str2 != '\0'))	{		if (TOLOWER(*str1) != TOLOWER(*str2))		{			return(0);		}		str1++;		str2++;	}	if ((*str1 == '\0')&&(*str2 == '\0'))	{		return(1);	}	else	{		return(0);	}}/* * Check if two strings are equal in the first count characters, ignoring case. * The strings must both be at least of length count to be equal. * return 1 if equal, 0 otherwise. */intcaseless_equal_prefix(str1, str2, cnt)	char *str1;	char *str2;	int cnt;{	int i;	if ((str1 == NULL)||(str2 == NULL))	{		return(0);	}	if (cnt < 1)	{		return(1);	}	for (i=0; i < cnt; i++)	{		if (TOLOWER(*str1) != TOLOWER(*str2))		{			return(0);		}		str1++;		str2++;	}	return(1);}/* * Clean up the white space in a string. * Remove all leading and trailing whitespace, and turn all * internal whitespace into single spaces separating words. * The cleaning is done by rearranging the chars in the passed * txt buffer.  The resultant string will probably be shorter, * it can never get longer. */voidclean_white_space(txt)	char *txt;{	char *ptr;	char *start;	start = txt;	ptr = txt;	/*	 * Remove leading white space	 */	while (isspace((int)*ptr))	{		ptr++;	}	/*	 * find a word, copying if we removed some space already	 */	if (start == ptr)	{		while ((!isspace((int)*ptr))&&(*ptr != '\0'))		{			ptr++;		}		start = ptr;	}	else	{		while ((!isspace((int)*ptr))&&(*ptr != '\0'))		{			*start++ = *ptr++;		}	}	while (*ptr != '\0')	{		/*		 * Remove trailing whitespace.		 */		while (isspace((int)*ptr))		{			ptr++;		}		if (*ptr == '\0')		{			break;		}		/*		 * If there are more words, insert a space and if space was		 * removed move up remaining text.		 
*/		*start++ = ' ';		if (start == ptr)		{			while ((!isspace((int)*ptr))&&(*ptr != '\0'))			{				ptr++;			}			start = ptr;		}		else		{			while ((!isspace((int)*ptr))&&(*ptr != '\0'))			{				*start++ = *ptr++;			}		}	}	*start = '\0';}/* * parse an amperstand escape, and return the appropriate character, or * '\0' on error. * we should really only use caseless_equal_prefix for unterminated, and use * caseless_equal otherwise, but since there are so many escapes, and I * don't want to type everything twice, I always use caseless_equal_prefix * Turns out the escapes are case sensitive, use strncmp. * termination states: *	0: terminated with a ';' *	1: unterminated *	2: terminated with whitespace */charExpandEscapes(esc, endp, termination)	char *esc;	char **endp;	int termination;{	int cnt;	char val;	int unterminated;	unterminated = (termination & 0x01);	esc++;	if (*esc == '#')	{		if (unterminated)		{			char *tptr;			char tchar;			tptr = (char *)(esc + 1);			while (isdigit((int)*tptr))			{				tptr++;			}			tchar = *tptr;			*tptr = '\0';			val = (char)atoi((esc + 1));			*tptr = tchar;			*endp = tptr;		}		else		{			val = (char)atoi((esc + 1));			*endp = (char *)(esc + strlen(esc));		}	}	else	{		int escLen, ampLen;		cnt = 0;		escLen = strlen(esc);		while (AmpEscapes[cnt].tag != NULL)		{			ampLen = strlen(AmpEscapes[cnt].tag);			if ((escLen == ampLen) && (strncmp(esc, AmpEscapes[cnt].tag, ampLen) == 0))			{				val = AmpEscapes[cnt].value;				*endp = (char *)(esc +					strlen(AmpEscapes[cnt].tag));				break;			}			cnt++;		}		if (AmpEscapes[cnt].tag == NULL)		{#ifdef VERBOSE		       	errorlog ("Error bad & string\n");#endif			val = '\0';			*endp = (char *)NULL;		}	}	return(val);}/* * Clean the special HTML character escapes out of the text and replace * them with the appropriate characters "&lt;" = "<", "&gt;" = ">", * "&amp;" = "&" * GAG:  apperantly &lt etc. can be left unterminated, what a nightmare. * Ok, better, they have to be terminated with white-space or ';'. 
* the '&' character must be immediately followed by a letter to be * a valid escape sequence.  Other &'s are left alone. * The cleaning is done by rearranging chars in the passed txt buffer. * if any escapes are replaced, the string becomes shorter. */voidclean_text(txt)	char *txt;{	int unterminated;	int space_terminated;	char *ptr;	char *ptr2;	char *start;	char *text;	char *tend;	char tchar;	char val;	if (txt == NULL)	{		return;	}	/*	 * Fix from "Peter J. Scott" <pjs@euclid.Jpl.Nasa.Gov> Jan 8, 1996:	 * Turn carriage returns into spaces.	 */	ptr = txt;	while (*ptr != '\0')	  {	    if (*ptr == '\r')	      *ptr = '\n';	    ptr++;	  }	/*	 * Quick scan to find escape sequences.	 * Escape is '&' followed by a letter (or a hash mark).	 * return if there are none.	 */	ptr = txt;	while (*ptr != '\0')	{		if ((*ptr == '&')&&			((isalpha((int)*(ptr + 1)))||(*(ptr + 1) == '#')))		{			break;		}		ptr++;	}	if (*ptr == '\0')	{		return;	}	/*	 * Loop, replaceing escape sequences, and moving up remaining	 * text.	 */	ptr2 = ptr;	while (*ptr != '\0')	{		unterminated = 0;		space_terminated = 0;		/*		 * Extract the escape sequence from start to ptr		 */		start = ptr;		while ((*ptr != ';')&&(!isspace((int)*ptr))&&(*ptr != '\0'))		{			ptr++;		}		if (*ptr == '\0')		{#ifdef VERBOSE		        errorlog("warning:  unterminated & (%s)\n",start);#endif			unterminated = 1;		}		else if (isspace((int)*ptr))		{			space_terminated = 1;		}		/*		 * Copy the escape sequence into a separate buffer.		 * Then clean spaces so the "& lt ;" = "&lt;" etc.		 * The cleaning should be unnecessary.		 
*/		tchar = *ptr;		*ptr = '\0';		text = (char *)malloc(strlen(start) + 1);		if (text == NULL)		{   		        errorlog("Cannot malloc space for & text\n");			*ptr = tchar;			return;		}		strcpy(text, start);		*ptr = tchar;		clean_white_space(text);		/*		 * Replace escape sequence with appropriate character		 */		val = ExpandEscapes(text, &tend,			((space_terminated << 1) + unterminated));		if (val != '\0')		{			if (unterminated)			{				tchar = *tend;				*tend = '\0';				ptr = (char *)(start + strlen(text) - 1);				*tend = tchar;			}			else if (space_terminated)			{				ptr--;			}			*ptr2 = val;			unterminated = 0;			space_terminated = 0;		}		/*		 * invalid escape sequence. skip it.		 */		else		{#ifdef VERBOSE		errorlog("Error bad & string\n");#endif			ptr = start;			*ptr2 = *ptr;		}		free(text);		/*		 * Copy forward remaining text until you find the next		 * escape sequence		 */		ptr2++;		ptr++;		while (*ptr != '\0')		{			if ((*ptr == '&')&&			    ((isalpha((int)*(ptr + 1)))||(*(ptr + 1) == '#')))			{				break;			}			*ptr2++ = *ptr++;		}	}	*ptr2 = '\0';}/* * Get a block of text from a HTML document. * All text from start to the end, or the first mark * (a mark is '<' or '</' followed by any letter or a '!') * is returned in a malloced buffer.  Also, endp returns * a pointer to the next '<' or '\0' * The returned text has already expanded '&' escapes. */char *get_text(start, endp)	char *start;	char **endp;{	char *ptr;	char *text;	char tchar;	if (start == NULL)	{		return(NULL);	}	/*	 * Copy text up to beginning of a mark, or the end	 */	ptr = start;
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -