📄 htmlparse.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
	while (*ptr != '\0')	{		if (*ptr == '<')		{			if (isalpha((int)(*(ptr + 1))))			{				break;			}			else if (*(ptr + 1) == '/')			{				if (isalpha((int)(*(ptr + 2))))				{					break;				}			}			else if (*(ptr + 1) == '!')  /* a comment */			{				break;			}		}		ptr++;	}	*endp = ptr;	if (ptr == start)	{		return(NULL);	}	/*	 * Copy the text into its own buffer, and clean it	 * of escape sequences.	 */	tchar = *ptr;	*ptr = '\0';	text = (char *)malloc(strlen(start) + 1);	if (text == NULL)	{		errorlog("Cannot malloc space for text\n");		*ptr = tchar;		return(NULL);	}	strcpy(text, start);	*ptr = tchar;	clean_text(text);	return(text);}/* * Get the mark text between '<' and '>'.  From the text, determine * its type, and fill in a mark_up structure to return.  Also returns * endp pointing to the ttrailing '>' in the original string. */struct mark_up *get_mark(start, endp)	char *start;	char **endp;{	char *ptr;	char *text;	char tchar;	struct mark_up *mark;	int  comment=0;       /* amb - comment==1 if we are in a comment */	char *first_gt=NULL;  /* keep track of ">" for old broken comments */	if (start == NULL)	{		return(NULL);	}	if (*start != '<')	{		return(NULL);	}	/* amb - check if we are in a comment, start tag is <!-- */	if (strncmp (start, "<!--", 4)==0)	  comment=1;	start++;	first_gt = NULL;	mark = (struct mark_up *)malloc(sizeof(struct mark_up));	if (mark == NULL)	{	        errorlog("Cannot malloc space for mark_up struct\n");		return(NULL);	}	/*	 * Grab the mark text	 */	ptr = start;	/* amb - skip over the comment text */	/* end tag is --*>, where * is zero or more spaces (ugh) */	if (comment)	  {	    while (*ptr != '\0')	      {		if ( (*ptr == '>') && (!first_gt) )		  first_gt = ptr;		if (strncmp (ptr, "--", 2) == 0)   /* found double dash (--) */		  {		    ptr += 2;		    while ((*ptr != '\0') && ((*ptr == ' ') || (*ptr == '\n')					      || (*ptr == '-') ))		      ptr++;                   /* skip spaces and newlines */		    if (*ptr == '>')                /* completed 
end comment */		      {			*endp = ptr;			mark->is_end = 1;			mark->type = M_COMMENT;			mark->start = NULL;			mark->text = NULL;			mark->end = NULL;			mark->next = NULL;			return(mark);		      }		  }		else                         /* if no double dash (--) found */		  ptr++;	      }	    /* if we get here, this document must use the old broken	       comment style */	    if (first_gt) {		ptr = first_gt;	    }	  } /* end of: if (comment) */	while (ptr&&(*ptr != '>')&&(*ptr != '\0'))	{		ptr++;	}	if (ptr) {		*endp=ptr;	}	else {		return(NULL); /*only if EOF and no close comment -- SWP*/	}	if (*ptr != '>')	{#ifdef VERBOSE		errorlog(stderr, "error: bad mark format\n");#endif		return(NULL);	}	/*	 * Copy the mark text to its own buffer, and	 * clean it of escapes, and odd white space.	 */	tchar = *ptr;	*ptr = '\0';	text = (char *)malloc(strlen(start) + 1);	if (text == NULL)	{	        errorlog("Cannot malloc space for mark\n");		*ptr = tchar;		return(NULL);	}	strcpy(text, start);	*ptr = tchar;	clean_text(text);/* * No longer needed because the parsing code is now smarter *	clean_white_space(text); * */	/*	 * Set whether this is the start or end of a mark	 * block, as well as determining its type.	 */	if (*text == '/')	{		mark->is_end = 1;		mark->type = ParseMarkType((char *)(text + 1));		mark->start = NULL;		mark->text = NULL;		mark->end = text;	}	else	{		mark->is_end = 0;		mark->type = ParseMarkType(text);		mark->start = text;		mark->text = NULL;		mark->end = NULL;	}	mark->text = NULL;	mark->next = NULL;	return(mark);}/* * Special version of get_text.  It reads all text up to the * end of the plain text mark, or the end of the file. */char *get_plain_text(start, endp)	char *start;	char **endp;{	char *ptr;	char *text;	char tchar;	if (start == NULL)	{		return(NULL);	}	/*	 * Read until stopped by end plain text mark.	 */	ptr = start;	while (*ptr != '\0')	{		/*		 * Beginning of a mark is '<' followed by any letter,		 * or followed by '!' 
for a comment,		 * or '</' followed by any letter.		 */		if ((*ptr == '<')&&			((isalpha((int)(*(ptr + 1))))||			(*(ptr + 1) == '!')||			((*(ptr + 1) == '/')&&(isalpha((int)(*(ptr + 2)))))))		{			struct mark_up *mp;			char *ep;			/*			 * We think we found a mark.  If it is the			 * end of plain text, break out			 */			mp = get_mark(ptr, &ep);			if (mp != NULL)			{				if (((mp->type == M_PLAIN_TEXT)||				    (mp->type == M_LISTING_TEXT))&&(mp->is_end))				{					if (mp->end != NULL)					{						free((char *)mp->end);					}					free((char *)mp);					break;				}				if (mp->start != NULL)				{					free((char *)mp->start);				}				if (mp->end != NULL)				{					free((char *)mp->end);				}				free((char *)mp);			}		}		ptr++;	}	*endp = ptr;	if (ptr == start)	{		return(NULL);	}	/*	 * Copy text to its own malloced buffer, and clean it of	 * HTML escapes.	 */	tchar = *ptr;	*ptr = '\0';	text = (char *)malloc(strlen(start) + 1);	if (text == NULL)	{		errorlog("Cannot malloc space for text\n");		*ptr = tchar;		return(NULL);	}	strcpy(text, start);	*ptr = tchar;	clean_text(text);	return(text);}static char *atts[]={"text","bgcolor","alink","vlink","link",NULL};/* * Main parser of HTML text.  Takes raw text, and produces a linked * list of mark objects.  Mark objects are either text strings, or * starting and ending mark delimiters. * The old list is passed in so it can be freed, and in the future we * may want to add code to append to the old list. */struct mark_up *HTMLParse(old_list, str, hw)	struct mark_up *old_list;	char *str;        void *hw;{	int preformat;	char *start, *end;	char *text, *tptr;	struct mark_up *mark;	struct mark_up *list;	struct mark_up *current;	preformat = 0;	/*	 * Free up the previous Object List if one exists	 */	FreeObjList(old_list);	if (str == NULL)	{		return(NULL);	}	list = NULL;	current = NULL;	start = str;	end = str;	mark = NULL;	while (*start != '\0')	{		/*		 * Get some text (if any).  
If our last mark was		 * a begin plain text we call different function		 * If last mark was <PLAINTEXT> we lump all the rest of		 * the text in.		 */		if ((mark != NULL)&&(mark->type == M_PLAIN_FILE)&&			(!mark->is_end))		{			text = start;			end = text;			while (*end != '\0')			{				end++;			}			/*			 * Copy text to its own malloced buffer, and clean it of			 * HTML escapes.			 */			tptr = (char *)malloc(strlen(text) + 1);			if (tptr == NULL)			{			        errorlog("Cannot malloc space for text\n");				return(list);			}			strcpy(tptr, text);			text = tptr;		}		else if ((mark != NULL)&&			 ((mark->type == M_PLAIN_TEXT)||			  (mark->type == M_LISTING_TEXT))&&			 (!mark->is_end))		{			text = get_plain_text(start, &end);		}		else		{			text = get_text(start, &end);		}		/*		 * If text is OK, put it into a mark structure, and add		 * it to the linked list.		 */		if (text == NULL)		{			if (start != end)			{			       	errorlog("error parsing text, bailing out\n");				return(list);			}		}		else		{			mark = (struct mark_up *)malloc(sizeof(struct mark_up));			if (mark == NULL)			{				errorlog("Cannot malloc for mark_up struct\n");				return(list);			}			mark->type = M_NONE;			mark->is_end = 0;			mark->start = NULL;			mark->text = text;			mark->end = NULL;			mark->next = NULL;			current = AddObj(&list, current, mark, preformat);		}		start = end;		if (*start == '\0')		{			break;		}		/*		 * Get the next mark if any, and if it is		 * valid, add it to the linked list.		 */		mark = get_mark(start, &end);		if (mark == NULL)		{			if (start != end)			{			        errorlog("error parsing mark, bailing out\n");				return(list);			}		}		else		{		    mark->next = NULL;                    current = AddObj(&list, current, mark, preformat);		}		start = (char *)(end + 1);		if ((mark != NULL)&&(mark->type == M_PLAIN_FILE)&&			(!mark->is_end))		{			/*			 * A linefeed immediately after the <PLAINTEXT>			 * mark is to be ignored.			 
*/			if (*start == '\n')			{				start++;			}		}		else if ((mark != NULL)&&((mark->type == M_PLAIN_TEXT)||			(mark->type == M_LISTING_TEXT))&&			(!mark->is_end))		{			/*			 * A linefeed immediately after the <XMP>			 * or <LISTING> mark is to be ignored.			 */			if (*start == '\n')			{				start++;			}		}		/*		 * If we are parsing pre-formatted text we need to set a		 * flag so we don't throw out needed linefeeds.		 */		else if ((mark != NULL)&&(mark->type == M_PREFORMAT))		{			if (mark->is_end)			{				preformat = 0;			}			else			{				preformat = 1;				/*				 * A linefeed immediately after the <PRE>				 * mark is to be ignored.				 */				if (*start == '\n')				{					start++;				}			}		}	}	return(list);}/* * Determine mark type from the identifying string passed */intParseMarkType(str)	char *str;{	int type;	char *tptr;	char tchar;	if (str == NULL)	{		return(M_NONE);	}	type = M_UNKNOWN;	tptr = str;	while (*tptr != '\0')	{		if (isspace((int)*tptr))		{			break;		}		tptr++;	}	tchar = *tptr;	*tptr = '\0';	if (caseless_equal(str, MT_ANCHOR))	{		type = M_ANCHOR;	}	else if (caseless_equal(str, MT_FRAME))	{		type = M_FRAME;	}	else if (caseless_equal(str, MT_AREA))	{	        type = M_AREA;        }	else if (caseless_equal(str, MT_TITLE))	{
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -