⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparse.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
		type = M_TITLE;	}	else if (caseless_equal(str, MT_FIXED))	{		type = M_FIXED;	}	else if (caseless_equal(str, MT_BOLD))	{		type = M_BOLD;	}	else if (caseless_equal(str, MT_ITALIC))	{		type = M_ITALIC;	}	else if (caseless_equal(str, MT_EMPHASIZED))	{		type = M_EMPHASIZED;	}	else if (caseless_equal(str, MT_STRONG))	{		type = M_STRONG;	}	else if (caseless_equal(str, MT_CODE))	{		type = M_CODE;	}	else if (caseless_equal(str, MT_SAMPLE))	{		type = M_SAMPLE;	}	else if (caseless_equal(str, MT_KEYBOARD))	{		type = M_KEYBOARD;	}	else if (caseless_equal(str, MT_VARIABLE))	{		type = M_VARIABLE;	}	else if (caseless_equal(str, MT_CITATION))	{		type = M_CITATION;	}	else if (caseless_equal(str, MT_STRIKEOUT))	{		type = M_STRIKEOUT;	}	else if (caseless_equal(str, MT_HEADER_1))	{		type = M_HEADER_1;	}	else if (caseless_equal(str, MT_HEADER_2))	{		type = M_HEADER_2;	}	else if (caseless_equal(str, MT_HEADER_3))	{		type = M_HEADER_3;	}	else if (caseless_equal(str, MT_HEADER_4))	{		type = M_HEADER_4;	}	else if (caseless_equal(str, MT_HEADER_5))	{		type = M_HEADER_5;	}	else if (caseless_equal(str, MT_HEADER_6))	{		type = M_HEADER_6;	}	else if (caseless_equal(str, MT_ADDRESS))	{		type = M_ADDRESS;	}	else if (caseless_equal(str, MT_PLAIN_TEXT))	{		type = M_PLAIN_TEXT;	}	else if (caseless_equal(str, MT_LISTING_TEXT))	{		type = M_LISTING_TEXT;	}	else if (caseless_equal(str, MT_PLAIN_FILE))	{		type = M_PLAIN_FILE;	}	else if (caseless_equal(str, MT_PARAGRAPH))	{		type = M_PARAGRAPH;	}	else if (caseless_equal(str, MT_UNUM_LIST))	{		type = M_UNUM_LIST;	}	else if (caseless_equal(str, MT_NUM_LIST))	{		type = M_NUM_LIST;	}	else if (caseless_equal(str, MT_MENU))	{		type = M_MENU;	}	else if (caseless_equal(str, MT_DIRECTORY))	{		type = M_DIRECTORY;	}	else if (caseless_equal(str, MT_LIST_ITEM))	{		type = M_LIST_ITEM;	}	else if (caseless_equal(str, MT_DESC_LIST))	{		type = M_DESC_LIST;	}	else if (caseless_equal(str, MT_DESC_TITLE))	{		type = M_DESC_TITLE;	}	else if (caseless_equal(str, MT_DESC_TEXT))	
{		type = M_DESC_TEXT;	}	else if (caseless_equal(str, MT_PREFORMAT))	{		type = M_PREFORMAT;	}	else if (caseless_equal(str, MT_BLOCKQUOTE))	{		type = M_BLOCKQUOTE;	}	else if (caseless_equal(str, MT_INDEX))	{		type = M_INDEX;	}	else if (caseless_equal(str, MT_HRULE))	{		type = M_HRULE;	}	else if (caseless_equal(str, MT_BASE))	{		type = M_BASE;	}	else if (caseless_equal(str, MT_LINEBREAK))	{		type = M_LINEBREAK;	}	else if (caseless_equal(str, MT_IMAGE))	{		type = M_IMAGE;	}	else if (caseless_equal(str, MT_FIGURE))	{		type = M_FIGURE;	}	else if (caseless_equal(str, MT_SELECT))	{		type = M_SELECT;	}	else if (caseless_equal(str, MT_OPTION))	{		type = M_OPTION;	}	else if (caseless_equal(str, MT_INPUT))	{		type = M_INPUT;	}	else if (caseless_equal(str, MT_TEXTAREA))	{		type = M_TEXTAREA;	}	else if (caseless_equal(str, MT_FORM))	{		type = M_FORM;	}/*amb*/        else if (caseless_equal(str, MT_SUP))        {                type = M_SUP;        }        else if (caseless_equal(str, MT_SUB))        {                type = M_SUB;        }	else if (caseless_equal(str, MT_DOC_HEAD))        {	        type = M_DOC_HEAD;        }	else if (caseless_equal(str, MT_UNDERLINED))        {	        type = M_UNDERLINED;        }	else if (caseless_equal(str, MT_DOC_BODY))        {	        type = M_DOC_BODY;        }	else if (caseless_equal(str, MT_TABLE))	{		type = M_TABLE;	}	else if (caseless_equal(str, MT_CAPTION))	{		type = M_CAPTION;	}	else if (caseless_equal(str, MT_TABLE_ROW))	{		type = M_TABLE_ROW;	}	else if (caseless_equal(str, MT_TABLE_HEADER))	{		type = M_TABLE_HEADER;	}	else if (caseless_equal(str, MT_TABLE_DATA))	{		type = M_TABLE_DATA;	}	else if (caseless_equal(str, MT_MAP))	{		type=M_MAP;	}	else if (caseless_equal(str, MT_META))	{	        type=M_META;        }	else	{#ifdef VERBOSE	        errorlog("warning: unknown mark (%s)\n", str);#endif		type = M_UNKNOWN;	}	*tptr = tchar;	return(type);}/* * Parse a single anchor tag.  
ptrp is a pointer to a pointer to the * string to be parsed.  On return, the ptr should be changed to * point to after the text we have parsed. * On return start and end should point to the beginning, and just * after the end of the tag's name in the original anchor string. * Finally the function returns the tag value in a malloced buffer. */char *AnchorTag(ptrp, startp, endp)	char **ptrp;	char **startp;	char **endp;{	char *tag_val;	char *ptr;	char *start;	char tchar;	int quoted;	int has_value;	quoted = 0;	/*	 * remove leading spaces, and set start	 */	ptr = *ptrp;	while (isspace((int)*ptr))	{		ptr++;	}	*startp = ptr;	/*	 * Find and set the end of the tag	 */	while ((!isspace((int)*ptr))&&(*ptr != '=')&&(*ptr != '\0'))	{		ptr++;	}	*endp = ptr;        has_value=0;	if (*ptr == '\0')	{		*ptrp = ptr;/*		return(NULL);*/                    /* try to handle <A NAME=blah></A> correctly -bjs*/	} else {	/*	 * Move to the start of the tag value, if there is one.	 */            while ((isspace((int)*ptr))||(*ptr == '='))            {		if (*ptr == '=')		{                    has_value = 1;		}		ptr++;            }        }	/*	 * For a tag with no value, this is a boolean flag.	 * Return the string "1" so we know the tag is there.	 */	if (!has_value)	{		*ptrp = *endp;		/*		 * set a tag value of 1.		 */		tag_val = (char *)malloc(strlen("1") + 1);		if (tag_val == NULL)		{		        errorlog(stderr, "can't malloc space for tag value\n");			return(NULL);		}		strcpy(tag_val, "1");		return(tag_val);	}	if (*ptr == '\"')	{		quoted = 1;		ptr++;	}	start = ptr;	/*	 * Get tag value.  
Either a quoted string or a single word	 */	if (quoted)	{		while ((*ptr != '\"')&&(*ptr != '\0'))		{			ptr++;		}	}	else	{		while ((!isspace((int)*ptr))&&(*ptr != '\0'))		{			ptr++;		}	}/* amb - everyone forgets the end quotes on anchor   attributes, so we'll let it slide *//*	if ((quoted)&&(*ptr == '\0'))	{		*ptrp = ptr;		return(NULL);	}*/	/*	 * Copy the tag value out into a malloced string	 */	tchar = *ptr;	*ptr = '\0';	tag_val = (char *)malloc(strlen(start) + 1);	if (tag_val == NULL)	{       		errorlog("can't malloc space for tag value\n");		*ptr = tchar;		*ptrp = ptr;		return(NULL);	}	strcpy(tag_val, start);	*ptr = tchar;	/* If you forgot the end quote, you need to make sure you aren't		indexing ptr past the end of its own array -- SWP */	if (quoted && *ptr!='\0')	{		ptr++;	}	*ptrp = ptr;	return(tag_val);}/* * Parse mark text for the value associated with the * passed mark tag. * If the passed tag is not found, return NULL. * If the passed tag is found but has no value, return "". */char* ParseMarkTag(text, mtext, mtag)	char *text;	char *mtext;	char *mtag;{	char *ptr;	char *start;	char *end;	char *tag_val;	char tchar;	if ((text == NULL)||(mtext == NULL)||(mtag == NULL))	{		return(NULL);	}	ptr = (char *)(text + strlen(mtext));	while (*ptr != '\0')	{		tag_val = AnchorTag(&ptr, &start, &end);		tchar = *end;		*end = '\0';		if (caseless_equal(start, mtag))		{			*end = tchar;			if (tag_val == NULL)			{				tag_val = (char *)malloc(1);				*tag_val = '\0';				return(tag_val);			}			else			{				return(tag_val);			}		}		*end = tchar;		if (tag_val != NULL)		{			free(tag_val);		}	}	return(NULL);}/* HTMLlists.c *//* * Code to manage a linked list of parsed HTML objects generated * from a raw text file. * Also code to manage a linked list of formatted elements that * make up a page of a formatted document. *//* * Free up the passed linked list of parsed elements, freeing * all memory associates with each element. 
*/
static void
FreeObjList(List)
	struct mark_up *List;
{
	struct mark_up *node;
	struct mark_up *following;

	for (node = List; node != NULL; node = following) {
		/* remember the successor before this node is released */
		following = node->next;
		node->next = NULL;

		/* free(NULL) does nothing, so unset members need no test */
		free((char *) node->start);
		free((char *) node->text);
		free((char *) node->end);
		free((char *) node);
	}
}

/*
 * Append an object to the parsed object list.
 *
 * listp is the address of the list head, current is the present tail
 * of the list, and mark is the object to append.  Returns the new
 * tail: mark when it was appended, otherwise current unchanged.
 *
 * A NULL mark is ignored.  When keep_wsp is zero, an M_NONE (normal
 * text) object whose text is NULL or empty is freed instead of being
 * appended.  Text made up only of white space is still appended,
 * because white space between tags is significant.
 */
static struct mark_up *
AddObj(listp, current, mark, keep_wsp)
	struct mark_up **listp;
	struct mark_up *current;
	struct mark_up *mark;
	int keep_wsp;
{
	if (mark == NULL) {
		return (current);
	}

	/* Discard normal text objects that carry no text at all. */
	if ((!keep_wsp) && (mark->type == M_NONE)) {
		char *body;

		body = mark->text;
		if ((body == NULL) || (*body == '\0')) {
			free(body);
			free((char *) mark);
			return (current);
		}
	}

	/*
	 * The object becomes the head of an empty list, or is linked
	 * after the current tail; either way it is the new tail.
	 */
	if (*listp == NULL) {
		*listp = mark;
	} else {
		current->next = mark;
	}
	mark->next = NULL;

	return (mark);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -