⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.h

📁 网络爬虫程序
💻 H
字号:
/***************************************************************************//*    This code is part of WWW grabber called pavuk                        *//*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          *//*    Distributed under GPL 2 or later                                     *//***************************************************************************/#ifndef _htmlparser_h_#define _htmlparser_h_#include "dlhash.h"#include "dllist.h"#include "html.h"#include "url.h"typedef struct{  int rewrite;                  /* == TRUE -> wi will adjust content */  int purestyle;                /* == TRUE if document is text/css */  int purescript;               /* == TRUE if document is script */  url *doc_url;                 /* source URL of document */  ssize_t in_size;              /* size of HTML document */  char *in_content;             /* content of HTML document */  int in_offset;                /* current offset in HTML document */  ssize_t aout_size;            /* allocated size for adjusted HTML */  char *out_content;            /* content of adjusted HTML document */  int out_offset;               /* current offset in adj. HTML doc */  char *base;                   /* current BASE of HTML document */  char *baset;                  /* BASE of document derived from URL */  int stack_size;               /* allocated size of processing stack */  int stack_offset;             /* current ofset in stack */  char *stack;                  /* stack for buffering tags of HTML doc */  char *tag_attrib;             /* content of currently processed tag */  dlhash *tag_hash;             /* hash for speedup lookup of tags */  html_tag_t *current_tag;      /* tag of currently processed tag */  html_tag_atrib_t *current_attrib;     /* current attrib of current tag */  dllist *tag_funcs;            /* func chain for processing tags */  dllist *attrib_funcs;         /* func chain for processing tag attributes */  dllist *style_funcs;          /* func chain for procesing CSS */  dllist *script_funcs;         /* func chain for procesing scripts */} html_parser_t;typedef void (*html_parser_func_t) (html_parser_t *, char *, void *);/***********************************//* structure of this type holds    *//* infos about functions in chain  *//***********************************/typedef struct{  html_parser_func_t func;  void *data;} html_parser_func_info_t;/***********************************//* struct for passing informations *//* to/from URL extracting routine  *//***********************************/typedef struct{  int no_limits;  int only_inline;  int enable_js;  dllist *urls;  dllist *prev_a;} html_extract_info_t;/***********************************//* struct for passing informations *//* to/from URL rewriting routine   *//***********************************/typedef struct{  int all_to_local;  int all_to_remote;  int selected_to_local;  int store_index;  html_extract_info_t *einfo;} html_rewrite_info_t;/***********************************//* struct for passing informations *//* to/from URL changing routine    *//***********************************/typedef struct{  url *url_old;  char *url_new;} html_change_info_t;/***********************************//* struct for getting informations *//* from META Robots parser         *//***********************************/typedef struct{  int index;  int follow;  int images;} html_robots_info_t;/******************************************//* macros for safe handling of expandable *//* memory chunks required in HTML parser  *//******************************************/#define html_parser_FENDER      2048#define html_parser_MEXPAND(hpinfo, sv) \{ \  int psize = 100 + (sv) + hpinfo->in_size \    + hpinfo->out_offset - hpinfo->in_offset; \  if(hpinfo->aout_size < psize) \  { \    hpinfo->aout_size = psize + html_parser_FENDER; \    hpinfo->out_content = \      _realloc(hpinfo->out_content, hpinfo->aout_size); \  } \}#define html_parser_SEXPAND(hpinfo, sv) \  if((hpinfo->stack_size - (hpinfo->stack_offset \    + (sv) + html_parser_FENDER)) < 0)\  { \    hpinfo->stack_size += html_parser_FENDER + sv; \    hpinfo->stack = _realloc(hpinfo->stack, hpinfo->stack_size); \  }#define html_parser_SEND(hpinfo) \  while(hpinfo->stack[hpinfo->stack_offset]) \    hpinfo->stack_offset++;extern html_parser_t *html_parser_init(html_tag_t *, int, int, int, int);extern void html_parser_kill(html_parser_t *);extern void html_parser_do_cleanup(void);extern void html_parser_add_tag_func(html_parser_t *, html_parser_func_t,  void *);extern void html_parser_add_attrib_func(html_parser_t *, html_parser_func_t,  void *);extern void html_parser_add_style_func(html_parser_t *, html_parser_func_t,  void *);extern void html_parser_add_script_func(html_parser_t *, html_parser_func_t,  void *);extern void html_parser_set_document(html_parser_t *, url *, char *, ssize_t);extern void html_parser_take_document(html_parser_t *, char **, ssize_t *);extern void html_parser_set_base(html_parser_t *, char *, char *);extern void html_parser_parse(html_parser_t *);extern void html_parser_parse_tag(html_parser_t *, char *, void *);extern void html_parser_parse_tag_slash_a(html_parser_t *, char *,  html_extract_info_t *);extern void html_parser_parse_tag_meta_refresh(html_parser_t *, char *,  void *);extern void html_parser_parse_tag_meta_robots(html_parser_t *, char *,  html_robots_info_t *);extern void html_parser_parse_tag_jstransform(html_parser_t *, char *,  void *);extern void html_parser_url_to_absolute_url(html_parser_t *, char *, void *);extern void html_parser_remove_advertisement(html_parser_t *, char *, void *);extern void html_parser_get_url(html_parser_t *, char *,  html_extract_info_t *);extern void html_parser_process_base(html_parser_t *, char *, void *);extern void html_parser_process_form(html_parser_t *, char *, dllist **);extern void html_parser_url_to_local(html_parser_t *, char *,  html_rewrite_info_t *);extern void html_parser_change_url(html_parser_t *, char *,  html_change_info_t *);extern void html_parser_get_style_urls(html_parser_t *, char *,  html_extract_info_t *);extern void html_parser_style_to_absolute_urls(html_parser_t *, char *,  void *);extern void html_parser_style_to_local_urls(html_parser_t *, char *,  html_rewrite_info_t *);extern void html_parser_style_change_url(html_parser_t *, char *,  html_change_info_t *);extern void html_parser_parse_jspatterns(html_parser_t *, char *, void *);extern void html_parser_parse_body_jspatterns(html_parser_t *, char *,  void *);extern void html_parser_parse_body_jstransform(html_parser_t *, char *,  void *);#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -