⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rast.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/* rast.c
   Translate sgmls output to RAST result format.

   Written by James Clark (jjc@jclark.com). */

#include "config.h"
#include "std.h"
#include "sgmls.h"
#include "getopt.h"

/* P() wraps a prototype parameter list so that it disappears when
   USE_PROTOTYPES is not defined. */
#ifdef USE_PROTOTYPES
#define P(parms) parms
#else
#define P(parms) ()
#endif

#ifdef __GNUC__
#define NO_RETURN volatile
#else
#define NO_RETURN /* as nothing */
#endif

/* VP() is like P() but for variadic functions: with VARARGS the
   declaration never carries a parameter list. */
#ifdef VARARGS
#define VP(parms) ()
#else
#define VP(parms) P(parms)
#endif

#ifdef USE_ISASCII
#define ISASCII(c) isascii(c)
#else
#define ISASCII(c) (1)
#endif

/* Number of printable characters written on one delimited output line
   before the line is closed and a new one started. */
#define OUTPUT_LINE_WIDTH 60

NO_RETURN void error VP((char *,...));

static void input_error P((int, char *, unsigned long));
static int do_file P((FILE *));
static void usage P((void));
static void init_sort_code P((void));
static void output_processing_instruction P((char *, unsigned));
static void output_data P((struct sgmls_data *, int));
static void output_data_lines P((char *, unsigned));
static void output_internal_sdata P((char *, unsigned));
static void output_external_entity P((struct sgmls_external_entity *));
static void output_external_entity_info P((struct sgmls_external_entity *));
static void output_element_start P((char *, struct sgmls_attribute *));
static void output_element_end P((char *));
static void output_attribute P((struct sgmls_attribute *));
static void output_attribute_list P((struct sgmls_attribute *));
static void output_tokens P((char **, int));
static void output_markup_chars P((char *, unsigned));
static void output_markup_string P((char *));
static void output_char P((int, int));
static void output_flush P((int));
static void output_external_id P((char *, char *));
static void output_entity P((struct sgmls_entity *));
static void output_internal_entity P((struct sgmls_internal_entity *));

/* Don't use a prototype here to avoid problems with qsort. */
static int compare_attributes();

/* Close a pending line of markup characters (delimiter '!') or of data
   characters (delimiter '|'). */
#define output_flush_markup() output_flush('!')
#define output_flush_data() output_flush('|')

/* Stream that all translated output is written to. */
static FILE *outfp;
/* Number of characters written so far on the current delimited line;
   0 means no line is open. */
static int char_count = 0;
/* argv[0], used as a prefix for diagnostics. */
static char *program_name;
/* Collation weight of each character, filled in by init_sort_code(). */
static short sort_code[256];
/* Scratch array of attribute pointers reused by output_attribute_list(),
   together with its allocated length in elements. */
static struct sgmls_attribute **attribute_vector = 0;
static int attribute_vector_length = 0;

/* Program entry point.

   Usage: program [-o output_file] [input_file]

   Input is read from input_file if given, otherwise from standard input.
   Output goes to output_file if -o is given; otherwise it is collected in
   a temporary file and copied to standard output only after do_file()
   has succeeded.  If do_file() fails, the collected output is dropped and
   the single line "#ERROR" is written instead (to output_file, reopened
   for writing on stdout, or to standard output).  */

int main(argc, argv)
     int argc;
     char **argv;
{
  int c;
  int opt;
  char *output_file = 0;

  program_name = argv[0];

  while ((opt = getopt(argc, argv, "o:")) != EOF)
    switch (opt) {
    case 'o':
      output_file = optarg;
      break;
    case '?':
      usage();
      /* usage() calls exit(), so control does not continue from here. */
    default:
      abort();
    }

  if (output_file) {
    errno = 0;
    outfp = fopen(output_file, "w");
    if (!outfp)
      error("couldn't open `%s' for output: %s",
	    output_file, strerror(errno));
  }
  else {
    errno = 0;
    outfp = tmpfile();
    if (!outfp)
      error("couldn't create temporary file: %s", strerror(errno));
  }

  if (argc - optind > 1)
    usage();

  if (argc - optind == 1) {
    errno = 0;
    if (!freopen(argv[optind], "r", stdin))
      error("couldn't open `%s' for input: %s",
	    argv[optind], strerror(errno));
  }

  (void)sgmls_set_errhandler(input_error);

  init_sort_code();

  if (!do_file(stdin)) {
    fclose(outfp);
    if (output_file) {
      /* Reopening with "w" truncates whatever was written so far. */
      errno = 0;
      if (!freopen(output_file, "w", stdout))
	error("couldn't reopen `%s' for output: %s",
	      output_file, strerror(errno));
    }
    fputs("#ERROR\n", stdout);
    exit(EXIT_FAILURE);
  }

  if (output_file) {
    errno = 0;
    if (fclose(outfp) == EOF)
      error("error closing `%s': %s", output_file, strerror(errno));
  }
  else {
    errno = 0;
    if (fseek(outfp, 0L, SEEK_SET))
      error("couldn't rewind temporary file: %s", strerror(errno));
    while ((c = getc(outfp)) != EOF)
      if (putchar(c) == EOF)
	error("error writing standard output: %s", strerror(errno));
    /* getc() returns EOF for read errors as well as end of file. */
    if (ferror(outfp))
      error("error reading temporary file: %s", strerror(errno));
    if (fflush(stdout) == EOF)
      error("error writing standard output: %s", strerror(errno));
  }

  exit(EXIT_SUCCESS);
}

/* Print a usage message on stderr and exit with EXIT_FAILURE. */

static
void usage()
{
  fprintf(stderr, "usage: %s [-o output_file] [input_file]\n", program_name);
  exit(EXIT_FAILURE);
}

/* Fill in sort_code[].  The characters listed in `print' receive the
   weights 0, 1, 2, ... in the order listed; every other character value
   i receives the weight i + 128, which is larger than the weight of any
   listed character.  */

static
void init_sort_code()
{
  int i;
  static char print[] =
"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";

  for (i = 0; i < 256; i++)
    sort_code[i] = i + 128;
  for (i = 0; print[i]; i++)
    sort_code[(unsigned char)print[i]] = i;
}

/* Read sgmls events from fp and write the translation of each to outfp.
   Returns non-zero if a SGMLS_EVENT_CONFORMING event was seen, and 0
   otherwise (including when the input ends inside a subdocument).  */

static
int do_file(fp)
     FILE *fp;
{
  struct sgmls *sp;
  struct sgmls_event e;
  int conforming = 0;

  sp = sgmls_create(fp);
  while (sgmls_next(sp, &e))
    switch (e.type) {
    case SGMLS_EVENT_DATA:
      output_data(e.u.data.v, e.u.data.n);
      break;
    case SGMLS_EVENT_ENTITY:
      output_external_entity(e.u.entity);
      break;
    case SGMLS_EVENT_PI:
      output_processing_instruction(e.u.pi.s, e.u.pi.len);
      break;
    case SGMLS_EVENT_START:
      output_element_start(e.u.start.gi, e.u.start.attributes);
      sgmls_free_attributes(e.u.start.attributes);
      break;
    case SGMLS_EVENT_END:
      output_element_end(e.u.end.gi);
      break;
    case SGMLS_EVENT_SUBSTART:
      {
	/* Only the subdocument entity itself is reported; every event up
	   to the matching SUBEND is read and discarded. */
	int level = 1;
	output_external_entity(e.u.entity);
	while (level > 0) {
	  if (!sgmls_next(sp, &e)) {
	    /* Input ended inside the subdocument. */
	    sgmls_free(sp);
	    return 0;
	  }
	  switch (e.type) {
	  case SGMLS_EVENT_SUBSTART:
	    level++;
	    break;
	  case SGMLS_EVENT_SUBEND:
	    level--;
	    break;
	  case SGMLS_EVENT_START:
	    sgmls_free_attributes(e.u.start.attributes);
	    break;
	  default:
	    /* prevent compiler warnings */
	    break;
	  }
	}
      }
      break;
    case SGMLS_EVENT_APPINFO:
      break;
    case SGMLS_EVENT_CONFORMING:
      conforming = 1;
      break;
    default:
      abort();
    }
  sgmls_free(sp);
  return conforming;
}

/* Write a processing instruction: "[?", then (if len > 0) a newline and
   the len characters at s as data lines, then "]" and a newline.  */

static
void output_processing_instruction(s, len)
     char *s;
     unsigned len;
{
  fputs("[?", outfp);
  if (len > 0) {
    putc('\n', outfp);
    output_data_lines(s, len);
    output_flush_data();
  }
  fputs("]\n", outfp);
}

/* Write the n data chunks in v.  SDATA chunks go through
   output_internal_sdata(); other chunks are written as data lines,
   and empty ones are skipped.  */

static
void output_data(v, n)
     struct sgmls_data *v;
     int n;
{
  int i;

  for (i = 0; i < n; i++) {
    if (v[i].is_sdata)
      output_internal_sdata(v[i].s, v[i].len);
    else if (v[i].len > 0)
      output_data_lines(v[i].s, v[i].len);
  }
}

/* Write the n characters at s as '|'-delimited data and close the last
   line.  n must be greater than 0.  */

static
void output_data_lines(s, n)
     char *s;
     unsigned n;
{
  assert(n > 0);
  for (; n > 0; --n)
    output_char((unsigned char)*s++, '|');
  output_flush_data();
}

/* Write the n characters at s as markup between "#SDATA-TEXT" and
   "#END-SDATA" lines.  */

static
void output_internal_sdata(s, n)
     char *s;
     unsigned n;
{
  fputs("#SDATA-TEXT\n", outfp);
  output_markup_chars(s, n);
  output_flush_markup();
  fputs("#END-SDATA\n", outfp);
}

/* Write a reference to external entity e: "[&name", its description
   as produced by output_external_entity_info(), and a closing "]".  */

static
void output_external_entity(e)
     struct sgmls_external_entity *e;
{
  fprintf(outfp, "[&%s\n", e->name);
  output_external_entity_info(e);
  fputs("]\n", outfp);
}

/* Write the start of element gi: "[gi", followed by a newline and the
   sorted attribute list when att is non-null, then "]".  */

static
void output_element_start(gi, att)
     char *gi;
     struct sgmls_attribute *att;
{
  fprintf(outfp, "[%s", gi);
  if (att) {
    putc('\n', outfp);
    output_attribute_list(att);
  }
  fputs("]\n", outfp);
}

/* Write the end of element gi as "[/gi]".  */

static
void output_element_end(gi)
     char *gi;
{
  fprintf(outfp, "[/%s]\n", gi);
}

/* Write every attribute in the linked list att, ordered by name as
   defined by compare_attributes().  att may be null, in which case
   nothing is written.  The list itself is not modified; the ordering is
   done on the file-level scratch array attribute_vector, which is grown
   as needed and never freed.  */

static
void output_attribute_list(att)
     struct sgmls_attribute *att;
{
  struct sgmls_attribute *p;
  int n = 0;
  int i;

  for (p = att; p; p = p->next)
    n++;
  /* Nothing to sort or print; this also avoids handing qsort() a null
     array when no attribute list has been seen yet. */
  if (n == 0)
    return;
  if (attribute_vector_length < n) {
    if (attribute_vector_length == 0)
      attribute_vector
	= (struct sgmls_attribute **)malloc(n*sizeof(*attribute_vector));
    else
      attribute_vector
	= (struct sgmls_attribute **)realloc((UNIV)attribute_vector,
					     n*sizeof(*attribute_vector));
    attribute_vector_length = n;
    if (!attribute_vector)
      error("Out of memory");
  }
  i = 0;
  for (p = att; p; p = p->next)
    attribute_vector[i++] = p;
  qsort(attribute_vector, n, sizeof(attribute_vector[0]), compare_attributes);
  for (i = 0; i < n; i++)
    output_attribute(attribute_vector[i]);
}

/* qsort() comparison function for elements of attribute_vector.
   p1 and p2 each point to a `struct sgmls_attribute *'.  Names are
   compared character by character using the weights in sort_code[];
   when one name is a prefix of the other, the shorter name sorts
   first.  */

static
int compare_attributes(p1, p2)
     UNIV p1, p2;
{
  char *s1 = (*(struct sgmls_attribute **)p1)->name;
  char *s2 = (*(struct sgmls_attribute **)p2)->name;

  for (; *s1 && *s2; s1++, s2++)
    if (*s1 != *s2)
      return sort_code[(unsigned char)*s1] - sort_code[(unsigned char)*s2];

  if (*s1)
    return 1;
  else if (*s2)
    return -1;
  else
    return 0;
}

/* Write one attribute: a "name=" line followed by a representation of
   the value that depends on the attribute's type.  */

static
void output_attribute(p)
     struct sgmls_attribute *p;
{
  fprintf(outfp, "%s=\n", p->name);
  switch (p->type) {
  case SGMLS_ATTR_IMPLIED:
    fputs("#IMPLIED\n", outfp);
    break;
  case SGMLS_ATTR_CDATA:
    {
      struct sgmls_data *v = p->value.data.v;
      int n = p->value.data.n;
      int i;
      for (i = 0; i < n; i++)
	if (v[i].is_sdata)
	  output_internal_sdata(v[i].s, v[i].len);
	else {
	  output_markup_chars(v[i].s, v[i].len);
	  output_flush_markup();
	}
    }
    break;
  case SGMLS_ATTR_TOKEN:
    output_tokens(p->value.token.v, p->value.token.n);
    break;
  case SGMLS_ATTR_ENTITY:
    {
      int i;
      /* First the space-separated entity names ... */
      for (i = 0; i < p->value.entity.n; i++) {
	struct sgmls_entity *e = p->value.entity.v[i];
	char *name;

	if (e->is_internal)
	  name = e->u.internal.name;
	else
	  name = e->u.external.name;
	if (i > 0)
	  output_markup_string(" ");
	output_markup_string(name);
      }
      output_flush_markup();
      /* ... then the description of each entity in the same order. */
      for (i = 0; i < p->value.entity.n; i++)
	output_entity(p->value.entity.v[i]);
    }
    break;
  case SGMLS_ATTR_NOTATION:
    output_tokens(&p->value.notation->name, 1);
    output_external_id(p->value.notation->pubid, p->value.notation->sysid);
    break;
  }
}

/* Write the n strings in v as markup, separated by single spaces, and
   close the line.  n must be greater than 0.  */

static
void output_tokens(v, n)
     char **v;
     int n;
{
  int i;

  assert(n > 0);
  output_markup_string(v[0]);
  for (i = 1; i < n; i++) {
    output_markup_string(" ");
    output_markup_string(v[i]);
  }
  output_flush_markup();
}

/* Write the n characters at s as '!'-delimited markup.  The current
   line is left open; the caller is expected to flush it.  */

static
void output_markup_chars(s, n)
     char *s;
     unsigned n;
{
  for (; n > 0; --n)
    output_char((unsigned char)*s++, '!');
}

/* Like output_markup_chars(), but for a null-terminated string.  */

static
void output_markup_string(s)
     char *s;
{
  while (*s)
    output_char((unsigned char)*s++, '!');
}

/* Write the character c on a line delimited by delim.

   A printable character is appended to the current line, which is opened
   with delim if necessary and closed with delim and a newline once it
   holds OUTPUT_LINE_WIDTH characters.  Any other character closes the
   current line and is written on a line of its own: "#RE", "#RS" or
   "#TAB" for RECHAR, RSCHAR or TABCHAR, and "#" followed by the decimal
   character code otherwise.  */

static
void output_char(c, delim)
     int c;
     int delim;
{
  if (ISASCII(c) && isprint(c)) {
    if (char_count == 0)
      putc(delim, outfp);
    putc(c, outfp);
    char_count++;
    if (char_count == OUTPUT_LINE_WIDTH) {
      putc(delim, outfp);
      putc('\n', outfp);
      char_count = 0;
    }
  }
  else {
    output_flush(delim);
    switch (c) {
    case RECHAR:
      fputs("#RE\n", outfp);
      break;
    case RSCHAR:
      fputs("#RS\n", outfp);
      break;
    case TABCHAR:
      fputs("#TAB\n", outfp);
      break;
    default:
      fprintf(outfp, "#%d\n", c);
      break;
    }
  }
}

/* If a line is currently open, close it with delim and a newline.  */

static
void output_flush(delim)
     int delim;
{
  if (char_count > 0) {
    putc(delim, outfp);
    putc('\n', outfp);
    char_count = 0;
  }
}

/* Write an external identifier.  When both pubid and sysid are null the
   output is "#SYSTEM" followed by "#NONE".  Otherwise each identifier
   that is non-null is written as a "#PUBLIC" or "#SYSTEM" line followed
   by its text as markup, or by "#EMPTY" if the text is the empty
   string.  */

static
void output_external_id(pubid, sysid)
  char *pubid;
  char *sysid;
{
  if (!pubid && !sysid)
    fputs("#SYSTEM\n#NONE\n", outfp);
  else {
    if (pubid) {
      fputs("#PUBLIC\n", outfp);
      if (*pubid) {
	output_markup_string(pubid);
	output_flush_markup();
      }
      else
	fputs("#EMPTY\n", outfp);
    }
    if (sysid) {
      fputs("#SYSTEM\n", outfp);
      if (*sysid) {
	output_markup_string(sysid);
	output_flush_markup();
      }
      else
	fputs("#EMPTY\n", outfp);
    }
  }
}

/* Write the description of entity e (internal or external) followed by
   "#END-ENTITY".  The trailing newline is omitted when ASIS is
   defined.  */

static
void output_entity(e)
     struct sgmls_entity *e;
{
  if (e->is_internal)
    output_internal_entity(&e->u.internal);
  else
    output_external_entity_info(&e->u.external);
  fputs("#END-ENTITY", outfp);
#ifndef ASIS
  putc('\n', outfp);
#endif
}

/* Write the description of external entity e: a line naming its type,
   its external identifier and, for every type other than
   SGMLS_ENTITY_SUBDOC, its notation name, the notation's external
   identifier and the entity's attributes.  */

static
void output_external_entity_info(e)
     struct sgmls_external_entity *e;
{
  switch (e->type) {
  case SGMLS_ENTITY_CDATA:
    fputs("#CDATA-EXTERNAL", outfp);
    break;
  case SGMLS_ENTITY_SDATA:
    fputs("#SDATA-EXTERNAL", outfp);
    break;
  case SGMLS_ENTITY_NDATA:
    fputs("#NDATA-EXTERNAL", outfp);
    break;
  case SGMLS_ENTITY_SUBDOC:
    fputs("#SUBDOC", outfp);
    break;
  }
  putc('\n', outfp);
  output_external_id(e->pubid, e->sysid);
  if (e->type != SGMLS_ENTITY_SUBDOC) {
    fprintf(outfp, "#NOTATION=%s\n", e->notation->name);
    output_external_id(e->notation->pubid, e->notation->sysid);
    output_attribute_list(e->attributes);
  }
}

/* Write the description of internal entity e: "#SDATA-INTERNAL" or
   "#CDATA-INTERNAL" followed by the entity text as markup.  */

static
void output_internal_entity(e)
     struct sgmls_internal_entity *e;
{
  if (e->data.is_sdata)
    fputs("#SDATA-INTERNAL", outfp);
  else
    fputs("#CDATA-INTERNAL", outfp);
  putc('\n', outfp);
  output_markup_chars(e->data.s, e->data.len);
  output_flush_markup();
}

/* Error handler installed with sgmls_set_errhandler().  Reports str
   together with the input line number via error(), which exits; num is
   not used.  */

static
void input_error(num, str, lineno)
     int num;
     char *str;
     unsigned long lineno;
{
  error("Error at input line %lu: %s", lineno, str);
}

/* Print "program_name: " followed by the printf-style message and a
   newline on stderr, then exit with EXIT_FAILURE.  Does not return.  */

NO_RETURN
#ifdef VARARGS
void error(va_alist)
     va_dcl
#else
void error(char *message,...)
#endif
{
#ifdef VARARGS
     char *message;
#endif
     va_list ap;

     fprintf(stderr, "%s: ", program_name);
#ifdef VARARGS
     va_start(ap);
     message = va_arg(ap, char *);
#else
     va_start(ap, message);
#endif
     vfprintf(stderr, message, ap);
     va_end(ap);
     fputc('\n', stderr);
     fflush(stderr);
     exit(EXIT_FAILURE);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -