📄 html-lax.sum.c
字号:
static char rcsid[] = "HTML-lax.sum.c,v 1.4 1996/01/16 08:44:27 duane Exp";/* * HTML-lax.sum.c - Non-strict HTML summarizer * * Usage: HTML-lax.sum [--url-only | --text-only] --body-text filename * * Outputs SOIF * * DEBUG: * * Darren Hardy, hardy@cs.colorado.edu, April 1994 * Duane Wessels, wessels@cs.colorado.edu, October 1995 * * ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include "HTML.h"#include "util.h"#include "template.h"#if 0 /* kjl/7mar2002 *//* Global */char *Url = NULL;#endif/* Local Variables */static int intype[64];static Buffer *citations, *keywords, *title, *headings, *author, *urls, *body_text; /* hw headings added */static int url_only = 0;static int text_only = 0;static int html_body_text = 0;static void usage(){ fprintf(stderr, "Usage: HTML-lax.sum [--url-only | --text-only] --body-text filename\n"); exit(1);}/* * read_file() - Reads the file fp into memory and returns a pointer to it. */Buffer *read_file(fp)FILE *fp;{ static Buffer *b; char buf[BUFSIZ]; int nread; b = create_buffer(BUFSIZ); while ((nread = fread(buf, 1, BUFSIZ, fp)) > 0) add_buffer(b, buf, nread); return(b);}/* * process_anchor() - Extracts the URL from the anchor href tag. */void process_anchor(s)char *s;{ char *p, *q, *tmps = s; while ((p = strchr(tmps, '=')) != NULL) { /* Check to see if there is a ABCD= */ if (p - 4 <= tmps) { tmps = ++p; continue; } if (strncasecmp(p-4, "href", 4) != 0) { /* href? */ tmps = ++p; continue; } p++; /* skip '=' */ while (isspace(*p) || (*p == '\"')) p++; /* skip space '"'s */ q = strdup(p); /* copy URL */ if ((p = strchr(q, '\"')) != NULL) /* terminate string */ *p = '\0'; if ((p = strchr(q, ' ')) != NULL) /* terminate string */ *p = '\0'; add_buffer(urls, q, strlen(q)); /* Add URL to urls */ add_buffer(urls, "\n", 1); xfree(q); return; }}void print_node(mp)struct mark_up *mp;{ printf("mp->type: %d\n", mp->type); printf("mp->is_end: %d\n", mp->is_end); printf("mp->text: %s\n", mp->text); printf("mp->start: %s\n", mp->start); printf("mp->end: %s\n", mp->end); printf("\n"); fflush(stdout);}void process_node(mp)struct mark_up *mp;{ char *tptr; int len; if (mp->type < 0) return; if (mp->is_end) { intype[mp->type]--; return; } else intype[mp->type]++; if (mp->text && strlen(mp->text) < 2) return; if (mp->start && strlen(mp->start) < 6) return; switch (mp->type) { case M_NONE: if (intype[M_TITLE]) { add_buffer(title, mp->text, strlen(mp->text)); add_buffer(title, "\n", 1); } if (intype[M_STRONG]) { add_buffer(keywords, mp->text, strlen(mp->text)); add_buffer(keywords, "\n", 1); } if (intype[M_CITATION]) { add_buffer(citations, mp->text, strlen(mp->text)); add_buffer(citations, "\n", 1); } if (intype[M_ANCHOR]) { add_buffer(keywords, mp->text, strlen(mp->text)); add_buffer(keywords, "\n", 1); } if (intype[M_ADDRESS]) { add_buffer(author, mp->text, strlen(mp->text)); add_buffer(author, "\n", 1); } /* hw h1-h3 are now stored as headings */ if (intype[M_HEADER_1] || intype[M_HEADER_2] || intype[M_HEADER_3]) { tptr = strdup (mp->text); if (tptr) { clean_white_space (tptr); len = strlen (tptr); if (len > 2) { add_buffer(headings, tptr, len); add_buffer(headings, "\n", 1); } free (tptr); } } if (html_body_text && mp->text) { tptr = strdup (mp->text); if (tptr) { clean_white_space (tptr); len = strlen (tptr); if (len > 2) { add_buffer (body_text, tptr, len); add_buffer (body_text, "\n", 1); } free (tptr); } } if (text_only) puts(mp->text); break; case M_ANCHOR: process_anchor(mp->start); default: /* do nothing */ break; }}static void free_struct_markup(x)struct mark_up *x;{ if (x->text) free(x->text); if (x->start) free(x->start); if (x->end) free(x->end); free(x);}int main(argc, argv)int argc;char *argv[];{ struct mark_up *mp = NULL, *walker, *t, *HTMLParse(); Buffer *b; FILE *fp; init_log(NULL, stderr); argv++; argc--; if (argc < 1) usage(); if (!strcmp(*argv, "--url-only")) { url_only = 1; argv++; argc--; if (argc < 1) usage(); } if (!strcmp(*argv, "--text-only")) { text_only = 1; argv++; argc--; if (argc < 1) usage(); } if (!strcmp(*argv, "--body-text")) { html_body_text = 1; argv++; argc--; if (argc < 1) usage(); } memset(intype, '\0', 64); /* Parse the HTML file */ if ((fp = fopen(*argv, "r")) == NULL) { log_errno(*argv); exit(1); }#if 0 /* kjl/7mar2002 */ if (getenv("ENUMERATOR_URL")) Url = xstrdup(getenv("ENUMERATOR_URL")); if (Url == (char *) NULL) Url = xstrdup(*argv);#endif b = read_file(fp); fclose(fp); mp = HTMLParse(NULL, b->data); free_buffer(b); author = create_buffer(BUFSIZ); keywords = create_buffer(BUFSIZ); citations = create_buffer(BUFSIZ); urls = create_buffer(BUFSIZ); title = create_buffer(BUFSIZ); headings = create_buffer(BUFSIZ); /* hw */ if (html_body_text) body_text = create_buffer(BUFSIZ); /* Extract important information from the parsed HTML */ for (walker = mp; walker != NULL; t = walker, walker = walker->next, free_struct_markup(t)) process_node(walker); if (url_only) { fwrite(urls->data, 1, urls->length, stdout); exit(0); } if (text_only) { exit(0); } if (author->length > 0) { printf("%s{%u}:\t", T_AUTHOR, author->length); /* Author */ fwrite(author->data, 1, author->length, stdout); } if (keywords->length > 0) { printf("%s{%u}:\t", T_KEYS, keywords->length); /* Keywords */ fwrite(keywords->data, 1, keywords->length, stdout); } if (urls->length > 0) { printf("%s{%u}:\t", T_UREFS, urls->length); /* URL-References */ fwrite(urls->data, 1, urls->length, stdout); } if (citations->length > 0) { printf("Citations{%u}:\t", citations->length); fwrite(citations->data, 1, citations->length, stdout); } if (title->length > 0) { printf("%s{%u}:\t", T_TITLE, title->length); /* Title */ fwrite(title->data, 1, title->length, stdout); } /* hw */ if (headings->length > 0) { printf("%s{%u}:\t", T_HEADINGS, headings->length); /* Headings */ fwrite(headings->data, 1, headings->length, stdout); } if (html_body_text && body_text->length > 0) { /* printf("body{%u}:\t", body_text->length); */ printf("%s{%u}:\t", T_FULLTEXT, body_text->length); /* Full-Text */ fwrite(body_text->data, 1, body_text->length, stdout); } exit(0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -