📄 mkwordlist.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "mkwordlist.c,v 1.19 1996/03/26 04:24:38 wessels Exp";/* *  mkwordlist - Generates a lists of unique words from the file * *  Darren Hardy, hardy@cs.colorado.edu, September 1994 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. *   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   
*   */#include <stdio.h>#include <unistd.h>#include <stdlib.h>#include <ctype.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include "util.h"#include "essence.h"/* *  mkwordlist() - Generates a list of unique words from the input string s *  or length sz bytes.  Returns the list of words as a single string *  with the words separated by newlines.  Returns NULL on error. */char *mkwordlist(s, sz)     char *s;     int sz;{	static char *result;	char buf[BUFSIZ], *tmp, *tmpfile;	int i, wordsz, notascii = 0;	struct stat sb;	FILE *fp;	if (s == NULL || sz < 3)		return NULL;        /*         *  Abort if the input buffer is non-ASCII and is also non         *  alphanumeric. (3 such characters whether in a raw or not,         *  is enough).         */        for (i = 0; i < sz; i++) {                if (!isascii((unsigned char) s[i]))                    if (!isalnum((unsigned char) s[i]))                        notascii++;                if (notascii > 2)                        return (NULL);        }	/* Grab a temporary filename */	if ((tmpfile = tempnam(NULL, "wdlst")) == NULL) {		log_errno("tempnam");		return NULL;	}	/* 	 *  Make a copy of the input buffer;	 *  Convert to upper case to lower case,	 *  and convert punctuation, numbers, etc. 
to \n 	 */	tmp = xmalloc(sz + 1);	memcpy(tmp, s, sz);	tmp[sz] = '\0';	for (i = 0; i < sz; i++) {		if (isalpha((unsigned char) tmp[i])) {			if (isupper((unsigned char) tmp[i]))				tmp[i] = tolower((unsigned char) tmp[i]);		} else {			tmp[i] = '\n';		}	}	/* Remove the tmpfile (if exists) and sort/uniq the word list */	if (access(tmpfile, F_OK) == 0) {		if (unlink(tmpfile) < 0) {			log_errno(tmpfile);			xfree(tmpfile);			xfree(tmp);			return (NULL);		}	}	sprintf(buf, "sort | uniq > %s", tmpfile);	if ((fp = popen(buf, "w")) == NULL) {		log_errno(buf);		(void) unlink(tmpfile);		xfree(tmpfile);		xfree(tmp);		return NULL;	}	fwrite(tmp, 1, sz, fp);	fputc('\n', fp);	pclose(fp);	/* Now read in the tmpfile to get the results */	if (stat(tmpfile, &sb) < 0) {		log_errno(tmpfile);		(void) unlink(tmpfile);		xfree(tmpfile);		xfree(tmp);		return NULL;	}	result = xmalloc(sb.st_size + 1);	if ((fp = fopen(tmpfile, "r")) == NULL) {		log_errno(tmpfile);		(void) unlink(tmpfile);		xfree(tmpfile);		xfree(tmp);		xfree(result);		return NULL;	}	/*	 *  Filter the output by removing any word with length <= 2.	 */	i = 0;	while (fgets(buf, BUFSIZ, fp) != NULL) {		wordsz = strlen(buf);		if (wordsz <= 3)	/* word of length 2 plus \n */			continue;		if (wordsz > 25)	/* word of length more than 25 */			continue;		memcpy(&result[i], buf, wordsz);		i += wordsz;	}	result[i] = '\0';	fclose(fp);	(void) unlink(tmpfile);	xfree(tmpfile);	xfree(tmp);	return result;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -