📄 mkwordlist.c
字号:
static char rcsid[] = "mkwordlist.c,v 1.19 1996/03/26 04:24:38 wessels Exp";/* * mkwordlist - Generates a lists of unique words from the file * * Darren Hardy, hardy@cs.colorado.edu, September 1994 * * ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <unistd.h>#include <stdlib.h>#include <ctype.h>#include <string.h>#include <sys/types.h>#include <sys/stat.h>#include "util.h"#include "essence.h"/* * mkwordlist() - Generates a list of unique words from the input string s * or length sz bytes. Returns the list of words as a single string * with the words separated by newlines. Returns NULL on error. */char *mkwordlist(s, sz) char *s; int sz;{ static char *result; char buf[BUFSIZ], *tmp, *tmpfile; int i, wordsz, notascii = 0; struct stat sb; FILE *fp; if (s == NULL || sz < 3) return NULL; /* * Abort if the input buffer is non-ASCII and is also non * alphanumeric. (3 such characters whether in a raw or not, * is enough). */ for (i = 0; i < sz; i++) { if (!isascii((unsigned char) s[i])) if (!isalnum((unsigned char) s[i])) notascii++; if (notascii > 2) return (NULL); } /* Grab a temporary filename */ if ((tmpfile = tempnam(NULL, "wdlst")) == NULL) { log_errno("tempnam"); return NULL; } /* * Make a copy of the input buffer; * Convert to upper case to lower case, * and convert punctuation, numbers, etc. to \n */ tmp = xmalloc(sz + 1); memcpy(tmp, s, sz); tmp[sz] = '\0'; for (i = 0; i < sz; i++) { if (isalpha((unsigned char) tmp[i])) { if (isupper((unsigned char) tmp[i])) tmp[i] = tolower((unsigned char) tmp[i]); } else { tmp[i] = '\n'; } } /* Remove the tmpfile (if exists) and sort/uniq the word list */ if (access(tmpfile, F_OK) == 0) { if (unlink(tmpfile) < 0) { log_errno(tmpfile); xfree(tmpfile); xfree(tmp); return (NULL); } } sprintf(buf, "sort | uniq > %s", tmpfile); if ((fp = popen(buf, "w")) == NULL) { log_errno(buf); (void) unlink(tmpfile); xfree(tmpfile); xfree(tmp); return NULL; } fwrite(tmp, 1, sz, fp); fputc('\n', fp); pclose(fp); /* Now read in the tmpfile to get the results */ if (stat(tmpfile, &sb) < 0) { log_errno(tmpfile); (void) unlink(tmpfile); xfree(tmpfile); xfree(tmp); return NULL; } result = xmalloc(sb.st_size + 1); if ((fp = fopen(tmpfile, "r")) == NULL) { log_errno(tmpfile); (void) unlink(tmpfile); xfree(tmpfile); xfree(tmp); xfree(result); return NULL; } /* * Filter the output by removing any word with length <= 2. */ i = 0; while (fgets(buf, BUFSIZ, fp) != NULL) { wordsz = strlen(buf); if (wordsz <= 3) /* word of length 2 plus \n */ continue; if (wordsz > 25) /* word of length more than 25 */ continue; memcpy(&result[i], buf, wordsz); i += wordsz; } result[i] = '\0'; fclose(fp); (void) unlink(tmpfile); xfree(tmpfile); xfree(tmp); return result;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -