⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 main.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
			sprintf(buf, "%s/%s", t, USE_STOPLIST);			if ((fd = open(buf, O_RDONLY)) >= 0) {				stoplist = strdup(buf);				close(fd);				break;			}		}		free(s);	}	if (stoplist == NULL) {		errorlog("Unable to locate %s in %s.\n",		    USE_STOPLIST, libpath);		exit(1);	}	if (tmpdir == NULL) {		tmpdir = getenv("TMPDIR") ? strdup(getenv("TMPDIR")) :		    strdup(USE_TMPDIR);	}	if (access(stoplist, R_OK) < 0) {		log_errno(stoplist);		exit(1);	}	if ((allowlist != NULL) && (access(allowlist, R_OK) < 0)) {		log_errno(allowlist);		exit(1);	}	if (input_file != NULL && strcmp(input_file, "-") != 0 &&	    access(input_file, R_OK) < 0) {		log_errno(input_file);		usage();	}	if (access(tmpdir, W_OK) < 0) {		log_errno(tmpdir);		exit(1);	}	do_startup();	/* NOTE: DO NOT catch SIGCHLD; we always do explict waits in Essence */	signal(SIGABRT, do_shutdown);	/* die gracefully */	signal(SIGTERM, do_shutdown);	signal(SIGINT, do_shutdown);	/* Process */	if (input_file != NULL) {		FILE *fp;		char buf[BUFSIZ], tbuf[BUFSIZ], *s;		int t;		if (!strcmp(input_file, "-"))			fp = stdin;		else {			if ((fp = fopen(input_file, "r")) == NULL) {				log_errno(input_file);				usage();			}		}		/*		 *  The input looks like:		 *     URL<tab>MD5:adfasdfasdfasdfasdfasd		 *     URL<tab>Last-Modification-Time:12345		 */		while (fgets(buf, BUFSIZ, fp) != NULL) {			strcpy(tbuf, buf);	/* make a copy */			Debug(62, 1, ("Input Line: %s", tbuf));			if (buf[0] == '#') {			  if (strncmp(buf, "#env:", 5) == 0) {			    char* ptr, *dup;			    int len;			    ptr = buf+5;			    while (isspace(*ptr)) {			      ++ptr;			    }			    len = strlen(ptr);			    while (len  &&  isspace(ptr[len-1])) {			      ptr[--len] = '\0';			    }			    if ((dup = xmalloc(len+1))) {			      strcpy(dup, ptr);			      putenv(dup);			    }			  }			  continue;	/* skip rest */			}			if ((s = strrchr(buf, '\n')) == NULL) {				errorlog("Illegal input: %s\n", tbuf);				continue;			}			*s = '\0';	/* strip newline */			if ((s = strchr(buf, '\t')) != NULL) {				*s++ 
= '\0';	/* delineate at the tab */			}			/*			 *  For MD5's: check database and skip if unchanged			 *  For LMT's: check database and skip if unchanged			 *  For no meta data, just pass it through			 */			if (s && !strncasecmp(s, T_MD5, strlen(T_MD5))) {				if (dbcheck_md5(buf, s + strlen(T_MD5) + 1)) {					continue;				}			} else if (s && !strncasecmp(s, T_LMT, strlen(T_LMT))) {				t = atoi(s + strlen(T_LMT) + 1);				if (dbcheck_timestamp(buf, t)) {					continue;				}			}			obj = create_data_object(buf, object_flags);			if (obj == NULL) {				errorlog("Cannot create object for %s\n", tbuf);				continue;			}			/* Type Recognition */			if (obj->type == NULL && type_recognize(obj)) {				errorlog("Cannot recognize type for %s\n",					 obj->url->url);				continue;			}			if (is_nested_type(obj->type)) {				nested = 1;				init_presentation_unnest();			} else {				nested = 0;			}			process_object(obj);			if (nested) {				finish_presentation_unnest();			}			free_data_object(obj);		}		fclose(fp);	} else {		for (; argc > 0; argc--, argv++) {			obj = create_data_object(*argv, object_flags);			if (obj == NULL) {				errorlog("Cannot create object for %s\n", *argv);				continue;			}			/* Type Recognition */			if (obj->type == NULL && type_recognize(obj)) {				errorlog("Cannot recognize type for %s\n",					 obj->url->url);				continue;			}			if (is_nested_type(obj->type)) {				nested = 1;				init_presentation_unnest();			} else {				nested = 0;			}			process_object(obj);			if (nested) {				finish_presentation_unnest();			}			free_data_object(obj);		}	}	/* Clean up */	do_shutdown(0);	exit(0);}static void do_startup(){	char *libpathbuf, *s, *t;	int fd;	char buf[BUFSIZ];	libpathbuf = xmalloc(strlen(libpath) + 64);	memset(libpathbuf, '\0', strlen(libpath) + 64);	sprintf(libpathbuf, "SUMMARIZER_LIBPATH=%s", libpath);	if (putenv(libpathbuf) < 0) {		log_errno("putenv");	}	sprintf(byname, "%s/%s", default_libpath, USE_BYNAME);	sprintf(byurl, "%s/%s", default_libpath, USE_BYURL);	
sprintf(bycontent, "%s/%s", default_libpath, USE_BYCONTENT);	sprintf(magic, "%s/%s", default_libpath, USE_MAGIC);	s = strdup(libpath);	for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) {		sprintf(buf, "%s/%s", t, USE_BYNAME);		if ((fd = open(buf, O_RDONLY)) >= 0) {			strcpy(byname, buf);			close(fd);			break;		}	}	free(s);	s = strdup(libpath);	for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) {		sprintf(buf, "%s/%s", t, USE_BYURL);		if ((fd = open(buf, O_RDONLY)) >= 0) {			strcpy(byurl, buf);			close(fd);			break;		}	}	free(s);	s = strdup(libpath);	for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) {		sprintf(buf, "%s/%s", t, USE_BYCONTENT);		if ((fd = open(buf, O_RDONLY)) >= 0) {			strcpy(bycontent, buf);			close(fd);			break;		}	}	free(s);	s = strdup(libpath);	for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) {		sprintf(buf, "%s/%s", t, USE_MAGIC);		if ((fd = open(buf, O_RDONLY)) >= 0) {			strcpy(magic, buf);			close(fd);			break;		}	}	free(s);	init_url();	init_gatherer_id();	if (init_type_recognize(byname, bycontent, byurl, magic)) {		errorlog("init_type_recognize(%s, %s, %s, %s) failed.\n",		    byname, bycontent, byurl, magic);		exit(1);	}	init_stoplist();	if (!do_typeonly) {		init_summarize();		init_db(dbdir, max_deletions);	}}static void print_memory_stats(){#if defined(DEBUG) && defined(_HARVEST_OSF_)	struct mallinfo mi = mallinfo();	Log("malloc statistics:\n");	Log("  total space in arena: %d\n", mi.arena);	Log("  number of ordinary blocks: %d\n", mi.ordblks);	Log("  number of small blocks: %d\n", mi.smblks);	Log("  number of holding blocks: %d\n", mi.hblks);	Log("  space in holding blocks: %d\n", mi.hblkhd);	Log("  space in small blocks in use: %d\n", mi.usmblks);	Log("  space in free blocks: %d\n", mi.fsmblks);	Log("  space in ordinary blocks in use: %d\n", mi.uordblks);	Log("  space in free blocks: %d\n", mi.fordblks);	Log("  cost of enabling keep option: %d\n", mi.keepcost);#endif	return;}static void do_shutdown(x)     int x;{	finish_url();	
finish_type_recognize();	finish_stoplist();	if (!do_typeonly) {		finish_summarize();		finish_db();	}	if (x != 0)		Log("Terminated abnormally (%d)...\n", x);	else		Log("Terminated normally.\n");	print_memory_stats();	exit(x);}static void init_gatherer_id(){	gatherer_id = xmalloc(sizeof(struct GID));	gatherer_id->name = strdup(gname ? gname : "Essence");	gatherer_id->version = strdup(gver ? gver : HARVEST_VERSION);	if (ghost) {		gatherer_id->host = strdup(ghost);	} else {		ghost = strdup(getfullhostname());		gatherer_id->host = strdup(ghost);	}	Log("Running Gatherer...\n");	Log("Gatherer-Name:\t%s\n", gatherer_id->name);	Log("Gatherer-Host:\t%s\n", gatherer_id->host);	Log("Gatherer-Version:\t%s\n", gatherer_id->version);}/* *  process_object() - Main guts of Essence.  First, types the object, *  performs candidate selection, then either unnest it or summarizes it. */static void process_object(object)     DataObject *object;{#ifdef DEBUG	print_memory_stats();#endif	Debug(62, 1, ("process_object(%s)\n", object->url->url));	/* Candidate Selection by Name */	if (allowlist == NULL && stop_byname(object)) {		Log("Removing %s from candidate list -- name.\n",		    object->url->url);		return;	}	/* Type Recognition */	if (object->type == NULL && type_recognize(object)) {		errorlog("Cannot recognize type for %s\n", object->url->url);		return;	}	/* Print the type and return if type's only; print directly to stdout */	if (do_typeonly) {		printf("Type: %s %s\n", object->type, object->url->url);		return;	}	/* print "URL <TAB> Type" */	/* print (L) if local mapping worked */	Log("%s\t%s%s\n",		object->url->url,		object->type,		object->url->flags & URL_FLAG_LOCAL_MAPPED ? 
" [L]" : "");	/* Candidate Selection by Type and by Duplicate */	if (allowlist != NULL && !allow_bytype(object)) {		Log("Removing %s (%s) from candidate list -- type.\n",		    object->url->url, object->type);		return;	}	if (allowlist == NULL && stop_bytype(object)) {		Log("Removing %s (%s) from candidate list -- type.\n",		    object->url->url, object->type);		return;	}	if (allowlist == NULL && stop_byduplicate(object)) {		if (do_dupremove) {			db_delete_byurl(object->url->url);		} else {			Log("Removing %s (%s) from candidate list -- duplicate.\n", object->url->url, object->type);			return;		}	}	/* Summarize or Presentation Unnest */	if (object->flags & F_MANUAL) {		summarize(object);	} else if (is_nested_type(object->type)) {		nested_feeder(object);	} else if (!do_typeonly) {		summarize(object);	}}/* *  nested_feeder() - Takes a nested object an unnests it. *  XXX: Should re-write so that the unnester is an iterator. */static void nested_feeder(object)     DataObject *object;{	DataObjectList *ol, *walker, *tol;	int nc = 0, nmakefile = 0;	/* Summarize it first */	summarize_nested_object(object);	/* Unnest the object */	if ((ol = presentation_unnest(object)) == NULL) {		errorlog("Cannot unnest %s\n", object->url->url);		return;	}	/* Type the extracted data first */	for (walker = ol; walker; walker = walker->next) {		if (walker->object == NULL) {			errorlog("Fatal Internal: NULL object from unnest.\n");			exit(1);		}		Debug(62, 1, ("Extracted: %s %p\n", walker->object->url->url,			walker->object->type));		if (walker->object->type == NULL)			(void) type_recognize(walker->object);	}	/* Recognize bundles */	for (walker = ol; walker; walker = walker->next) {		if (walker->object->type == NULL)			continue;		if (!strcmp(walker->object->type, "C"))			nc++;		else if (!strcmp(walker->object->type, "CHeader"))			nc++;		else if (!strcmp(walker->object->type, "Makefile"))			nmakefile++;	}	if (nc > 1 && nmakefile > 0 && !strcmp(object->type, "Directory")) {		xfree(object->type);		
object->type = strdup("SourceDistribution");		process_object(object);		free_dol(ol);		return;	}	/* Process the extracted files */	walker = ol;	while (walker != NULL) {		process_object(walker->object);		tol = walker;		walker = walker->next;		free_data_object(tol->object);		xfree(tol);	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -