⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 convert.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/**********************************************************************************************/
/* convert.c: Program to inter-convert different representations of neighbourhood sets        */
/*											      */
/* Uses: to compress neighbourhood sets for faster search/uncompress for viewing/editing them */
/* Author: Burra Gopal, bgopal@cs.arizona.edu, Sep 7-8 1996: WebGlimpse support               */
/**********************************************************************************************/

/* Each preprocessor directive must start its own line; they are laid out one per line here. */
#include "glimpse.h"
#include <stdlib.h>
#include <sys/time.h>
#if	ISO_CHAR_SET
#include <locale.h>			/* support for 8bit character set:ew@senate.be */
#endif
#include <errno.h>

#define IS_LITTLE_ENDIAN	1
#define IS_BIG_ENDIAN		0

#define IS_INDICES		1
#define IS_BITS			2
#define IS_NAMES		3

/* Size in bytes of the static scratch buffer "usualbuffer" */
#define USUALBUFFER_SIZE	(MAX_LINE_LEN*64)

/* Exported routines */
int		element2name(/*int, out char*, int, int, int*/);
int		mem_element2name(/*int, char*, unsigned char*, unsigned char*, int*/);
int		name2element(/*out int*, char*, int, int, int, int*/);
int		mem_name2element(/*out int*, char*, int, unsigned char*, unsigned char*, int*/);
int		do_conversion(/*FILE*, FILE*, int, int, int, int, int, unsigned int *, int, int*/);
int		change_format(/*int, int, int, int, int, int, char *, char **/);

/* Imported routines */
int		hashNk(/*char *, int*/);/* from io.c */

/* Internal routines */
int		discardinfo(/*char **/);
int		allocate_and_fill(/* out unsigned char **, int, char *, int*/);

/* Imported variables */
/* errno comes from <errno.h> above; it may be a macro there, so it is not redeclared here */
extern int	get_index_type();	/* from io.c */
extern int	file_num;		/* from io.c */
extern int	mask_int[32];		/* from io.c */
extern int	BigFilenameHashTable;	/* from io.c */
extern int	InfoAfterFilename;	/* from io.c */

/* Internal variables */

/* Variables related to options (i/p-->o/p types)*/
int		InputType, OutputType, InputEndian, OutputEndian, InputFilenames, ReadIntoMemory;
char		glimpseindex_dir[MAX_LINE_LEN];
char		filename_prefix[MAX_LINE_LEN];

/* Variables related to ReadIntoMemory option 
(I/O efficiency) */unsigned char	*filenames_buffer, *filenames_index_buffer, *filehash_buffer, *filehash_index_buffer;int		filenames_len, filenames_index_len, filehash_len, filehash_index_len;int		fdname, fdname_index, fdhash, fdhash_index;unsigned char	usualbuffer[USUALBUFFER_SIZE];/* Variables for statistics */int		hash_misses = 0;/******************************************************** * Discards information after ' ' in filename           * * Returns: 0 if it found info to discard, -1 otherwise * * Assumes: file ends with '\0'                         * * CHANGED from ' ' to FILE_END_MARK 6/7/99 --GB	* ********************************************************/intdiscardinfo(file)	char		file[];{	int		k;	if (InfoAfterFilename) {		k = 0;		while (file[k] != '\0') {			if (file[k] == '\\') {				k ++;				if (file[k] == '\0') break;				k++;				continue;			}			else {				if (file[k] == FILE_END_MARK) {					file[k] = '\0';					return 0;				}				k++;				continue;			}		}	}	/* pab23feb98: return -1 if !InfoAfterFilename */	return -1;}/******************************************************************************************** * Allocates the "buffer" of size "len" and fills it up with "len" amount of data from "fd" * * Returns: 0 on success, -1 on failure (i.e., if allocation fails or can't read fully)     * ********************************************************************************************/intallocate_and_fill(buffer, len, filename, fd)	unsigned char	**buffer;	int		len;	char		*filename;	int		fd;{	if ((len <= 0) || ((*buffer = (unsigned char *)my_malloc(len)) == NULL)) {		fprintf(stderr, "Disable -M option: cannot allocate memory for %s\n", filename);		return -1;	}	if (len != read(fd, *buffer, len)) {		fprintf(stderr, "Disable -M option: cannot read %s\n", filename);		return -1;	}	return 0;}/************************************************************************************** * Finds filename for given element (index#:  every element points to indexed object) * * 
Returns: -1 if error and 0 on success                                              * * See glimpse/index/io.c/save_datastructures() for the format of the names-file      * **************************************************************************************/intelement2name(element, file, fd, fdi, files_used)	int		element;	char		file[];		/* out */	int		fd, fdi;	/* fd=filenames fd, fdi=filenames_index fd */	int		files_used;{	int		k, offset, lastoffset = -1, len;	unsigned char	array[4];	if ((element < 0) || (element >= files_used)) {		errno = EINVAL;		return -1;	}	lseek(fdi, (long)element*4, SEEK_SET);	if (read(fdi, array, 4) != 4) {		errno = ENOENT;		return -1;	}	offset = (array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3];	if (read(fdi, array, 4) == 4) {		lastoffset = (array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3];	}	if (lseek(fd, (long)offset, SEEK_SET) == -1) {		fprintf(stderr, ".glimpse_filenames: can't seek to %d\n", offset);		return -1;	}	if (lastoffset != -1)		len = read(fd, file, lastoffset - offset);	else len = read(fd, file, MAX_LINE_LEN);	if (len == -1) {		errno = ENOENT;		return -1;	}	file[len - 1] = '\0';	/* separated by '\n', so zero that out: if empty file, will get its strlen() to be 0, as expected */	if (InfoAfterFilename) discardinfo(file);	return 0;}/************************************************************************************** * Finds filename for given element (index#:  every element points to indexed object) * * Returns: -1 if error and 0 on success                                              * * See glimpse/index/io.c/save_datastructures() for the format of the names-file      * * Works by reading in-memory copy of the files 				      * **************************************************************************************/intmem_element2name(element, file, filenames_buffer, filenames_index_buffer, files_used)	int		element;	char		file[];		/* out */	unsigned char	*filenames_buffer, *filenames_index_buffer;	
int		files_used;{	int		i, offset, lastoffset = -1, len;	if ((element < 0) || (element >= files_used) || (element >= filenames_index_len)) {		errno = EINVAL;		return -1;	}	i = element*4;	offset = (filenames_index_buffer[i] << 24) | (filenames_index_buffer[i+1] << 16) |			(filenames_index_buffer[i+2] << 8) | filenames_index_buffer[i+3];	if (element == files_used - 1) lastoffset = filenames_len;	else lastoffset = (filenames_index_buffer[i+4] << 24) | (filenames_index_buffer[i+5] << 16) |				(filenames_index_buffer[i+6] << 8) | filenames_index_buffer[i+7];/* fprintf(stderr, "element=%d offset=%d, lastoffset=%d, filenames_len=%d, files_used=%d\n", element, offset, lastoffset, filenames_len, files_used); */	if ((offset < 0) || (offset > filenames_len) || (lastoffset < 0) || (lastoffset > filenames_len) || (offset >= lastoffset)) {		errno = ENOENT;		return -1;	}	if (lastoffset - offset >= MAX_LINE_LEN) {		errno = EINVAL;		return -1;	}	memcpy(file, &filenames_buffer[offset], lastoffset-offset);	file[lastoffset - offset - 1] = '\0';	/* separated by '\n', so zero that out: if empty file, will get its strlen() to be 0, as expected */	if (InfoAfterFilename) discardinfo(file);	return 0;}/***************************************************************************************** * Returns: element (index#) for given filename (every element points to indexed object) * * Returns: -1 if error (assuming that element#s are >= 0, ofcourse...)                  
* * See glimpse/index/io.c/save_datastructures() for the format of the hash-file          * *****************************************************************************************/intname2element(pelement, file, len, fd, fdi, files_used)	int		*pelement;	/* out */	char		file[];	int		len;	int 		fd, fdi;	/* fd=filehash fd, fdi=filehash_index fd */	int		files_used;{	int		malloced = 0, ret, i, k, foundblank=0, offset, lastoffset = -1, hash, size;	unsigned char	*buffer, array[4];	if ((len <= 0) || (len >= MAX_LINE_LEN)) {		errno = EINVAL;		return -1;	}	hash = hashNk(file, len);/* fprintf(stderr, "len=%d file=%s hash=%d\n", len, file, hash); */	if (lseek(fdi, (long)hash*4, SEEK_SET) == -1) {		fprintf(stderr, ".glimpse_filehash_index: can't seek to %d\n", hash*4);		return -1;	}	if ((ret = read(fdi, array, 4)) != 4) {		fprintf(stderr, "read only %d bytes from %d\n", ret, hash*4);		errno = ENOENT;		return -1;	}	offset = (array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3];/* fprintf(stderr, "offset=%d\n", offset); */	if (read(fdi, array, 4) == 4) {		lastoffset = (array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3];	}	else lastoffset = lseek(fd, (long)0, SEEK_END /*2*/ /* from end */);	/* so that next time I get prev-value = file size *//* fprintf(stderr, "lastoffset=%d\n", lastoffset); */	size = lastoffset - offset;	if (size <= 1) {		errno = ENOENT;		return -1;	}	if (size < USUALBUFFER_SIZE) buffer = usualbuffer;	else {		buffer = (unsigned char *)my_malloc(size);		malloced = 1;	}/* fprintf(stderr, "hash=%d offset=%d lastoffset=%d size=%d\n", hash, offset, lastoffset, size); */	lseek(fd, (long)offset, SEEK_SET);	if (size != read(fd, buffer, size)) {		if (malloced) my_free((char *)buffer, size);		errno = ENOENT;		return -1;	}/* fprintf(stderr, "buffer=%s\n", buffer+4); */	for (i=0; i<size; i+=4+strlen((char *)&buffer[i+4])+1) {		if (InfoAfterFilename) {			k = i+4;			while (buffer[k] != '\0') {				if (buffer[k] == '\\') {					k ++;					if (buffer[k] == 
'\0') break;					k++;					continue;				}				else {					if (buffer[k] == FILE_END_MARK) {						buffer[k] = '\0';						foundblank = 1;						break;					}					k++;					continue;				}			}		}		if (!strcmp((char *)&buffer[i+4], file)) {			*pelement = (buffer[i] << 24) | (buffer[i+1] << 16) | (buffer[i+2] << 8) | buffer[i+3];			if (InfoAfterFilename && foundblank) {				buffer[k] = FILE_END_MARK;			}			if (malloced) my_free((char *)buffer, size);			return 0;		}		if (InfoAfterFilename && foundblank) {			buffer[k] = FILE_END_MARK;		}		hash_misses ++;	}	if (malloced) my_free((char *)buffer, size);	errno = ENOENT;	return -1;}/***************************************************************************************** * Returns: element (index#) for given filename (every element points to indexed object) * * Returns: -1 if error (assuming that element#s are >= 0, ofcourse...)                  * * See glimpse/index/io.c/save_datastructures() for the format of the hash-file          * * Works by reading in-memory copy of the files						 * *****************************************************************************************/mem_name2element(pelement, file, len, filehash_buffer, filehash_index_buffer, files_used)	int		*pelement;	/* out */	char		*file;	int		len;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -