⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utils.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
#include "glimpse.h"

/* Selects the filename hash used by hashNk(): non-zero -> 64k table, zero -> 4k table. */
int BigFilenameHashTable = OFF;

/* Number of trailing pathname characters folded additively into the *_file hashes. */
#define SIGNIFICANT_HASH_REGION	24

/*
 * Partition-number encoders/decoders.
 *
 * The encoders never return 0 or '\n' for an in-range input: those two values
 * are replaced by the reserved codes MaxNumXbPartition and MaxNumXbPartition+1.
 * The decoders undo the substitution.
 * NOTE(review): presumably this lets encoded numbers be stored in NUL/newline
 * delimited index files -- confirm against the index writer.
 */

/* n is guaranteed to be < MaxNum4bPartition */
int
encode4b(n)
	int	n;
{
	if (n == '\0') return MaxNum4bPartition;
	if (n == '\n') return MaxNum4bPartition + 1;
	return n;
}

/* Inverse of encode4b(). */
int
decode4b(n)
	int	n;
{
	if (n == MaxNum4bPartition) return '\0';
	if (n == MaxNum4bPartition + 1) return '\n';
	return n;
}

/* n is guaranteed to be < MaxNum8bPartition */
int
encode8b(n)
	int	n;
{
	if (n == '\0') return MaxNum8bPartition;
	if (n == '\n') return MaxNum8bPartition + 1;
	return n;
}

/* Inverse of encode8b(). */
int
decode8b(n)
	int	n;
{
	if (n == MaxNum8bPartition) return '\0';
	if (n == MaxNum8bPartition + 1) return '\n';
	return n;
}

/*
 * n is guaranteed to be < MaxNum12bPartition
 * Splits n into base-MaxNum8bPartition digits; the high digit is encoded with
 * encode4b(), the low digit with encode8b(), and they are packed as (msb<<8)|lsb.
 */
int
encode12b(n)
	int	n;
{
	unsigned char	msb, lsb;

	msb = (n / MaxNum8bPartition);
	lsb = (n % MaxNum8bPartition);
	msb = encode4b(msb);
	lsb = encode8b(lsb);
	return (msb << 8) | lsb;
}

/* Inverse of encode12b(): unpacks bits 8..11 and 0..7 and recombines the digits. */
int
decode12b(n)
	int	n;
{
	unsigned char	msb, lsb;

	msb = ((n & 0x00000f00) >> 8);
	lsb = (n & 0x000000ff);
	msb = decode4b(msb);
	lsb = decode8b(lsb);
	return (msb * MaxNum8bPartition) + lsb;
}

/*
 * n is guaranteed to be < MaxNum16bPartition
 * Both base-MaxNum8bPartition digits are encoded with encode8b().
 */
int
encode16b(n)
	int	n;
{
	unsigned char	msb, lsb;

	msb = (n / MaxNum8bPartition);
	lsb = (n % MaxNum8bPartition);
	msb = encode8b(msb);
	lsb = encode8b(lsb);
	return (msb << 8) | lsb;
}

/* Inverse of encode16b(). */
int
decode16b(n)
	int	n;
{
	unsigned char	msb, lsb;

	msb = ((n & 0x0000ff00) >> 8);
	lsb = (n & 0x000000ff);
	msb = decode8b(msb);
	lsb = decode8b(lsb);
	return (msb * MaxNum8bPartition) + lsb;
}

/*
 * n is guaranteed to be < MaxNum24bPartition
 * High digit (base MaxNum16bPartition) goes through encode8b(), the low digit
 * through encode16b(); packed as (msb<<16)|lsb.
 */
int
encode24b(n)
	int	n;
{
	unsigned short	msb, lsb;

	msb = (n / MaxNum16bPartition);
	lsb = (n % MaxNum16bPartition);
	msb = encode8b(msb);
	lsb = encode16b(lsb);
	/* Shift in unsigned arithmetic: shifting a promoted int into the sign bit is undefined. */
	return (int)(((unsigned int)msb << 16) | lsb);
}

/* Inverse of encode24b(). */
int
decode24b(n)
	int	n;
{
	unsigned short	msb, lsb;

	msb = ((n & 0x00ff0000) >> 16);
	lsb = (n & 0x0000ffff);
	msb = decode8b(msb);
	lsb = decode16b(lsb);
	return (msb * MaxNum16bPartition) + lsb;
}

/*
 * n is guaranteed to be < MaxNum32bPartition
 * Both base-MaxNum16bPartition digits are encoded with encode16b().
 */
int
encode32b(n)
	int	n;
{
	unsigned short	msb, lsb;

	msb = (n / MaxNum16bPartition);
	lsb = (n % MaxNum16bPartition);
	msb = encode16b(msb);
	lsb = encode16b(lsb);
	/*
	 * msb may be >= 0x8000, so the shift must be done on an unsigned value;
	 * (msb<<16) on the promoted signed int would be undefined behaviour.
	 */
	return (int)(((unsigned int)msb << 16) | lsb);
}

/* Inverse of encode32b(). */
int
decode32b(n)
	int	n;
{
	unsigned short	msb, lsb;

	msb = ((n & 0xffff0000) >> 16);
	lsb = (n & 0x0000ffff);
	msb = decode16b(msb);
	lsb = decode16b(lsb);
	return (msb * MaxNum16bPartition) + lsb;
}

/*
 * Returns non-zero if c is a character that convert2agrepregexp() must prefix
 * with a backslash so that agrep treats it literally.  '^' and '$' are
 * deliberately not in this set: they are passed through unchanged.
 */
static int
agrep_needs_escape(c)
	int	c;
{
	switch (c) {
	case '-': case ',': case ';': case '.': case '#': case '|':
	case '[': case ']': case '(': case ')': case '>': case '<':
	case '+': case '{': case '}': case '~':
		return 1;
	default:
		return 0;
	}
}

/*
 * Converts a file-name pattern using '*' and '?' into an agrep regular
 * expression, in place: '?' becomes '.', '*' becomes '.*', and every other
 * agrep-special character (see agrep_needs_escape) is backslash-escaped.
 * A backslash in the input protects the character that follows it.
 *
 * An unescaped '*' at the very end and a '*' at the very beginning are dropped
 * first.  If nothing is left, buf becomes ".*" and -2 is returned.
 *
 * Return value:
 *   0      if len < 1;
 *   > 0    the (possibly shortened) length of the name when it contains no
 *          unescaped '?', '*', '$' or '^': buf is otherwise left untouched and
 *          the caller can avoid calling agrep;
 *   < 0    minus the string length of the converted pattern now stored in buf,
 *          meaning agrep must be used.
 *
 * buf must be NUL-terminated at buf[len].  The converted pattern can be longer
 * than the input (up to MAX_PAT-1 characters plus the terminator).
 * NOTE(review): this assumes the caller's buffer holds at least MAX_PAT bytes
 * (and at least 3 bytes for the ".*" case) -- confirm at the call sites.
 * Patterns that do not fit are truncated and a message is written to stderr.
 */
int
convert2agrepregexp(buf, len)
	char	*buf;
	int	len;
{
	char	tbuf[MAX_PAT];
	int	i = 0, j = 0;
	int	need;			/* output bytes required by the current input char */
	int	truncated = 0;

	if (len < 1) return 0;

	/* Ignore a '*' at the end, unless an odd number of backslashes escapes it. */
	if (buf[len-1] == '*') {
		int	nslash = 0;

		for (i = len - 2; (i >= 0) && (buf[i] == '\\'); i--)
			nslash++;
		if ((nslash % 2) == 0) {
			buf[len-1] = '\0';
			len--;
		}
	}

	/* Ignore a '*' at the beginning: shift everything (terminator included) left by one. */
	if (buf[0] == '*') {
		for (i = 0; i < len; i++)
			buf[i] = buf[i+1];
		len--;
	}

	if (len < 1) {
		buf[0] = '.';
		buf[1] = '*';
		buf[2] = '\0';
		return -2;
	}

	/* Look for the first unescaped wildcard/anchor; without one no conversion is needed. */
	for (i = 0; i < len; i++) {
		if (buf[i] == '\\') i++;
		else if ((buf[i] == '?') || (buf[i] == '*')
			|| (buf[i] == '$') || (buf[i] == '^')) break;
	}
	if (i >= len) return len;

	i = j = 0;
	while ((i < len) && (buf[i] != '\0')) {
		/* Work out how many bytes this character expands to before writing anything. */
		if (buf[i] == '\\')
			need = ((i + 1 < len) && (buf[i+1] != '\0')) ? 2 : 1;
		else if (agrep_needs_escape(buf[i]) || (buf[i] == '*'))
			need = 2;
		else
			need = 1;

		/* Always keep one byte free for the terminating NUL. */
		if (j + need >= MAX_PAT) {
			truncated = 1;
			break;
		}

		if (buf[i] == '\\') {
			/* copy the backslash and the character it protects without interpreting them */
			tbuf[j++] = buf[i++];
			if (need == 2) tbuf[j++] = buf[i++];
		}
		else if (agrep_needs_escape(buf[i])) {
			tbuf[j++] = '\\';
			tbuf[j++] = buf[i++];
		}
		/* Interpret ONLY ? and * in file-names */
		else if (buf[i] == '?') {
			tbuf[j++] = '.';
			i++;
		}
		else if (buf[i] == '*') {
			tbuf[j++] = '.';
			tbuf[j++] = '*';
			i++;
		}
		else tbuf[j++] = buf[i++];
	}
	tbuf[j] = '\0';

	if (truncated)
		fprintf(stderr, "glimpseindex: pattern '%s' too long\n", buf);

	strcpy(buf, tbuf);
	return -j;	/* strlen-compatible, -ve to indicate memagrep must be called */
}

/*
 * -----------------------------------------------------------------
 * Word hashing functions.
 *
 * input:  a word (word[0..len-1]; the terminator is not examined)
 * output: a hash value of the input word.
 *
 * hash function: the first few characters each contribute their low-order
 * bits, concatenated by shifting; every remaining character is then added to
 * the running value, which is masked to the table width after each addition.
 *
 * Characters are read through plain `char`, so for bytes >= 0x80 the additive
 * part depends on whether char is signed on the platform.  That behaviour is
 * preserved here on purpose: changing it would change the hash values.
 *
 * bug-fixes in all hashing functions: Chris Dalton
 * -----------------------------------------------------------------
 */

/* 16-bit hash: low 4 bits of each of the first 4 characters, remainder added. */
int
hash64k(word, len)
	char	*word;
	int	len;
{
	unsigned int	hash_value = 0;
	unsigned int	mask_4 = 017;
	unsigned int	mask_16 = 0177777;
	int	head = (len <= 4) ? len : 4;	/* characters handled by shifting */
	int	i;

	for (i = 0; i < head; i++)
		hash_value = (hash_value << 4) | (word[i] & mask_4);
	for (i = head; i < len; i++)
		hash_value = mask_16 & (hash_value + word[i]);
	return (hash_value & mask_16);
}

/*
 * Explicitly used with -B option
 * 18-bit hash: the first 4 characters contribute 5, 4, 5 and 4 low bits
 * respectively; the remainder is added.
 */
int
hash256k(word, len)
	char	*word;
	int	len;
{
	unsigned int	hash_value = 0;
	unsigned int	mask_4 = 017;
	unsigned int	mask_5 = 037;
	unsigned int	mask_18 = 0x3ffff;
	int	head = (len <= 4) ? len : 4;
	int	i;

	for (i = 0; i < head; i++) {
		if ((i % 2) == 0) hash_value = (hash_value << 5) | (word[i] & mask_5);
		else hash_value = (hash_value << 4) | (word[i] & mask_4);
	}
	for (i = head; i < len; i++)
		hash_value = mask_18 & (hash_value + word[i]);
	return (hash_value & mask_18);
}

/*
 * Explicitly used for veryfastsearch without WORD_SORTED
 * Using > 5 bits is waste since there are only 26 lower case letters
 * 15-bit hash: low 5 bits of each of the first 3 characters, remainder added.
 */
int
hash32k(word, len)
	char	*word;
	int	len;
{
	unsigned int	hash_value = 0;
	unsigned int	mask_5 = 037;
	unsigned int	mask_15 = 077777;
	int	head = (len <= 3) ? len : 3;
	int	i;

	for (i = 0; i < head; i++)
		hash_value = (hash_value << 5) | (word[i] & mask_5);
	for (i = head; i < len; i++)
		hash_value = mask_15 & (hash_value + word[i]);
	return (hash_value & mask_15);
}

/* This function is utterly disgraceful: it is just hash32k() truncated to 14 bits. */
int
hash16k(word, len)
	char	*word;
	int	len;
{
	return hash32k(word, len) & 0x3fff;
}

/*
 * Explicitly used for -f and -a options: has low collisions (<=2) for filenames
 * 12-bit hash: low 3 bits of each of the first 4 characters, remainder added.
 */
int
hash4k(word, len)
	char	*word;
	int	len;
{
	unsigned int	hash_value = 0;
	unsigned int	mask_3 = 07;
	unsigned int	mask_12 = 07777;
	int	head = (len <= 4) ? len : 4;
	int	i;

	for (i = 0; i < head; i++)
		hash_value = (hash_value << 3) | (word[i] & mask_3);
	for (i = head; i < len; i++)
		hash_value = mask_12 & (hash_value + word[i]);
	return (hash_value & mask_12);
}

/*
 * These 2 are especially useful for filenames that have long similar-looking
 * pathname-prefixes: they hash the END of the name instead of the beginning.
 *
 * Names shorter than SIGNIFICANT_HASH_REGION + 2 fall back to the plain word
 * hash.  Otherwise the last two characters are skipped, the four characters
 * before them (indices len-3 down to len-6) contribute their low bits by
 * shifting, and the characters from len-7 down to
 * len-SIGNIFICANT_HASH_REGION-2 are added, masking after each addition.
 */
int
hash64k_file(file, len)
	char	*file;
	int	len;
{
	unsigned int	hash_value = 0;
	unsigned int	mask_4 = 017;
	unsigned int	mask_16 = 0177777;
	int	i;

	if (len < SIGNIFICANT_HASH_REGION + 2) return hash64k(file, len);

	for (i = len - 3; i >= len - 6; i--)
		hash_value = (hash_value << 4) | (file[i] & mask_4);
	for (i = len - 7; i >= len - SIGNIFICANT_HASH_REGION - 2; i--)
		hash_value = mask_16 & (hash_value + file[i]);
	return (hash_value & mask_16);
}

/* 12-bit counterpart of hash64k_file(): 3 bits per shifted character. */
int
hash4k_file(file, len)
	char	*file;
	int	len;
{
	unsigned int	hash_value = 0;
	unsigned int	mask_3 = 07;
	unsigned int	mask_12 = 07777;
	int	i;

	if (len < SIGNIFICANT_HASH_REGION + 2) return hash4k(file, len);

	for (i = len - 3; i >= len - 6; i--)
		hash_value = (hash_value << 3) | (file[i] & mask_3);
	for (i = len - 7; i >= len - SIGNIFICANT_HASH_REGION - 2; i--)
		hash_value = mask_12 & (hash_value + file[i]);
	return (hash_value & mask_12);
}

/*
 * Filename hash dispatcher: uses the 64k hash when BigFilenameHashTable is
 * set and the 4k hash otherwise.
 */
int
hashNk(name, len)
	char	*name;
	int	len;
{
	if (BigFilenameHashTable) return hash64k_file(name, len);
	else return hash4k_file(name, len);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -