⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 msearch.cpp

📁 问题重述:有一个内含有大约40万条常用词汇的词库。现给定一篇文章
💻 CPP
字号:
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <stdlib.h>
#include <time.h>
#include <conio.h>
#include <windows.h>

#define SHIFT3 4
#define SHIFT2 10
#define SHIFT1_USED 10
#define SHIFT1 (32-SHIFT2-SHIFT3)

static char * ChangeFileExt(const char * fn, const char * fext)
{
	int l=strlen(fn);
	int le=strlen(fext);
	for(int i=l-1; fn[i]!='.' && fn[i]!='\\' && fn[i]!='/' && fn[i]!=':' && i>=0; i--);
	char * fnext;
	// if extensible name does not exist
	if(i<=0||fn[i]=='\\'||fn[i]=='/'||fn[i]==':')
	{
		// allocate memory for new name
		fnext=new char[l+le+2];
		strcpy(fnext,fn);
		fnext[l]='.';
		l++;
		strcpy(fnext+l,fext);
	}
	else
	{
		l=i+1;
		// allocate
		fnext=new char[l+le+1];
		strcpy(fnext,fn);
		strcpy(fnext+l,fext);
	}
	return fnext;
}


#define LBUF_SIZE 1024  // line buffer size

#define MEMORY_MIRROR   // process dictionary file in memory (unused)

#define DEBUG
#ifdef DEBUG
static void dumpBytes(const void * _bytes, size_t len)
{
	const unsigned char * bytes = (const unsigned char *)_bytes;
	size_t j;
	for(j=0; j<len; j++)
		printf("%02x ", (int)bytes[j] );
}
#endif

#pragma pack(1)

// Dictionary file header
typedef struct _DictHeader
{
	char maodict[8]; // string "MAODICT"
	long so;  // index start offset
	long eo;  // index end offset
} DictHeader;

// Index item structure(5 bytes)
typedef struct _IndexItem
{
	union
	{
		long offset;  // string offset
		char * str;   // string pointer(unused)
	};
	char length;  // string length
} IndexItem;


// A list node of searching result
typedef struct _ResultNode
{
	int index;  // index number in dictionary
	int count;  // match times of word
	//ResultNode * next;
} ResultNode;

// Global variables
static FILE * fpDict;
static char * lbuf;
static size_t lbufLen;
static char * buf;

static int compareItem(const void * a1, const void * a2)
{
	IndexItem * it1 = (IndexItem *)a1;
	IndexItem * it2 = (IndexItem *)a2;
	size_t len1 = it1->length;
	size_t len2 = it2->length;
	int ret;

	ret = strncmp(buf + it1->offset, buf + it2->offset, (len1<len2)?len1:len2 );
	if(ret==0)
	{
		if(len1>len2)
			ret = 1;
		else if(len1<len2)
			ret = -1;
	}
	return ret;	
}

// Create binary format dictionary (one word per line in text file, need not sorted or unique)
static int textToBinaryFile(const char * fnDictT, const char * fnDict)
{
	FILE * fpDictT;
	//FILE * fpDict;
	FILE * fpTemp;
	size_t lbufSize;
	//char * lbuf;
	//size_t lbufLen;
	DictHeader hdr;
	int rcCount;  // record count
	IndexItem it;
	IndexItem * indexes;
	size_t strAreaSize;
	int i;
	int lastWIndex;  // last written index number
	int rcCountF;  // record count after filted

	fpDictT = fopen(fnDictT, "r");
	if(fpDictT==NULL)
	{
		printf("Cannot open source file [%s].\n", fnDictT);
		return -1;
	}
	fpDict = fopen(fnDict, "wb");
	if(fpDict==NULL)
	{
		printf("Cannot open target file [%s].\n", fnDict);
		fclose(fpDictT);
		return -1;
	}
	fpTemp = tmpfile();
	if(fpTemp==NULL)
	{
		printf("Cannot create temporary file.\n");
		fclose(fpDictT);
		fclose(fpDict);
		return -1;
	}
	lbufSize = LBUF_SIZE;
	lbuf = (char *)malloc(lbufSize*sizeof(char));

	// write target file header
	memset(&hdr, 0, sizeof(hdr));
	strcpy(hdr.maodict, "MAODICT");
	fwrite(&hdr, sizeof(hdr), 1, fpDict);

	printf("Reading and creating data area... ");
	// read all lines
	rcCount = 0;
	while(!feof(fpDictT))
	{
		fgets(lbuf, lbufSize, fpDictT);
		lbufLen = strlen(lbuf);
		// trim the terminal '\n'
		if(lbufLen>0)
			if(lbuf[lbufLen-1]=='\n' || lbuf[lbufLen-1]=='\r')
				lbuf[--lbufLen] = '\0';
		if(lbufLen==0)
			continue;
		it.offset = ftell(fpDict);
		it.length = (char)lbufLen;
		fwrite(lbuf, sizeof(char), lbufLen+1, fpDict);
		fwrite(&it, sizeof(it), 1, fpTemp);
		rcCount++;
		//printf("%d: %d %s\n", rcCount, lbufLen, lbuf);
	}
	printf("done!\n");
	// get the whole string area size
	strAreaSize = ftell(fpDict);
	fclose(fpDictT);
	indexes = (IndexItem *)malloc(rcCount*sizeof(IndexItem));
	fseek(fpTemp, 0, SEEK_SET);
	fread(indexes, sizeof(IndexItem), rcCount, fpTemp);
	fclose(fpTemp);
	// now, only fpDict is open
	// reopen the file for read+write
	fclose(fpDict);
	fpDict = fopen(fnDict, "r+b");

	printf("Sorting and building index table... ");

	buf = (char *)malloc(strAreaSize);
	fseek(fpDict, 0, SEEK_SET);
	fread(buf, 1, strAreaSize, fpDict);

	qsort(indexes, rcCount, sizeof(IndexItem), compareItem);
	printf("done!\n");

	/*
	// output all records for check
	for(i=0; i<rcCount; i++)
	{
		fseek(fpDict, indexes[i].offset, SEEK_SET);
		fread(lbuf, sizeof(char), indexes[i].length, fpDict);
		lbuf[indexes[i].length] = '\0';
		printf("%6d: %s\n", i, lbuf);
	}
	*/

	fseek(fpDict, 0, SEEK_END);
	hdr.so = ftell(fpDict);
	printf("Writing index table... ");
	lastWIndex = -1;
	rcCountF = 0;
	for(i=0; i<rcCount; i++)
	{
		if(lastWIndex != -1)
			if(indexes[i].length == indexes[lastWIndex].length)
				if(strncmp(buf + indexes[i].offset, buf + indexes[lastWIndex].offset, indexes[i].length) == 0)
					continue;
		fwrite(&indexes[i], sizeof(IndexItem), 1, fpDict);
		lastWIndex = i;
		rcCountF++;
	}
	printf("done!\n");
	hdr.eo = ftell(fpDict);
	free(buf);
	
	fseek(fpDict, 0, SEEK_SET);
	fwrite(&hdr, sizeof(hdr), 1, fpDict);

	/*
	fseek(fpDict, hdr.so, SEEK_SET);
	fread(indexes, sizeof(IndexItem), rcCountF, fpDict);
	// output all records for check
	for(i=0; i<rcCountF; i++)
	{
		fseek(fpDict, indexes[i].offset, SEEK_SET);
		fread(lbuf, sizeof(char), indexes[i].length, fpDict);
		lbuf[indexes[i].length] = '\0';
		printf("%s\n", lbuf);
	}
	*/
	fclose(fpDict);

	printf("Dictionary file [%s] is created.\n", fnDict);

	free(indexes);
	free(lbuf);
	return 0;
}

/*
 * dbuf: data area pointer
 * idx:  index area pointer
 * ch:   current character
 * j:    current character's position in word
 * _si, _li: previous range
 * return: 1 - fonnd; 0 - not found
*/
static inline int getCharIndex( const char      * dbuf,
							    const IndexItem * idx, 
							    int ch, 
							    int j, int * _si, int * _li)
{
	int si = *_si;
	int li = *_li;
	int mi;
	int ssi, lli, mmi;

#define GETCH(x) ( (unsigned char)*(dbuf + idx[x].offset + j) )

	if(ch < GETCH(si))
	{
		// above the upper border [not found - case 1]
		*_si = *_li = si;
		return 0;

	}
	else if(ch == GETCH(si))
	{
		// start position
		*_si = si;
		if(ch == GETCH(li))
		{
			// li is just the end
			*_li = li;
			return 1;
		}
		else
		{
			/* ch < GETCH(li) */
			// using binary search, find the end 
			ssi = si;
			lli = li;
			while(lli-ssi>1)
			{
				mmi = (ssi + lli) / 2;
				if(ch < GETCH(mmi))
					lli = mmi;
				else
					ssi = mmi;
			}
			*_li = ssi;
			return 1;
		}

	}
	else
	{
		/* ch > GETCH(si) */

		if(ch > GETCH(li))
		{
			// below the lower border [not found - case 2]
			*_si = *_li = li+1;
			return 0;
		}
		else if(ch == GETCH(li))
		{
			*_li = li;
			// using binary search, find the start
			ssi = si;
			lli = li;
			while(lli-ssi>1)
			{
				mmi = (ssi + lli) / 2;
				if(ch <= GETCH(mmi))
					lli = mmi;
				else
					ssi = mmi;
			}
			*_si = lli;
			return 1;
		}
		else
		{
			/* ch < GETCH(li) */
			// the most common case
			while(li-si>1)
			{
				mi = (si + li) / 2;
				if(ch < GETCH(mi))
					li = mi;
				else if(ch > GETCH(mi))
					si = mi;
				else
				{
					/* == found */

					// search the upper border
					ssi = si;
					lli = mi;
					while(lli-ssi>1)
					{
						mmi = (ssi + lli) / 2;
						if(ch <= GETCH(mmi))
							lli = mmi;
						else
							ssi = mmi;
					}
					*_si = lli;

					// search the lower border
					ssi = mi;
					lli = li;
					while(lli-ssi>1)
					{
						mmi = (ssi + lli) / 2;
						if(ch < GETCH(mmi))
							lli = mmi;
						else
							ssi = mmi;
					}
					*_li = ssi;

					return 1;

				}
			}
			// not included [not found - case 3]
			*_si = *_li = li;
			return 0;

		}

	}

}

static int compareResultNode(const void * a1, const void * a2)
{
	return -( ((ResultNode *)a1)->count - ((ResultNode *)a2)->count );
}

int SearchTextFile(const char * fnDict, FILE * fp)
{
	DictHeader hdr;
	int si, li;
	int rcCount;
	char * dbuf;     // data buffer
	IndexItem * idx; // index buffer
	//unsigned char * lbuf;
	int * cbuf;
	//size_t dbufSize;
	//size_t lbufSize = LBUF_SIZE;
	size_t cbufSize = LBUF_SIZE;
	//int ch;
	int i, j, k;
	int ret;
	int t1, t2, t3;
	size_t cs;   // calculated size
	size_t msizeCount = 0; // malloc size count
	int * * * resultMap = NULL;
	clock_t startClock, finishClock;
	int resultCount = 0;
	ResultNode * resultList = NULL;
	int resultListSize = 1024;
	//ResultNode * * pCurrentRN = &resultList;
	//ResultNode * currentRN;
	// variables for File Mapping
	HFILE  hDictFile;
	HANDLE hMapFile;
	HANDLE hMapView;
	OFSTRUCT opBuf;

	//
	lbuf = (char *)malloc(MAX_PATH);
	cbuf = (int *)malloc(cbufSize);
	
	// 3 steps to map a file: OpenFile, CreateFileMapping, MapViewOfFile
	
		hDictFile = OpenFile(fnDict, &opBuf, OF_READ);
	if(hDictFile == HFILE_ERROR)
	{
		fprintf(stderr, "Cannot open dictionary file [%s].\n", fnDict);
		return -1;
	}

	hMapFile = CreateFileMapping((HANDLE)hDictFile, NULL, PAGE_READONLY, 0, 0, "DictMap");
	if(hMapFile == NULL)
	{
		fprintf(stderr, "Mapping file failed.\n");
		CloseHandle((HANDLE)hDictFile);
		return -1;
	}
	CloseHandle((HANDLE)hDictFile);
	hDictFile = 0;

	hMapView = MapViewOfFile(hMapFile, FILE_MAP_READ, 0, 0, 0);
	if(hMapView == NULL)
	{
		fprintf(stderr, "Mapping view failed.\n");
		CloseHandle(hMapFile);
		return -1;
	}
	// memory - file mapped

	dbuf = (char *)hMapView;
	memcpy(&hdr, dbuf, sizeof(hdr));
	if(strncmp(hdr.maodict, "MAODICT", sizeof(hdr.maodict)) != 0)
	{
		printf("Invalid dictionary format, expecting \"MAODICT\".\n");
		UnmapViewOfFile(hMapView);
		CloseHandle(hMapFile);
		return -1;
	}
	
	idx = (IndexItem *)(dbuf + hdr.so);
	rcCount = (hdr.eo - hdr.so) / sizeof(IndexItem);

	// initialize the first class mapping table
	cs = (1<<SHIFT1_USED)*sizeof(*resultMap);
	resultMap = (int ***)malloc(cs);
	memset(resultMap, NULL, cs);
	msizeCount += cs;

	//fprintf(stderr, "Searching... ");

	// remember when we start
	startClock = clock();

	j = 0;  // word char index
	k = 0;  // number of buffered chars

	do
	{
		j = 0;  // return zero

		si = 0;
		li = rcCount-1;
		for(j=0; ; j++)
		{
			while(k<=j)
				cbuf[k++] = fgetc(fp);
			if(cbuf[j]==EOF)
				break;

			ret = getCharIndex(dbuf, idx, cbuf[j], j, &si, &li);

			//======================================
			// if this is a complete word, add it
			if(ret && j==idx[si].length-1)
			{
				t1 = si >> (SHIFT2+SHIFT3);
				t2 = ((unsigned int)si<<SHIFT1) >> (SHIFT1+SHIFT3);
				t3 = ((unsigned int)si << (SHIFT1+SHIFT2) ) >> (SHIFT1+SHIFT2);
				if(resultMap[t1]==NULL)
				{
					cs = (1<<SHIFT2) * sizeof(resultMap[0][0]);
					resultMap[t1] = (int **)malloc(cs);
					memset(resultMap[t1], NULL, cs);
					msizeCount += cs;
				}
				if(resultMap[t1][t2]==NULL)
				{
					cs = (1<<SHIFT3) * sizeof(resultMap[0][0][0]);
					resultMap[t1][t2] = (int *)malloc(cs);
					memset(resultMap[t1][t2], 0, cs);
					msizeCount += cs;
				}
				//printf("%d: %d %d %d\n", si, t1, t2, t3);
				resultMap[t1][t2][t3] += 1;
			}
			//======================================
			if(!ret)
				break;
			else
			{
				if(li-si==0)
					if(j==idx[si].length-1)
						break;
			}

		}
		// move buffer one step foward (overlapped spaces!!!)
		if(k>1)
			memmove(cbuf, cbuf+1, (k-1)*sizeof(cbuf[0]));
		//printf("%d\n", k);
		k --;
	} while(cbuf[0]!=EOF);

	finishClock = clock();

	//fprintf(stderr, "complete!\n");
	fprintf(stderr, "Processed in %.3fs.\n", double(finishClock-startClock)/CLOCKS_PER_SEC );

	fprintf(stderr, "Totally allocated memory: %.2fKB\n", (double)msizeCount/1024 );

	//================================================
	
	resultList = (ResultNode *)malloc(resultListSize*sizeof(ResultNode) );

	for(i=0; i<(1<<SHIFT1_USED); i++)
		if(resultMap[i])
			for(j=0; j<(1<<SHIFT2); j++)
				if(resultMap[i][j])
					for(k=0; k<(1<<SHIFT3); k++)
						if(resultMap[i][j][k]>0)
						{
							//printf("\"%s\" -- %d\n", dbuf + idx[(i<<(SHIFT2+SHIFT3))|(j<<SHIFT3)|k].offset, resultMap[i][j][k] );

							// add to result list
							if(resultCount >= resultListSize)
							{
								resultListSize *= 2;
								resultList = (ResultNode *)realloc(resultList, resultListSize*sizeof(ResultNode) );
							}
							resultList[resultCount].index = (i<<(SHIFT2+SHIFT3))|(j<<SHIFT3)|k;
							resultList[resultCount].count = resultMap[i][j][k];
							/*
							*pCurrentRN = (ResultNode *)malloc( 1*sizeof(ResultNode) );
							(*pCurrentRN)->index = (i<<(SHIFT2+SHIFT3))|(j<<SHIFT3)|k;
							(*pCurrentRN)->count = resultMap[i][j][k];
							(*pCurrentRN)->next = NULL;
							pCurrentRN = &(*pCurrentRN)->next;
							*/
							resultCount += 1;
						}

	// save them to array
	/*
	ResultNode * * results = (ResultNode * *)malloc( resultCount*sizeof(ResultNode *) );
	j = 0;
	for(currentRN=resultList; currentRN!=NULL; currentRN=currentRN->next)
	{
		results[j++] = currentRN;
	}
	*/

	// sort
	qsort(resultList, resultCount, sizeof(ResultNode), compareResultNode);

	// output result
	//fprintf(stderr, "Result:\n");
	for(j=0; j<resultCount; j++)
	{
		printf("%s -- %d\n", dbuf + idx[resultList[j].index].offset, resultList[j].count );
	}

	free(resultList);
	UnmapViewOfFile(hMapView);

	return 0;
}

int main(int argc, char * argv[])
{
	char * fileName = NULL;
	char opcode = 's';
	int i;
	const char * fnDict = "English.dat";
	//FILE * fp;
	//FILE * fpDict;

	for(i=1; i<argc && i<3; i++)
	{
		if(argv[i][0]=='-')
			opcode = argv[i][1];
		else
			fileName = argv[i];
	}

	// select an operation
	switch(opcode)
	{
	case 'c':
		if(fileName==NULL)
		{
			fprintf(stderr, "File name is needed.\n");
			return -1;
		}
		textToBinaryFile(fileName, ChangeFileExt(fileName, "dat") );
		//textToTrieFile(fileName, ChangeFileExt(fileName, "dat") );
		break;
	case 's':
		if(fileName==NULL)
		{
			fprintf(stderr, "File name is needed.\n");
			return -1;
		}
		SearchTextFile(fileName, stdin);
		break;
	case 'h':
	default:
		printf("Usage:\n");
		printf("  msearch -c <source file>  .... Convert text file to dictionary.\n");
		printf("  msearch <dict file>       .... Input text to search, ended with [Ctrl+Z].\n");
		printf("  msearch -h                .... Print help information.\n");
		printf("Examples:\n");
		printf("  msearch -c English.txt    .... Create English.dat.\n");
		printf("  msearch English.dat <gpl3.txt >result.txt \n"
		       "             .... Search keywords in gpl3.txt and write results to result.txt,\n"
			   "                  using dictionary English.dat\n");
		return -1;
	}

	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -