⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parser.c

📁 a meta search engine
💻 C
字号:
#include <stdio.h>#include <string.h>																								// per strlen, strncpy#include <strings.h>																								// per bzero#include <stdlib.h>																								// per malloc#include "macro.h"#include "core.h"#include "parser.h"#include "utils.h"#include "searchengines.h"#include "bench.h"#include "cmdline.h"// clearParserInformations, clearNewEntry, clearAboutCornet -------------------// Reset the PARSER or the NEWENTRY or the ABOUTCOUNTER structures and pointers// ----------------------------------------------------------------------------void clearParserInformations(struct engineWork *e) {	e->parser.ic = 0;	e->parser.s = NULL;}void clearNewEntry(struct engineWork *e) {	if (e->entry.url!=NULL) 	{free(e->entry.url);		e->entry.url = NULL;}	if (e->entry.title!=NULL)	{free(e->entry.title);	e->entry.title = NULL;}	if (e->entry.text!=NULL)	{free(e->entry.text);	e->entry.text = NULL;}}void clearFoundCounter(struct engineWork *e) {	if (e->entry.found!=NULL)	{free(e->entry.found);	e->entry.found = NULL;}}// ----------------------------------------------------------------------------// printGrabbedEntry ----------------------------------------------------------// Print grabbed data, until now.// ----------------------------------------------------------------------------void printGrabbedEntry(struct engineWork *e) {		if (config.parser_print) {		printf("<URL name=\"%s\"/>\n", e->entry.url);		printf("\t<TITLE label=\"%s\"/>\n",e->entry.title);		printf("\t<SNIPPET content=\"%s\"/>\n",e->entry.text);		printf("\t<ENGINE name=\"%s\" rank=\"%d\"/>\n", e->info->name, e->entry.rank);		printf("</URL>\n");	}		stats.prs_result++;																							// increase parsed result counter	e->entry.rank++;																								// increase the rank of next NEWENTRY	e->parsed++;																									// increase number of parsed entries	clearNewEntry(e);																								// free mem occupied by old NEWENTRY}// 
----------------------------------------------------------------------------// depureString ---------------------------------------------------------------// Move  the  given  pointer  to a new string cleaned by HTML TAGs. It free the// memory pointed and occupied by the old string.// ----------------------------------------------------------------------------void depureString(char **str) {	char *t, *s = *str;																							// easy pointer to the string	short a=0,b=0;																									// temporary counter variables	char skip='\0';		t = cmalloc(strlen(s)+1);																					// allocate memory for the new string		while (a<strlen(s)) {		if (skip && s[a]!=skip) {a++; continue;}															// if SKIP char not found, go to next		if (skip && s[a]==skip) {a++; skip='\0'; continue;}											// if SKIP char found, analyze		if (s[a]=='<') {skip='>'; a++; continue;}															// if '<' found, look for '>'		if (s[a]=='&') {skip=';'; a++; continue;}															// if '&' found, look for ';'		if (s[a]=='\n') {a++; continue; }																	// skip '\n'		t[b]=s[a];																									// if it's a normal char, collect it		a++; b++;	}		*str = strndup(t, strlen(t));																				// alloc & point to exact required mem	free(t); free(s);}// ----------------------------------------------------------------------------// cmdUntil -------------------------------------------------------------------// Divide the given string STR into token, using the delimiter DELIM, and after// look into the buffer for first of them, grabbing text until it's position.// ----------------------------------------------------------------------------int cmdUntil(char *str, struct engineWork *e) {	char delim = '|';																								// delimiter characters	short len, j;	char *tmp = strdup(str);																					// duplicate of str, needed for strsep	char *t = tmp;																									// temporary pointer, needed for strsep	char *k, *q, *p=NULL;																	
					// temporary vars, and first match ptr		while (t!=NULL) {		k = strsep2(&t, delim);																					// extract first token		q = strstr(e->ib_next, k);																				// look if it's present in the buffer		if (q==NULL) continue;																					// if not found, try with the next		if (p==NULL || q<p) {p=q; j=strlen(k);}															// if it's the closest, update "p" & "j"	}	free(tmp);	if (p==NULL) return 0;																						// if no-one found, retry	len = p - e->ib_next;																						// calculate the length of the GRAB	t = strndup(e->ib_next, len);																				// copy GRAB in a safe place & save ptr	strAttach(e->parser.s, t);																					// copy it in the memory	free(t);	e->ib_next += len + j;																						// move the NEXT pointer forward	return 1;}// ----------------------------------------------------------------------------// cmdIf ----------------------------------------------------------------------// Extract  first  number  L,  and  pattern T in the given STR string. Look for// the  pattern  P  into  the buffer. 
Skip follow instruction if it's not found// in the range L.// ----------------------------------------------------------------------------int cmdIf(char *str, struct engineWork *e) {	char *p, *t;	short m = strlen(e->ib_next);	short l = atoi(str);	short d;		t = strchr(str, ' ') + 1;	p = strstr(e->ib_next, t);																					// look into buffer for pattern	if (p==NULL) {		if (m<l) return 0;																						// if !found but !enought buffer, retry	} else {		d = (p - e->ib_next);																					// calculate the distance		if (d<l) return 1;																						// if found and (position<range), ok	}		return 2;																										// if !found or out of range, skip/*		l = atoi(strsep2(&t, ' '));																				// extract the search RANGE	p = strstr(e->ib_next, t);																					// extract the pattern to look for	if (p==NULL && m>=l) {free(tmp); return 2;}															// if !found & enought data, skip inst	if (p==NULL && m<l) {free(tmp); return 0;}															// if !found but !enought data, retry	if ((p - e->ib_next)<l) k=1; else k=2;																	// if !found within the RANGE skip inst	free(tmp);*/}// ----------------------------------------------------------------------------// changePtr ------------------------------------------------------------------// ----------------------------------------------------------------------------int changePtr(char *p, struct engineWork *e) {	if (strcmp(p, "URL")==0)	{e->parser.s = &e->entry.url;	 	return 1;}	if (strcmp(p, "TITLE")==0)	{e->parser.s = &e->entry.title;	return 1;}	if (strcmp(p, "TEXT")==0)	{e->parser.s = &e->entry.text;	return 1;}	if (strcmp(p, "FOUND")==0)	{e->parser.s = &e->entry.found;	return 1;}	printf("Malformed PARSER SCRIPT, grab to <unknown>! Exit!\n");	exit(-1);}// ----------------------------------------------------------------------------// executeCmd -----------------------------------------------------------------// Execute the given PARSER COMMAND. 
Accepted commands are//    l = LOOK FOR <tag>//    + = MOVE FORWARD OF <n>//    - = MOVE BACKWARD OF <n>//    g = GRAB <target>//    u = UNTIL <tag>//    c = CLEAN//    j = JUMP TO <instruction>//    w = WRITE <text>//    i = IF <range> <tag>//    p = PRINT// ----------------------------------------------------------------------------char executeCmd(char *cmd, struct engineWork *e) {	char *p;		if (cmd==NULL) {printf("Malformed PARSER SCRIPT, inexpected reach of end! Exit!\n"); exit(-1);}		switch(cmd[0]) {		case 'l':																									// LOOK FOR <tag>			p = strstr(e->ib_next, cmd+2);																	// calculate position of <tag>			if (p==NULL) return 0;																				// if not found, abort			e->ib_next = p;																						// move the NEXT pointer to <tag>			return 1;			break;			case '+':																									// MOVE FORWARD OF <n>			e->ib_next += atoi(cmd+2);																			// move the NEXT pointer of <n> chars			return 1;			break;					case '-':																									// MOVE BACKWARD OF <n>			e->ib_next -= atoi(cmd+2);																			// move NEXT pointer back of <n> chars			return 1;			break;					case 'g':																									// GRAB <target>			return changePtr(cmd+2, e);			break;					case 'u':																									// UNTIL <tag>			return cmdUntil(cmd+2, e);			break;		case 'i':																									// IF <range> <pattern>			return cmdIf(cmd+2, e);			break;						case 'c':																									// CLEAN			depureString(e->parser.s);																			// clean the last grabbed string			return 1;			break;			case 'j':																									// JUMP <instruction>			e->parser.ic = atoi(cmd+2)-1;																		// set parses IC to the given instr.			
return 1;			break;		case 'p':																									// PRINT			printGrabbedEntry(e);																				// print what grabbed until now			return 1;			break;				case 'w':																									// WRITE <text>			strAttach(e->parser.s, cmd+2);																	// attach to <target> the given <text>			return 1;			break;		case 'q':																									// WRITE <text>			setNoInterest(e);			return 1;			break;							default:			printf("Malformed PARSER SCRIPT COMMAND '%s'! Exit!\n",cmd);			exit(-1);			break;	}	return 0;}// ----------------------------------------------------------------------------// nextCmd --------------------------------------------------------------------// Return next command ready to be executed// ----------------------------------------------------------------------------char *nextCmd(struct engineWork  *e) {	return e->info->parser_cmd[e->parser.ic];																// return NEXT COMMAND to execute}// ----------------------------------------------------------------------------// parse ----------------------------------------------------------------------// Execute the parsing SCRIPT// ----------------------------------------------------------------------------void parse(struct engineWork *e) {	char t=1;	bench(0);	while (t) {		t = executeCmd(nextCmd(e), e);																		// execute NEXT CMD and save result		e->parser.ic += t;																						// increment the IC if done	}	stats.prs_time += bench(0);}// ----------------------------------------------------------------------------

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -