⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 d1_absyn.c

📁 harvest是一个下载HTML网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
        if (file_must_exist)            return 0;    }        res = (data1_absyn *) nmem_malloc(data1_nmem_get(dh), sizeof(*res));    res->name = 0;    res->reference = VAL_NONE;    res->tagset = 0;    res->encoding = 0;    res->enable_xpath_indexing = (f ? 0 : 1);    res->systags = 0;    systagsp = &res->systags;    tagset_childp = &res->tagset;    res->attset = data1_empty_attset (dh);    attset_childp =  &res->attset->children;    res->varset = 0;    res->esetnames = 0;    esetpp = &res->esetnames;    res->maptabs = 0;    maptabp = &res->maptabs;    res->marc = 0;    marcp = &res->marc;    res->sub_elements = NULL;    res->main_elements = NULL;    res->xp_elements = NULL;        while (f && (argc = read_absyn_line(f, &lineno, line, 512, argv, 50)))    {	char *cmd = *argv;	if (!strcmp(cmd, "elm") || !strcmp(cmd, "element"))	{	    data1_element *new_element;	    int i;	    char *p, *sub_p, *path, *name, *termlists;	    int type, value;	    data1_termlist **tp;	    if (argc < 4)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to elm", file, lineno);		continue;	    }	    path = argv[1];	    name = argv[2];	    termlists = argv[3];	    if (!cur_elements)	    {                cur_elements = (data1_sub_elements *)		    nmem_malloc(data1_nmem_get(dh), sizeof(*cur_elements));	        cur_elements->next = res->sub_elements;		cur_elements->elements = NULL;		cur_elements->name = "main";		res->sub_elements = cur_elements;				level = 0;    		ppl[level] = &cur_elements->elements;            }	    p = path;	    for (i = 1;; i++)	    {		char *e;		if ((e = strchr(p, '/')))		    p = e+1;		else		    break;	    }	    if (i > level+1)	    {		yaz_log(LOG_WARN, "%s:%d: Bad level increase", file, lineno);		fclose(f);		return 0;	    }	    level = i;	    new_element = *ppl[level-1] = (data1_element *)		nmem_malloc(data1_nmem_get(dh), sizeof(*new_element));	    new_element->next = new_element->children = 0;	    new_element->tag = 0;	    new_element->termlists = 0;	    new_element->sub_name 
= 0;	    	    tp = &new_element->termlists;	    ppl[level-1] = &new_element->next;	    ppl[level] = &new_element->children;	    	    /* consider subtree (if any) ... */	    if ((sub_p = strchr (p, ':')) && sub_p[1])	    {		*sub_p++ = '\0';		new_element->sub_name =		    nmem_strdup (data1_nmem_get(dh), sub_p);			    }	    /* well-defined tag */	    if (sscanf(p, "(%d,%d)", &type, &value) == 2)	    {		if (!res->tagset)		{		    yaz_log(LOG_WARN, "%s:%d: No tagset loaded", file, lineno);		    fclose(f);		    return 0;		}		if (!(new_element->tag = data1_gettagbynum (dh, res->tagset,							    type, value)))		{		    yaz_log(LOG_WARN, "%s:%d: Couldn't find tag %s in tagset",			 file, lineno, p);		    fclose(f);		    return 0;		}	    }	    /* private tag */	    else if (*p)	    {		data1_tag *nt =		    new_element->tag = (data1_tag *)		    nmem_malloc(data1_nmem_get (dh),				sizeof(*new_element->tag));		nt->which = DATA1T_string;		nt->value.string = nmem_strdup(data1_nmem_get (dh), p);		nt->names = (data1_name *)		    nmem_malloc(data1_nmem_get(dh), 				sizeof(*new_element->tag->names));		nt->names->name = nt->value.string;		nt->names->next = 0;		nt->kind = DATA1K_string;		nt->next = 0;		nt->tagset = 0;	    }	    else	    {		yaz_log(LOG_WARN, "%s:%d: Bad element", file, lineno);		fclose(f);		return 0;	    }	    /* parse termList definitions */	    p = termlists;	    if (*p != '-')	    {		assert (res->attset);				if (parse_termlists (dh, &tp, p, file, lineno, name, res, 0))		{		    fclose (f);		    return 0;		}	        *tp = all; /* append any ALL entries to the list */	    }	    new_element->name = nmem_strdup(data1_nmem_get (dh), name);	}	/* *ostrich*	   New code to support xelm directive	   for each xelm a dfa is built. xelms are stored in res->xp_elements           	   maybe we should use a simple sscanf instead of dfa?           	   pop, 2002-12-13	   Now [] predicates are supported. regexps and xpath structure is	   a bit redundant, however it's comfortable later...	   
pop, 2003-01-17	*/	else if (!strcmp(cmd, "xelm")) {	    int i;	    char *p, *xpath_expr, *termlists;	    const char *regexp;	    struct DFA *dfa = dfa = dfa_init();	    data1_termlist **tp;            	    if (argc < 3)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to xelm", file, lineno);		continue;	    }	    xpath_expr = argv[1];	    termlists = argv[2];	    regexp = mk_xpath_regexp(dh, xpath_expr);	    i = dfa_parse (dfa, &regexp);	    if (i || *regexp) {                yaz_log(LOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno);                dfa_delete (&dfa);                continue;	    }            	    if (!cur_xpelement)	    {                cur_xpelement = (data1_xpelement *)		    nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement));		res->xp_elements = cur_xpelement;            } else {                cur_xpelement->next = (data1_xpelement *)                    nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement));                cur_xpelement = cur_xpelement->next;	    }	    cur_xpelement->next = NULL;	    cur_xpelement->xpath_expr = nmem_strdup(data1_nmem_get (dh), 						    xpath_expr); 	    	    dfa_mkstate (dfa);	    cur_xpelement->dfa = dfa;#ifdef ENHANCED_XELM             cur_xpelement->xpath_len =                zebra_parse_xpath_str(xpath_expr,                                       cur_xpelement->xpath, XPATH_STEP_COUNT,                                      data1_nmem_get(dh));            	    /*	    dump_xp_steps(cur_xpelement->xpath,cur_xpelement->xpath_len);	    */#endif	    cur_xpelement->termlists = 0;	    tp = &cur_xpelement->termlists;            	    /* parse termList definitions */	    p = termlists;	    if (*p != '-')	    {		assert (res->attset);				if (parse_termlists (dh, &tp, p, file, lineno,                                     xpath_expr, res, 1))		{		    fclose (f);		    return 0;		}	        *tp = all; /* append any ALL entries to the list */	    }	} 	else if (!strcmp(cmd, "section"))	{	    char *name;	    	    if (argc < 
2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to section",                        file, lineno);		continue;	    }	    name = argv[1];	                cur_elements = (data1_sub_elements *)		nmem_malloc(data1_nmem_get(dh), sizeof(*cur_elements));	    cur_elements->next = res->sub_elements;	    cur_elements->elements = NULL;	    cur_elements->name = nmem_strdup (data1_nmem_get(dh), name);	    res->sub_elements = cur_elements;	    	    level = 0;    	    ppl[level] = &cur_elements->elements;	}        else if (!strcmp(cmd, "xpath"))        {            if (argc != 2)            {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to 'xpath' directive",		     file, lineno);		continue;            }            if (!strcmp(argv[1], "enable"))                res->enable_xpath_indexing = 1;            else if (!strcmp (argv[1], "disable"))                res->enable_xpath_indexing = 0;            else            {		yaz_log(LOG_WARN, "%s:%d: Expecting disable/enable "                        "after 'xpath' directive", file, lineno);            }        }	else if (!strcmp(cmd, "all"))	{	    data1_termlist **tp = &all;	    if (all)	    {		yaz_log(LOG_WARN, "%s:%d: Too many 'all' directives - ignored",		     file, lineno);		continue;	    }	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to 'all' directive",		     file, lineno);		continue;	    }	    if (parse_termlists (dh, &tp, argv[1], file, lineno, 0, res, 0))	    {		fclose (f);		return 0;	    }	}	else if (!strcmp(cmd, "name"))	{	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to name directive",		     file, lineno);		continue;	    }	    res->name = nmem_strdup(data1_nmem_get(dh), argv[1]);	}	else if (!strcmp(cmd, "reference"))	{	    char *name;	    	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to reference",		     file, lineno);		continue;	    }	    name = argv[1];	    if ((res->reference = oid_getvalbyname(name)) == VAL_NONE)	    {		yaz_log(LOG_WARN, "%s:%d: Unknown tagset ref 
'%s'", 		     file, lineno, name);		continue;	    }	}	else if (!strcmp(cmd, "attset"))	{	    char *name;	    data1_attset *attset;	    	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to attset",		     file, lineno);		continue;	    }	    name = argv[1];	    if (!(attset = data1_get_attset (dh, name)))	    {		yaz_log(LOG_WARN, "%s:%d: Couldn't find attset  %s",		     file, lineno, name);		continue;	    }	    *attset_childp = (data1_attset_child *)		nmem_malloc (data1_nmem_get(dh), sizeof(**attset_childp));	    (*attset_childp)->child = attset;	    (*attset_childp)->next = 0;	    attset_childp = &(*attset_childp)->next;	}	else if (!strcmp(cmd, "tagset"))	{	    char *name;	    int type = 0;	    if (argc < 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args to tagset",		     file, lineno);		continue;	    }	    name = argv[1];	    if (argc == 3)		type = atoi(argv[2]);	    *tagset_childp = data1_read_tagset (dh, name, type);	    if (!(*tagset_childp))	    {		yaz_log(LOG_WARN, "%s:%d: Couldn't load tagset %s",		     file, lineno, name);		continue;	    }	    tagset_childp = &(*tagset_childp)->next;	}	else if (!strcmp(cmd, "varset"))	{	    char *name;	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args in varset",		     file, lineno);		continue;	    }	    name = argv[1];	    if (!(res->varset = data1_read_varset (dh, name)))	    {		yaz_log(LOG_WARN, "%s:%d: Couldn't load Varset %s",		     file, lineno, name);		continue;	    }	}	else if (!strcmp(cmd, "esetname"))	{	    char *name, *fname;	    if (argc != 3)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args in esetname",		     file, lineno);		continue;	    }	    name = argv[1];	    fname = argv[2];	    	    *esetpp = (data1_esetname *)		nmem_malloc(data1_nmem_get(dh), sizeof(**esetpp));	    (*esetpp)->name = nmem_strdup(data1_nmem_get(dh), name);	    (*esetpp)->next = 0;	    if (*fname == '@')		(*esetpp)->spec = 0;	    else if (!((*esetpp)->spec = data1_read_espec1 (dh, fname)))	    {		
yaz_log(LOG_WARN, "%s:%d: Espec-1 read failed for %s",		     file, lineno, fname);		continue;	    }	    esetpp = &(*esetpp)->next;	}	else if (!strcmp(cmd, "maptab"))	{	    char *name;	    	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # of args for maptab",                     file, lineno);		continue;	    }	    name = argv[1];	    if (!(*maptabp = data1_read_maptab (dh, name)))	    {		yaz_log(LOG_WARN, "%s:%d: Couldn't load maptab %s",                     file, lineno, name);		continue;	    }	    maptabp = &(*maptabp)->next;	}	else if (!strcmp(cmd, "marc"))	{	    char *name;	    	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # or args for marc",		     file, lineno);		continue;	    }	    name = argv[1];	    if (!(*marcp = data1_read_marctab (dh, name)))	    {		yaz_log(LOG_WARN, "%s:%d: Couldn't read marctab %s",                     file, lineno, name);		continue;	    }	    marcp = &(*marcp)->next;	}	else if (!strcmp(cmd, "encoding"))	{	    if (argc != 2)	    {		yaz_log(LOG_WARN, "%s:%d: Bad # or args for encoding",		     file, lineno);		continue;	    }            res->encoding = nmem_strdup (data1_nmem_get(dh), argv[1]);	}        else if (!strcmp(cmd, "systag"))        {            if (argc != 3)            {		yaz_log(LOG_WARN, "%s:%d: Bad # or args for systag",		     file, lineno);		continue;            }            *systagsp = nmem_malloc (data1_nmem_get(dh), sizeof(**systagsp));            (*systagsp)->name = nmem_strdup(data1_nmem_get(dh), argv[1]);            (*systagsp)->value = nmem_strdup(data1_nmem_get(dh), argv[2]);            systagsp = &(*systagsp)->next;        }	else	{	    yaz_log(LOG_WARN, "%s:%d: Unknown directive '%s'", file,                     lineno, cmd);	    continue;	}    }    if (f)        fclose(f);        for (cur_elements = res->sub_elements; cur_elements;	 cur_elements = cur_elements->next)    {	if (!strcmp (cur_elements->name, "main"))	    res->main_elements = cur_elements->elements;	fix_element_ref (dh, res, 
cur_elements->elements);    }    *systagsp = 0;    yaz_log (LOG_DEBUG, "%s: data1_read_absyn end", file);    return res;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -