📄 sexp.h
字号:
/**
This software and ancillary information (herein called "SOFTWARE")
called Supermon is made available under the terms described
here. The SOFTWARE has been approved for release with associated
LA-CC Number LA-CC 99-51.
Unless otherwise indicated, this SOFTWARE has been authored by an
employee or employees of the University of California, operator of the
Los Alamos National Laboratory under Contract No. W-7405-ENG-36 with
the U.S. Department of Energy. The U.S. Government has rights to use,
reproduce, and distribute this SOFTWARE, and to allow others to do so.
The public may copy, distribute, prepare derivative works and publicly
display this SOFTWARE without charge, provided that this Notice and
any statement of authorship are reproduced on all copies. Neither the
Government nor the University makes any warranty, express or implied,
or assumes any liability or responsibility for the use of this
SOFTWARE.
If SOFTWARE is modified to produce derivative works, such modified
SOFTWARE should be clearly marked, so as not to confuse it with the
version available from LANL.
**/
/** NOTICE: This software is licensed under the GNU Public License, which
is included as LICENSE_GPL in this source distribution. **/
/** NOTE: This library is part of the supermon project, hence the name
supermon above. **/
#ifndef __SEXP_H__
#define __SEXP_H__
#include <stdio.h> /* for BUFSIZ only */
#include "faststack.h"
#include "cstring.h"
/**
** though unlikely to affect performance based on some simple tests
** run recently, this is available for people who wish to disable
** assertion paranoia. beware: if your code has assert() used in it
** and you don't want THOSE assertions disabled, #include <assert.h>
** after the last place sexp.h is included. I put this here to
** avoid the annoyance of "#ifdef .." messes around each assert()
**/
#ifdef _NOASSERTS_
#undef assert
#define assert(x) { }
#endif
/**
* \mainpage A small and quick S-expression parsing library.
*
* \section intro Intro
*
* This library was created to provide s-expression parsing and manipulation
* facilities to C and C++ programs. The primary goals were speed and
* efficiency - low memory impact, and the highest speed we could achieve in
* parsing. Suprisingly, no other libraries on the net were found that
* were not bloated with features or involved embedding a full-fledged
* LISP or Scheme interpreter into our programs. So, this library evolved
* to fill this gap. As such, it does not guarantee that every valid LISP
* expression is parseable (although every parseable expression is valid LISP),
* and many features that are not required aren't implemented. See Rivest's
* S-expression library for an example of a much more featureful library.
*
* What features does this library include? At the heart of the code is a
* continuation-based parser implementing a basic state machine. Continuations
* allow users to accumulate multiple streams of characters, and parse
* each stream simultaneously. A continuation allows the parser to stop
* midstream, start working on a new expression, and return to the first
* without corruption of complex state management in the users code. No
* threads, no forking, nothing more than a data structure that must be passed
* in and captured as data becomes available to parse. Once an expression
* has been parsed, a simple structure is returned that represents
* the "abstract syntax tree" of the parsed expression. For the majority
* of users, the parser and this data structure will be all that they will
* ever need to see. For convenience reasons, other functions such as I/O
* wrappers and AST traversal routines have been included, but they are
* not required if users don't wish to use them.
*
* \section performance Performance Results
*
* The standard benchmark to compare versions of this parser has been the
* "torture.c" file included in the tests/ subdirectory. This file takes
* a large s-expression via STDIN, and parses, unparses, and destroys it
* 1,000 times. The following numbers reflect tests done on a
* representative expression from Supermon, the tool that this code was
* created for.
*
* - <B>v0.1.1</B> : 1.61s
* - <B>v0.1.2</B> : 1.12s
* - <B>v0.2.0</B> : 1.07s
* - <B>v0.2.1</B> : 1.09s
* - <B>v0.2.2</B> : 1.09s
* - <B>v0.2.3</B> : 1.11s
* - <B>v0.3.0</B> : 0.42s
* - <B>v0.3.1</B> : ?.??s
* - <B>v0.3.2</B> : ?.??s
*
* \section license License Information
*
* This software and ancillary information (herein called "SOFTWARE")
* called Supermon is made available under the terms described
* here. The SOFTWARE has been approved for release with associated
* LA-CC Number LA-CC 99-51.
*
* Unless otherwise indicated, this SOFTWARE has been authored by an
* employee or employees of the University of California, operator of the
* Los Alamos National Laboratory under Contract No. W-7405-ENG-36 with
* the U.S. Department of Energy. The U.S. Government has rights to use,
* reproduce, and distribute this SOFTWARE, and to allow others to do so.
* The public may copy, distribute, prepare derivative works and publicly
* display this SOFTWARE without charge, provided that this Notice and
* any statement of authorship are reproduced on all copies. Neither the
* Government nor the University makes any warranty, express or implied,
* or assumes any liability or responsibility for the use of this
* SOFTWARE.
*
* If SOFTWARE is modified to produce derivative works, such modified
* SOFTWARE should be clearly marked, so as not to confuse it with the
* version available from LANL.
*
* This software is licensed under the GNU Public License, which
* is included as LICENSE_GPL in this source distribution.
*
* This library is part of the Supermon project, hence the name
* "Supermon" above. See http://www.acl.lanl.gov/supermon/ for details on
* that project.
*/
/**
* \file sexp.h
*
* \brief API for a small, fast and portable s-expression parser library.
*
* Written by Matt Sottile (matt@lanl.gov), January 2002. See LICENSE
* file for licensing details.
*/
/*==============*/
/* ENUMERATIONS */
/*==============*/
/**
* An element in an s-expression can be one of three types: a <i>value</i>
* represents an atom with an associated text value. A <i>list</i>
* represents an s-expression, and the element contains a pointer to
* the head element of the associated s-expression.
*/
typedef enum {
/**
* An atom of some type. See atom type (aty) field of element structure
* for details as to which atom type this is.
*/
SEXP_VALUE,
/**
* A list. This means the element points to an element representing the
* head of a list.
*/
SEXP_LIST
} elt_t;
/**
* For an element that represents a value, the value can be interpreted
* as a more specific type. A <i>basic</i> value is a simple string with
* no whitespace (and therefore no quotes required). A <i>double quote</i>
* value, or <i>dquote</i>, is one that contains characters (such as
* whitespace) that requires quotation marks to contain the string. A
* <i>single quote</i> value, or <i>squote</i>, represents an element that is
* prefaced with a single tick-mark. This can be either an atom or
* s-expression, and the result is that the parser does not attempt to parse
* the element following the tick mark. It is simply stored as text. This
* is similar to the meaning of a tick mark in the Scheme or LISP family
* of programming languages.
*/
typedef enum {
/**
* Basic, unquoted value.
*/
SEXP_BASIC,
/**
* Single quote (tick-mark) value - contains a string representing
* a non-parsed portion of the s-expression.
*/
SEXP_SQUOTE,
/**
* Double-quoted string. Similar to a basic value, but potentially
* containing white-space.
*/
SEXP_DQUOTE,
/**
* Binary data. This is used when the specialized parser is active
* and supports inlining of binary blobs of data inside an expression.
*/
SEXP_BINARY
} atom_t;
/*============*/
/* STRUCTURES */
/*============*/
/**
* An s-expression is represented as a linked structure of elements,
* where each element is either an <I>atom</I> or <I>list</I>. An
* atom corresponds to a string, while a list corresponds to an
* s-expression. The following grammar represents our definition of
* an s-expression:<P>
*
* <pre>
* sexpr ::= ( sx )
* sx ::= atom sxtail | sexpr sxtail | 'sexpr sxtail | 'atom sxtail | NULL
* sxtail ::= sx | NULL
* atom ::= quoted | value
* quoted ::= "ws_string"
* value ::= nws_string
* </pre>
* <P>
*
* An atom can either be a quoted string, which is a string containing
* whitespace (possibly) surrounded by double quotes, or a non-whitespace
* string that does not require surrounding quotes. An element representing
* an atom will have a type of <i>value</i> and data stored in the <i>val</i>
* field. An element of type <i>list</i> represents an s-expression
* corresponding to <i>sexpr</i> in the grammar, and will have a pointer to
* the head of the appropriate s-expression. Details regarding these fields
* and their values given with the fields themselves. Notice that a single
* quote can appear directly before an s-expression or atom, similar to the
* use in LISP.
*/
typedef struct elt {
/**
* The element has a type that determines how the structure is used. If
* the type is <B>SEXP_VALUE</B>, then a programmer knows that the val field
* is meaningful and contains the data associated with this element of the
* s-expression. If the type is <B>SEXP_LIST</B>, then the list field
* contains a pointer to the s-expression element representing the head of
* the list. For each case, the field for the opposite case contains no
* meaningful data and using them in any way is likely to cause an error.
*/
elt_t ty;
/**
* If the type of the element is <B>SEXP_VALUE</B> this field will contain
* the actual data represented by this element.
*/
char *val;
/**
* Number of bytes allocated for val.
*/
int val_allocated;
/**
* Number of bytes used in val (<= val_allocated).
*/
int val_used;
/**
* If the type of the element is <B>SEXP_LIST</B>, this field will contain
* a pointer to the head element of the list.
*/
struct elt *list;
/**
* The <I>next</I> field is a pointer to the next element in the current
* expression. If this element is the last element in the s-expression,
* this field will be <I>NULL</I>.
*/
struct elt *next;
/**
* For elements that represent <I>values</I>, this field will specify the
* specific type of value that it represents. This can be used by
* functions to determine how this value should be printed (ie: how it
* should be quoted) or interpreted (ie: interpreting s-expressions that
* are prefixed with a tick-mark.).
*/
atom_t aty;
/**
* For elements that represent <i>binary</I> blobs, this field will
* point to a memory location where the data resides. The length
* of this memory blob is the next field. char* implies byte sized
* elements.
*/
char *bindata;
/**
* The length of the data pointed at by bindata in bytes.
*/
unsigned int binlength;
} sexp_t;
/**
* parser mode flag used by continuation to toggle special parser
* behaviour.
*/
typedef enum {
/**
* normal (LISP-style) s-expression parser behaviour.
*/
PARSER_NORMAL,
/**
* treat atoms beginning with #b# as inlined binary data. everything
* else is treated the same as in PARSER_NORMAL mode.
*/
PARSER_INLINE_BINARY
} parsermode_t;
/**
* A continuation is used by the parser to save and restore state between
* invocations to support partial parsing of strings. For example, if we
* pass the string "(foo bar)(goo car)" to the parser, we want to be able
* to retrieve each s-expression one at a time - it would be difficult to
* return all s-expressions at once without knowing how many there are in
* advance (this would require more memory management than we want...).
* So, by using a continuation-based parser, we can call it with this string
* and have it return a continuation when it has parsed the first
* s-expression. Once we have processed the s-expression (accessible
* through the <i>last_sexpr</i> field of the continuation), we can call
* the parser again with the same string and continuation, and it will be
* able to pick up where it left off.<P>
*
* We use continuations instead of a state-ful parser to allow multiple
* concurrent strings to be parsed by simply maintaining a set of
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -