📄 tokenizer.cpp
字号:
/////////////////////////////////////////////////////////////////////////////// Copyright (c) 2000 Intel Corporation// All rights reserved.//// Redistribution and use in source and binary forms, with or without// modification, are permitted provided that the following conditions are met://// * Redistributions of source code must retain the above copyright notice,// this list of conditions and the following disclaimer.// * Redistributions in binary form must reproduce the above copyright notice,// this list of conditions and the following disclaimer in the documentation// and/or other materials provided with the distribution.// * Neither name of Intel Corporation nor the names of its contributors// may be used to endorse or promote products derived from this software// without specific prior written permission.//// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE./////////////////////////////////////////////////////////////////////////////// $Revision: 1.1.1.1 $// $Date: 2004/10/19 11:54:08 $#include "../../inc/tools/config.h"#ifdef INTERNAL_WEB_SERVER#if EXCLUDE_WEB_SERVER == 0#include <stdio.h>#include <assert.h>#include <string.h>#include <ctype.h>#include <genlib/util/utilall.h>#include <genlib/meta/stream/charreader.h>#include <genlib/net/http/tokenizer.h>#define TOK_CHAR_TAB 0x9#define TOK_CHAR_SPACE 0x20#define TOK_CHAR_CR 0xD#define TOK_CHAR_LF 0xA Token::Token(){}Token::~Token(){}const char* Token::getTokenTypeStr() const{ static const char* Names[] = { "IDENTIFIER", "WHITESPACE", "CRLF", "SEPARATOR", "QUOTED_STRING" }; const int NumItems = sizeof(Names) / sizeof(char*); if ( tokType < 0 || tokType >= NumItems ) return "UNKNOWN TOKEN"; else return Names[ tokType ];}// make it from: this <-> node to this <-> t <-> node void Token::insertAfterSelf( Token* t ){ assert( t != NULL ); t->next = next; t->prev = this; next->prev = t; next = t;}// make it from: node<->this to node <-> t <-> thisvoid Token::insertBeforeSelf( Token* t ){ assert( t != NULL ); t->prev = prev; t->next = this; prev->next = t; prev = t;}// remove self from linked listvoid Token::unlinkSelf(){ prev->next = next; next->prev = prev;}//////////////////////////////////////////// Tokenizerstatic bool IsSeparator( char c ){ static const char *separators = " \t()<>@,;:\\\"/[]?={}"; return strchr( separators, c ) != NULL;}static bool IsControlChar( char c ){ return ( (c >= 0 && c <= 31) || c == 127 );}//static bool IsHexDigit( char c )//{// return ( (c >= '0' && c <= '0') ||// (c >= 'a' && c <= 'f') ||// (c >= 'A' && c <= 'F')// );//}// identifier = any char 0..127 which is not separator or ctrl charstatic bool IsIdentifierChar( char c ){ return (c >= 32 && c <= 126) && (!IsSeparator(c));}// normal quoted string charsstatic bool IsQdText( char c ){ switch (c) { case TOK_CHAR_CR: case TOK_CHAR_LF: case TOK_CHAR_TAB: return true; case '"': return false; } return !IsControlChar(c);}Tokenizer::Tokenizer( CharReader& r ) : reader(r){ head = new Token(); tail = new Token(); if ( head == NULL || tail == NULL ) { throw OutOfMemoryException( "Tokenizer::Tokenizer()" ); } // head is near oldest token; tail near newest // i.e., head points to first item in q; tail last // head <-> item1 <-> item2 <-> tail head->next = tail; head->prev = NULL; tail->next = NULL; tail->prev = head; current = head; listLen = 0; lineNum = 1;}Tokenizer::~Tokenizer(){ deleteUntilSizeEquals( 0 ); delete head; delete tail;}// throws OutOfMemoryExceptionToken* Tokenizer::getToken(){ Token *t; if ( current->next == tail ) { // no more items; read next one t = new Token(); if ( t == NULL ) { throw OutOfMemoryException( "Tokenizer::getToken()" ); } readNextToken( *t ); // insert at end of list tail->insertBeforeSelf( t ); listLen++; current = t; // point to current token // remove extra tokens saved for pushbacks //cleanupOldTokens(); deleteUntilSizeEquals( MAX_PUSHBACKS ); } else { t = current->next; current = current->next; // next item } //DBG( printf("get_token: %s\n", t->s.c_str()); ) return t;}void Tokenizer::pushBack(){ char errbuf[100]; if ( current == head ) { // error sprintf( errbuf, "Extra push back on line %d: Tokenizer::pushback()", lineNum ); throw TokenizerException( errbuf ); } else { // move back one token current = current->prev; // DBG( printf("push_back: %s\n", current->s.c_str()); ) }}int Tokenizer::read( OUT void* buffer, IN int bufsize ){ assert( buffer != NULL ); if ( bufsize <= 0 ) return 0; int addedLen = 0; Token* saveCurrent = current; Token* token; char* buf = (char *)buffer; // add data from any pending tokens while ( current->next != tail ) { int slen; token = current; slen = token->s.length(); if ( addedLen + slen > bufsize ) { // buffer full return addedLen; } // append buf memcpy( &buf[addedLen], token->s.c_str(), slen ); addedLen += slen; current = current->next; // next token } // read rest of data unparsed from stream if ( addedLen < bufsize && !reader.endOfStream() ) { int streamRead; streamRead = reader.read( &buf[addedLen], bufsize - addedLen ); if ( streamRead < 0 ) { current = saveCurrent; // restore tokenizer state return -1; } addedLen += streamRead; } return addedLen;}bool Tokenizer::endOfData() const{ // true if no token data, and no data in stream if ( (current->next == tail) && (reader.endOfStream()) ) return true; return false;}void Tokenizer::deleteUntilSizeEquals( int newSize ){ // precond: // current now points to last item // listLen = num used up tokens in list Token* t; while ( listLen > newSize ) { // delete oldest token t = head->next; t->unlinkSelf(); delete t; listLen--; }} void Tokenizer::readNextToken( Token& tok ){ char c; char errbuf[100]; // any data left in stream? if ( reader.endOfStream() ) { tok.tokType = Token::END_OF_STREAM; return; } c = reader.getChar(); if ( IsIdentifierChar(c) ) { // identifier tok.tokType = Token::IDENTIFIER; tok.s = c; while ( !reader.endOfStream() ) { c = reader.getChar(); if ( !IsIdentifierChar(c) ) { reader.pushBack(); break; } tok.s += c; } } else if ( c == ' ' || c == TOK_CHAR_TAB ) { // space or TAB tok.tokType = Token::WHITESPACE; tok.s = c; while ( !reader.endOfStream() ) { // add rest of whitespace, if any c = reader.getChar(); if ( !(c == ' ' || c == TOK_CHAR_TAB) ) { reader.pushBack(); // return unprocessed char break; } tok.s += c; } } else if ( c == TOK_CHAR_CR ) { // crlf if ( reader.endOfStream() || reader.getChar() != TOK_CHAR_LF ) { sprintf( errbuf, "line %d: Expected LF after CR", lineNum ); throw TokenizerException( errbuf ); } tok.tokType = Token::CRLF; tok.s = "\r\n"; lineNum++; // now in next line } else if ( c == TOK_CHAR_LF ) { tok.tokType = Token::CRLF; tok.s = "\r\n"; lineNum++; } else if ( c == '"' ) { // quoted string tok.tokType = Token::QUOTED_STRING; tok.s = c; bool gotEndQuote = false; while ( !reader.endOfStream() ) { c = reader.getChar(); if ( c == '"' ) { // end of string tok.s += c; gotEndQuote = true; break; } if ( c == '\\' ) { // add \char if ( reader.endOfStream() ) { break; } c = reader.getChar(); // accept ctrl, ascii text // note: deviation from spec, don't add NULL if ( c >= 1 && (unsigned char)c < 127 ) { tok.s += c; } else { // unacceptable char in string sprintf( errbuf, "line %d: unknown char %c (ascii %d)", lineNum, c, int(c) ); throw TokenizerException( errbuf ); } } else if ( IsQdText(c) ) { tok.s += c; } else { // unacceptable char in string sprintf( errbuf, "line %d: unknown char %c (ascii %d)", lineNum, c, int(c) ); throw TokenizerException( errbuf ); } } if ( !gotEndQuote ) { sprintf( errbuf, "line %d: no end quote", lineNum ); throw TokenizerException( errbuf ); } } else if ( IsControlChar(c) ) { // control char tok.tokType = Token::CTRL; tok.s = c; tok.c = c; } else if ( IsSeparator(c) ) { // separator tok.tokType = Token::SEPARATOR; tok.s = c; tok.c = c; } else { sprintf( errbuf, "line %d: unknown char %c (ascii %d)", lineNum, c, int(c) ); throw TokenizerException( errbuf ); }}#endif#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -