📄 regexx.cpp

📁 Shorthand是一个强大的脚本语言
💻 CPP
字号:
/////////////////////////////////////////////////////////////////////////////
// $Header: /shorthand/src/regexx.cpp 3     8/28/02 6:27a Arm $
//---------------------------------------------------------------------------
// This file is part of "libAndrix" library - a collection of classes
// and functions developed by Andrei Remenchuk.
//---------------------------------------------------------------------------
// While you may own complete copyright on the project with which you have
// received this file, the author reserves the right to use code contained
// in this very file for any purposes, including publishing and usage in
// any free or commercial software.
//
// You may re-distribute this file or re-use it in your own free or
// commercial software provided that this text is included in the file.
// If you change this file you must include clear notice stating that
// you changed this file and the date of change.
//
// This statement doesn't apply to other files that are part of the same
// package unless otherwise noted.
//---------------------------------------------------------------------------
// (c) 1998-2002 Andrei Remenchuk <andrei@remenchuk.com>
//---------------------------------------------------------------------------
// regexx.cpp - C++ wrapper of GNU regex library
/////////////////////////////////////////////////////////////////////////////
#include "regexx.h"

#include <malloc.h>
#include <string.h>
#include "regex.h"

// Typed views of the m_pattern_buffer / m_registers members, which
// RX::ctor() allocates with malloc(sizeof(re_pattern_buffer)) and
// malloc(sizeof(re_registers)) respectively.
#define m_buffer  ((re_pattern_buffer*)m_pattern_buffer)
#define m_regs  ((re_registers*)m_registers)

// GNU regex syntax bits re-exported under RXX_ names.
// NOTE(review): presumably so that callers can select a syntax without
// including regex.h themselves - confirm against regexx.h.
const int RXX_NO_BK_PARENS = RE_NO_BK_PARENS;
const int RXX_NO_BK_VBAR   = RE_NO_BK_VBAR;
const int RXX_INTERVALS    = RE_INTERVALS;
const int RXX_NO_BK_BRACES = RE_NO_BK_BRACES;



/**
 * Constructs a regular expression from pattern text and compiles it.
 *
 * parameters:
 *    pattern [IN] regular expression source text.
 *    options [IN] GNU regex syntax bits. Zero selects the default syntax:
 *                 RE_NO_BK_PARENS | RE_CHAR_CLASSES | RE_NO_BK_VBAR |
 *                 RE_DOT_NEWLINE.
 *
 * Propagates the exception thrown by compile() when the pattern cannot be
 * compiled. Because the destructor does not run for an object whose
 * constructor throws, the buffers obtained by ctor() are released here
 * before the exception leaves the constructor.
 */
RX::RX(const char* pattern, int options)
: m_pattern(pattern), m_syntax_options(options)
{
    if (options == 0)
    {
        m_syntax_options = (RE_NO_BK_PARENS|RE_CHAR_CLASSES|RE_NO_BK_VBAR|RE_DOT_NEWLINE);
    }
    ctor();
    try
    {
        compile();
    }
    catch (...)
    {
        // Same cleanup sequence as ~RX(); without it a bad pattern would
        // leak the pattern buffer and the register set.
        shrink();
        free(m_pattern_buffer);
        free(m_registers);
        throw;
    }
}

/**
 * Copy constructor: duplicates the pattern text and syntax options of rx,
 * then builds its own regex structures by compiling the pattern again.
 */
RX::RX(const RX& rx)
    : m_pattern(rx.m_pattern),
      m_syntax_options(rx.m_syntax_options)
{
    ctor();     // allocate zeroed regex structures for this instance
    compile();  // compile the copied pattern into them
}

/**
 * Returns a copy of this object allocated with new (via the copy
 * constructor).
 */
RX* RX::clone() const
{
    RX* duplicate = new RX(*this);
    return duplicate;
}

/**
 * Allocates the pattern buffer and the register set with malloc() and
 * zero-fills both. Whatever m_pattern_buffer / m_registers held before
 * is overwritten without being freed.
 */
void RX::ctor()
{
    const size_t buffer_bytes = sizeof(re_pattern_buffer);
    const size_t regs_bytes   = sizeof(re_registers);

    m_pattern_buffer = malloc(buffer_bytes);
    m_registers      = malloc(regs_bytes);

    memset(m_pattern_buffer, 0, buffer_bytes);
    memset(m_registers, 0, regs_bytes);
}


/**
 * Takes over the pattern text and syntax options of rx and recompiles
 * this object with them.
 */
void RX::aquire(const RX& rx)
{
    // The two copies are independent of each other.
    m_syntax_options = rx.m_syntax_options;
    m_pattern = rx.m_pattern;

    compile();
}

/**
 * (Re)compiles m_pattern using m_syntax_options.
 *
 * Any previously compiled state is discarded through shrink() first.
 * Note that the syntax is selected through the global re_syntax_options
 * variable of the regex library, which is left set after the call.
 *
 * return value:
 *    always true; failure is reported by exception.
 * throws:
 *    ShhObjectException* (code 1904) carrying the message returned by
 *    re_compile_pattern() when the pattern is rejected.
 */
bool RX::compile()
{
    shrink();
    m_buffer->regs_allocated = REGS_UNALLOCATED;
    ::re_syntax_options = m_syntax_options;
    
    const char* err = re_compile_pattern(m_pattern, m_pattern.length(), m_buffer);
    if (err != NULL)
    {
        // rx compilation failed
        throw new ShhObjectException(1904, "%s", err);
    }

    // fastmap is a must for matching long strings, but it is only an
    // accelerator: when the allocation fails the field stays NULL and
    // searching proceeds without it instead of writing through NULL.
    char* fastmap = (char*) malloc(256);
    if (fastmap != NULL)
    {
        memset(fastmap, 0, 256);
    }
    m_buffer->fastmap = fastmap;

    return true;
}


/**
 * Searches for the regular expression within a string.
 *
 * When a match is found the match registers are filled in, so that
 * sub-patterns can be retrieved by calling submatch().
 *
 * parameters:
 *    s         [IN] the text to search.
 *    size      [IN] length of the text.
 *    start     [IN] index at which the search begins; size-start is
 *                   passed to re_search() as the search range.
 *    make_copy [IN] when true, s is copied into the internal content
 *                   string used by submatch(index, ...); when false
 *                   that string is cleared.
 * return value:
 *    the index within the string where the match ended. This index can
 *    be used as the 'start' parameter of the next call to search().
 *    Returns -1 if no match has been found.
 * throws:
 *    ShhObjectException* (code 9102) when re_search() reports -2.
 */
int RX::search(const char* s, int size, int start, bool make_copy )
{
    const int hit = re_search(m_buffer, s, size, start, size-start, m_regs);

    switch (hit)
    {
    case -2:
        throw new ShhObjectException(9102, "internal error in rx search");
    case -1:
        return hit;
    default:
        break;
    }

    if (!make_copy)
        m_content.clear();
    else
        m_content = s;

    return m_regs->end[0];
}


/**
 * Same as search(), except that the -2 result of re_search() is returned
 * to the caller instead of being turned into an exception.
 *
 * return value:
 *    end index of the match, -1 when nothing matched, -2 when
 *    re_search() reported -2.
 */
int RX::search_no_exception(const char* s, int size, int start, bool make_copy)
{
    const int hit = re_search(m_buffer, s, size, start, size-start, m_regs);

    // Both negative codes are handed back unchanged.
    if (hit == -2 || hit == -1)
        return hit;

    if (!make_copy)
        m_content.clear();
    else
        m_content = s;

    return m_regs->end[0];
}


/**
 * Replaces every occurence of the pattern within the string by fixed string
 */
int RX::replace(const char* source, const char* replacement, string& target)
{
    int off = 0, last_off = 0;
    int source_length = strlen(source);
    int count = 0;

    target.clear();
    while((off = search(source, source_length, last_off, false)) != -1)
    {
        if (off > last_off)
        {
            target.append(source + last_off, off - last_off);
        }
        target.append(replacement);
        count++;
        last_off = off;
    }   
    if (last_off < source_length)
    {
        target.append(source + last_off, source_length - last_off);
    }
    return count;
}


/**
 * Replaces every occurrence of the pattern within the string by text
 * produced by a callback.
 *
 * parameters:
 *    source   [IN]  NUL-terminated text to process.
 *    replacer [IN]  called once per match as replacer(matched_text, target);
 *                   whatever it appends to target stands in for the match.
 *    target   [OUT] receives the result; cleared first.
 * return value:
 *    number of matches handed to the callback.
 * throws:
 *    whatever search() throws.
 *
 * After an empty match the scan advances by one character so that it
 * always terminates.
 */
int RX::replace(const char* source, replace_function_t replacer, string& target)
{
    const int source_length = (int) strlen(source);
    int copied_upto = 0;   // source[0..copied_upto) is already in target
    int search_from = 0;   // where the next search begins
    int count = 0;

    target.clear();
    while (search_from <= source_length)
    {
        const int match_end = search(source, source_length, search_from, false);
        if (match_end == -1) break;

        const int match_start = m_regs->start[0];
        if (match_start > copied_upto)
        {
            target.append(source + copied_upto, match_start - copied_upto);
        }
        string m; submatch(source, 0, m);
        replacer(m, target);
        count++;
        copied_upto = match_end;

        // Searching again from the same index after an empty match would
        // find the same empty match forever.
        search_from = (match_end > match_start) ? match_end : match_end + 1;
    }
    if (copied_upto < source_length)
    {
        target.append(source + copied_upto, source_length - copied_upto);
    }
    return count;
}




/**
 * Tries to match the pattern at the very beginning of s.
 *
 * parameters:
 *    s [IN] NUL-terminated text to match.
 * return value:
 *    true when re_match() succeeds; in that case s is copied into the
 *    internal content string so that submatch(index, ...) can extract
 *    sub-expressions. false when re_match() reports a negative result
 *    (no match or internal error); callers should not call submatch()
 *    after a failed match.
 *
 * The compiled pattern is deliberately left untouched on failure:
 * re_nsub is the group count of the pattern and is unsigned, so storing
 * -1 in it would not invalidate sub-matches but make every index look
 * valid and break all later matches with this object.
 */
bool RX::match(const char* s)
{
    const int off = re_match(m_buffer, s, (int) strlen(s), 0, m_regs);
    if (off <= -1)
    {
        return false;
    }

    m_content = s; // make a copy of the entire string
    return true;
}

/**
 * Extracts a sub-match from the internally stored content string.
 * Forwards to the overload that takes an explicit origin string.
 */
bool RX::submatch(unsigned int index, string& sub, int offset) const
{
    const bool valid = submatch(m_content, index, sub, offset);
    return valid;
}

/**
 * Extracts the sub-string that has been matched by one of the grouping
 * operators '()'.
 *
 * parameters:
 *    origin [IN]  the text the last match/search was run against.
 *    index  [IN]  sub-expression index; indexes start with 1, zero means
 *                 the entire matched text.
 *    sub    [OUT] receives the extracted text.
 *    offset [IN]  added to the start position of the sub-match.
 * return value:
 *    true if index is valid. If index is out of range, or no match
 *    registers have been filled in yet, sub is emptied and false is
 *    returned. A valid group that did not participate in the match
 *    (registers hold -1) yields an empty sub and true.
 */
bool RX::submatch(const char* origin, unsigned int index, string& sub, int offset) const
{
    // The register arrays are allocated by the regex library on first use;
    // before that (or after shrink()) there is nothing to read.
    const bool have_regs = (m_regs->start != NULL && m_regs->end != NULL);

    if (!have_regs || index > m_buffer->re_nsub || index >= m_regs->num_regs)
    {
        sub.clear();
        return false;
    }

    if (m_regs->start[index] < 0 || m_regs->end[index] < 0)
    {
        // group exists in the pattern but took no part in this match
        sub.clear();
        return true;
    }

    sub.paste(origin, m_regs->start[index]+offset, m_regs->end[index]);
    return true;
}



/**
 * Matches s against the pattern and, on success, extracts sub-expression 1
 * into sub1. The output is left untouched when the match fails.
 */
bool RX::match(const char* s, string& sub1)
{
    const bool matched = match(s);
    if (matched)
    {
        submatch(1, sub1);
    }
    return matched;
}

/**
 * Matches s against the pattern and, on success, extracts sub-expressions
 * 1 and 2. The outputs are left untouched when the match fails.
 */
bool RX::match(const char* s, string& sub1, string& sub2)
{
    const bool matched = match(s);
    if (matched)
    {
        submatch(1, sub1);
        submatch(2, sub2);
    }
    return matched;
}

/**
 * Matches s against the pattern and, on success, extracts sub-expressions
 * 1 through 3. The outputs are left untouched when the match fails.
 */
bool RX::match(const char* s, string& sub1, string& sub2, string& sub3)
{
    const bool matched = match(s);
    if (matched)
    {
        submatch(1, sub1);
        submatch(2, sub2);
        submatch(3, sub3);
    }
    return matched;
}

/**
 * Matches s against the pattern and, on success, extracts sub-expressions
 * 1 through 4. The outputs are left untouched when the match fails.
 */
bool RX::match(const char* s, string& sub1, string& sub2, string& sub3, string& sub4)
{
    const bool matched = match(s);
    if (matched)
    {
        submatch(1, sub1);
        submatch(2, sub2);
        submatch(3, sub3);
        submatch(4, sub4);
    }
    return matched;
}

/**
 * Matches s against the pattern and, on success, extracts sub-expressions
 * 1 through 5. The outputs are left untouched when the match fails.
 */
bool RX::match(const char* s, string& sub1, string& sub2, string& sub3, string& sub4, string& sub5)
{
    const bool matched = match(s);
    if (matched)
    {
        submatch(1, sub1);
        submatch(2, sub2);
        submatch(3, sub3);
        submatch(4, sub4);
        submatch(5, sub5);
    }
    return matched;
}

/**
 * Matches s against the pattern and, on success, collects every
 * sub-expression into subs.
 *
 * parameters:
 *    s    [IN]  NUL-terminated text to match.
 *    subs [OUT] on success it is cleared and receives one newly allocated
 *               string per sub-expression, in order (element 0 holds
 *               sub-expression 1). Left untouched when the match fails.
 * return value:
 *    true when the pattern matched.
 *
 * A group that did not participate in the match has both registers set
 * to -1; it is stored as an empty string rather than being built from a
 * pointer located before the start of s.
 */
bool RX::match(const char* s, string_array& subs)
{
    if (!match(s)) return false;
    subs.clear();
    for(unsigned int i=1; i<=m_buffer->re_nsub; i++)
    {
        const int start = m_regs->start[i];
        const int end   = m_regs->end[i];
        if (start < 0 || end < start)
        {
            subs.add(new string());
        }
        else
        {
            subs.add(new string(s + start, end - start));
        }
    }
    return true;
}




void RX::shrink()
{
    if (m_buffer->buffer != NULL) free(m_buffer->buffer);
    if (m_buffer->fastmap != NULL) free(m_buffer->fastmap);

    if (m_regs->end != NULL) free(m_regs->end);
    if (m_regs->start != NULL) free(m_regs->start);

    m_content.clear();
    ctor();
}


/**
 * Destructor: drops the compiled state through shrink(), then frees the
 * pattern buffer and register structures themselves.
 */
RX::~RX()
{
    shrink();

    // free() accepts NULL, so no explicit checks are required
    free(m_pattern_buffer);
    free(m_registers);
}


/**
 * Replacement callback used by dehtmlify().
 *
 * parameters:
 *    submatch [IN]     the text of one match.
 *    target   [IN/OUT] output being built; the plain-text equivalent of
 *                      the match is appended to it.
 *
 * Text starting with '<' produces nothing. The entities &lt; &gt; &amp;
 * and &nbsp; (compared case-insensitively) become '<', '>', '&' and a
 * space. A numeric reference "&#NNN;" becomes the single character whose
 * code is the low 8 bits of NNN. Anything else produces nothing.
 */
static void dehtmlifier(const string& submatch, string& target)
{
    const char* text = submatch.cstr();

    if (text[0] == '<')
    {
        return;
    }
    if (submatch.ieq("&lt;"))
    {
        target.append("<");
        return;
    }
    if (submatch.ieq("&gt;"))
    {
        target.append(">");
        return;
    }
    if (submatch.ieq("&amp;"))
    {
        target.append("&");
        return;
    }
    if (submatch.ieq("&nbsp;"))
    {
        target.append(" ");
        return;
    }
    if (text[0] == '&' && text[1] == '#')
    {
        const int code = atoi(text + 2) & 0xff;
        target.append( (char)code );
    }
}

void dehtmlify(const char* source, string& target)
{
    RX tags("(<[^>]+>)|(&lt;)|(&gt;)|(&amp;)|(&nbsp;)|(&#[0-9]+;)");
    tags.replace(source, dehtmlifier, target);
}
💿 文件大小 482 K
👤 上传用户 jiangleip531
📂 所属分类编译器/解释器
🏷️ 相关标签

#Shorthand #脚本 #语言
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -