regexp.c

来自「开放源码的编译器open watcom 1.6.0版的源代码」· C语言 代码 · 共 1,192 行 · 第 1/3 页

C
1,192
字号
 */
static void regoptail (char* p, const char* val) {
    // "Operandless" and "op != BRANCH" are synonymous in practice.
    if (p == NULL || p == &regdummy || OP(p) != BRANCH)
        return;
    regtail(OPERAND(p), val);
}



////////////////////////////////////////////////////////////////////////
// 
//  find and friends
// 
////////////////////////////////////////////////////////////////////////


/*
 * Global work variables for find().
 */
static const char*  reginput;   // String-input pointer.
static const char*  regbol;     // Beginning of input, for ^ check.
static const char* *regstartp;  // Pointer to startp array.
static const char* *regendp;    // Ditto for endp.

/*
 * Forwards.
 */
STATIC int regtry (const char*, const char*[NSUBEXP],
                   const char*[NSUBEXP], const char*);
STATIC int regmatch (const char*);
STATIC int regrepeat (const char*);

#ifdef DEBUG
int          regnarrate = 0;
void         regdump ();
STATIC char* regprop ();
#endif


/*
 - find - match a regexpr against a string
 */
Boolean CoolRegexp::find (const char* string) {
    register const char* s;
        
    this->searchstring = string;

     // Check validity of program.
    if (UCHARAT(this->program) != MAGIC) {
        //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
        printf ("CoolRegexp::find(): Compiled regular expression corrupted.\n");
        abort ();
    }
    
    // If there is a "must appear" string, look for it.
    if (this->regmust != NULL) {
        s = string;
        while ((s = strchr(s, this->regmust[0])) != NULL) {
            if (strncmp(s, this->regmust, this->regmlen) == 0)
                break;          // Found it.
            s++;
        }
        if (s == NULL)          // Not present.
            return (0);
    }
     
    // Mark beginning of line for ^ .
    regbol = string;

    // Simplest case:  anchored match need be tried only once.
    if (this->reganch)
        return (regtry(string, this->startp, this->endp, this->program));
    
    // Messy cases:  unanchored match.
    s = string;
    if (this->regstart != '\0')
        // We know what char it must start with.
        while ((s = strchr(s, this->regstart)) != NULL) {
            if (regtry(s, this->startp, this->endp, this->program))
                return (1);
            s++;
          
        }
    else
        // We don't -- general case.
        do {
            if (regtry(s, this->startp, this->endp, this->program))
                return (1);
        } while (*s++ != '\0');
    
    // Failure.
    return (0);
}


/*
 - regtry - try match at specific point
   0 failure, 1 success
 */
static int regtry (const char* string, const char* *start,
                   const char* *end, const char* prog) {
    register       int    i;
    register const char* *sp1;
    register const char* *ep;

    reginput = string;
    regstartp = start;
    regendp = end;

    sp1 = start;
    ep = end;
    for (i = NSUBEXP; i > 0; i--) {
        *sp1++ = NULL;
        *ep++ = NULL;
    }
    if (regmatch(prog + 1)) {
        start[0] = string;
        end[0] = reginput;
        return (1);
    }
    else
        return (0);
}


/*
 - regmatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 * 0 failure, 1 success
 */
static int regmatch (const char* prog) {
    register const char* scan;  // Current node.
             const char* next;  // Next node.

    scan = prog;

    while (scan != NULL) {

        next = regnext(scan);

        switch (OP(scan)) {
            case BOL:
                if (reginput != regbol)
                    return (0);
                break;
            case EOL:
                if (*reginput != '\0')
                    return (0);
                break;
            case ANY:
                if (*reginput == '\0')
                    return (0);
                reginput++;
                break;
            case EXACTLY:{
                    register int         len;
                    register const char* opnd;

                    opnd = OPERAND(scan);
                    // Inline the first character, for speed.
                    if (*opnd != *reginput)
                        return (0);
                    len = strlen(opnd);
                    if (len > 1 && strncmp(opnd, reginput, len) != 0)
                        return (0);
                    reginput += len;
                }
                break;
            case ANYOF:
                if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
                    return (0);
                reginput++;
                break;
            case ANYBUT:
                if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
                    return (0);
                reginput++;
                break;
            case NOTHING:
                break;
            case BACK:
                break;
            case OPEN + 1:
            case OPEN + 2:
            case OPEN + 3:
            case OPEN + 4:
            case OPEN + 5:
            case OPEN + 6:
            case OPEN + 7:
            case OPEN + 8:
            case OPEN + 9:{
                    register       int    no;
                    register const char* save;

                    no = OP(scan) - OPEN;
                    save = reginput;

                    if (regmatch(next)) {

                        //
                        // Don't set startp if some later invocation of the
                        // same parentheses already has. 
                        //
                        if (regstartp[no] == NULL)
                            regstartp[no] = save;
                        return (1);
                    }
                    else
                        return (0);
                }
                break;
            case CLOSE + 1:
            case CLOSE + 2:
            case CLOSE + 3:
            case CLOSE + 4:
            case CLOSE + 5:
            case CLOSE + 6:
            case CLOSE + 7:
            case CLOSE + 8:
            case CLOSE + 9:{
                    register       int    no;
                    register const char* save;

                    no = OP(scan) - CLOSE;
                    save = reginput;

                    if (regmatch(next)) {

                        //
                        // Don't set endp if some later invocation of the
                        // same parentheses already has. 
                        //
                        if (regendp[no] == NULL)
                            regendp[no] = save;
                        return (1);
                    }
                    else
                        return (0);
                }
                break;
            case BRANCH:{
              
              register const char* save;

                    if (OP(next) != BRANCH)     // No choice.
                        next = OPERAND(scan);   // Avoid recursion.
                    else {
                        do {
                            save = reginput;
                            if (regmatch(OPERAND(scan)))
                                return (1);
                            reginput = save;
                            scan = regnext(scan);
                        } while (scan != NULL && OP(scan) == BRANCH);
                        return (0);
                        // NOTREACHED
                    }
                }
                break;
            case STAR:
            case PLUS:{
              register char   nextch;
                    register int        no;
                    register const char* save;
                    register int        min_no;

                    //
                    // Lookahead to avoid useless match attempts when we know
                    // what character comes next. 
                    //
                    nextch = '\0';
                    if (OP(next) == EXACTLY)
                        nextch = *OPERAND(next);
                    min_no = (OP(scan) == STAR) ? 0 : 1;
                    save = reginput;
                    no = regrepeat(OPERAND(scan));
                    while (no >= min_no) {
                        // If it could work, try it.
                        if (nextch == '\0' || *reginput == nextch)
                            if (regmatch(next))
                                return (1);
                        // Couldn't or didn't -- back up.
                        no--;
                        reginput = save + no;
                    }
                    return (0);
                }
                break;
            case END:
                return (1);     // Success!

            default:
                //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
                printf ("CoolRegexp::find(): Internal error -- memory corrupted.\n");
                abort ();
        }
        scan = next;
    }

    // 
    //  We get here only if there's trouble -- normally "case END" is the
    //  terminating point. 
    // 
    //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
    printf ("CoolRegexp::find(): Internal error -- corrupted pointers.\n");
    abort ();
    return (0);
}


/*
 - regrepeat - repeatedly match something simple, report how many
 */
static int regrepeat (const char* p) {
    register       int   count = 0;
    register const char* scan;
    register const char* opnd;

    scan = reginput;
    opnd = OPERAND(p);
    switch (OP(p)) {
        case ANY:
            count = strlen(scan);
            scan += count;
            break;
        case EXACTLY:
            while (*opnd == *scan) {
                count++;
                scan++;
            }
            break;
        case ANYOF:
            while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
                count++;
                scan++;
            }
            break;
        case ANYBUT:
            while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
                count++;
                scan++;
            }
            break;
        default:                // Oh dear.  Called inappropriately.
            //RAISE (Error, SYM(CoolRegexp), SYM(Internal_Error),
            printf ("CoolRegexp::find(): Internal error.\n");
            abort ();
    }
    reginput = scan;
    return (count);
}


/*
 - regnext - dig the "next" pointer out of a node
 */
static const char* regnext (register const char* p) {
    register int offset;

    if (p == &regdummy)
        return (NULL);

    offset = NEXT(p);
    if (offset == 0)
        return (NULL);

    if (OP(p) == BACK)
        return (p - offset);
    else
        return (p + offset);
}


static char* regnext (register char* p) {
    register int offset;

    if (p == &regdummy)
        return (NULL);

    offset = NEXT(p);
    if (offset == 0)
        return (NULL);

    if (OP(p) == BACK)
        return (p - offset);
    else
        return (p + offset);
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?