📄 recompiler.java

📁 java写的crawler
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
        }

        // Must have close brace
        if (idx >= len || pattern.charAt(idx++) != '}')
        {
            syntaxError("Missing close brace");
        }
    }

    /**
     * Match an escape sequence.  Handles quoted chars and octal escapes as well
     * as normal escape characters.  Always advances the input stream by the
     * right amount. This code "understands" the subtle difference between an
     * octal escape and a backref.  You can access the type of ESC_CLASS or
     * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
     * @return ESC_* code or character if simple escape
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    char escape() throws RESyntaxException
    {
        // "Shouldn't" happen
        if (pattern.charAt(idx) != '\\')
        {
            internalError();
        }

        // Escape shouldn't occur as last character in string!
        if (idx + 1 == len)
        {
            syntaxError("Escape terminates string");
        }

        // Switch on character after backslash
        idx += 2;
        char escapeChar = pattern.charAt(idx - 1);
        switch (escapeChar)
        {
            case RE.E_BOUND:
            case RE.E_NBOUND:
                return ESC_COMPLEX;

            case RE.E_ALNUM:
            case RE.E_NALNUM:
            case RE.E_SPACE:
            case RE.E_NSPACE:
            case RE.E_DIGIT:
            case RE.E_NDIGIT:
                return ESC_CLASS;

            case 'u':
            case 'x':
                {
                    // Exact required hex digits for escape type
                    int hexDigits = (escapeChar == 'u' ? 4 : 2);

                    // Parse up to hexDigits characters from input
                    int val = 0;
                    for ( ; idx < len && hexDigits-- > 0; idx++)
                    {
                        // Get char
                        char c = pattern.charAt(idx);

                        // If it's a hexadecimal digit (0-9)
                        if (c >= '0' && c <= '9')
                        {
                            // Compute new value
                            val = (val << 4) + c - '0';
                        }
                        else
                        {
                            // If it's a hexadecimal letter (a-f)
                            c = Character.toLowerCase(c);
                            if (c >= 'a' && c <= 'f')
                            {
                                // Compute new value 
                                val = (val << 4) + (c - 'a') + 10;
                            }
                            else
                            {
                                // If it's not a valid digit or hex letter, the escape must be invalid
                                // because hexDigits of input have not been absorbed yet.
                                syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
                            }
                        }
                    }
                    return (char)val;
                }

            case 't':
                return '\t';

            case 'n':
                return '\n';

            case 'r':
                return '\r';

            case 'f':
                return '\f';

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':

                // An octal escape starts with a 0 or has two digits in a row
                if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
                {
                    // Handle \nnn octal escapes
                    int val = escapeChar - '0';
                    if (idx < len && Character.isDigit(pattern.charAt(idx)))
                    {
                        val = ((val << 3) + (pattern.charAt(idx++) - '0'));
                        if (idx < len && Character.isDigit(pattern.charAt(idx)))
                        {
                            val = ((val << 3) + (pattern.charAt(idx++) - '0'));
                        }
                    }
                    return (char)val;
                }

                // It's actually a backreference (\[1-9]), not an escape
                return ESC_BACKREF;

            default:

                // Simple quoting of a character
                return escapeChar;
        }
    }

    /**
     * Compile a character class
     * @return Index of class node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int characterClass() throws RESyntaxException
    {
        // Check for bad calling or empty class
        if (pattern.charAt(idx) != '[')
        {
            internalError();
        }

        // Check for unterminated or empty class
        if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
        {
            syntaxError("Empty or unterminated class");
        }

        // Check for POSIX character class
        if (idx < len && pattern.charAt(idx) == ':')
        {
            // Skip colon
            idx++;

            // POSIX character classes are denoted with lowercase ASCII strings
            int idxStart = idx;
            while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
            {
                idx++;
            }
            
            // Should be a ":]" to terminate the POSIX character class
            if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
            {
                // Get character class
                String charClass = pattern.substring(idxStart, idx);

                // Select the POSIX class id
                Character i = (Character)hashPOSIX.get(charClass);
                if (i != null)
                {
                    // Move past colon and right bracket
                    idx += 2;

                    // Return new POSIX character class node
                    return node(RE.OP_POSIXCLASS, i.charValue());
                }
                syntaxError("Invalid POSIX character class '" + charClass + "'");
            }
            syntaxError("Invalid POSIX character class syntax");
        }

        // Try to build a class.  Create OP_ANYOF node
        int ret = node(RE.OP_ANYOF, 0);

        // Parse class declaration
        char CHAR_INVALID = Character.MAX_VALUE;
        char last = CHAR_INVALID;
        char simpleChar = 0;
        boolean include = true;
        boolean definingRange = false;
        int idxFirst = idx;
        char rangeStart = Character.MIN_VALUE;
        char rangeEnd;
        RERange range = new RERange();
        while (idx < len && pattern.charAt(idx) != ']')
        {

            switchOnCharacter:

            // Switch on character
            switch (pattern.charAt(idx))
            {
                case '^':
                    include = !include;
                    if (idx == idxFirst)
                    {
                        range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
                    }
                    idx++;
                    continue;

                case '\\':
                {
                    // Escape always advances the stream
                    char c;
                    switch (c = escape ())
                    {
                        case ESC_COMPLEX:
                        case ESC_BACKREF:

                            // Word boundaries and backrefs not allowed in a character class!
                            syntaxError("Bad character class");

                        case ESC_CLASS:

                            // Classes can't be an endpoint of a range
                            if (definingRange)
                            {
                                syntaxError("Bad character class");
                            }

                            // Handle specific type of class (some are ok)
                            switch (pattern.charAt(idx - 1))
                            {
                                case RE.E_NSPACE:
                                case RE.E_NDIGIT:
                                case RE.E_NALNUM:
                                    syntaxError("Bad character class");

                                case RE.E_SPACE:
                                    range.include('\t', include);
                                    range.include('\r', include);
                                    range.include('\f', include);
                                    range.include('\n', include);
                                    range.include('\b', include);
                                    range.include(' ', include);
                                    break;

                                case RE.E_ALNUM:
                                    range.include('a', 'z', include);
                                    range.include('A', 'Z', include);
                                    range.include('_', include);

                                    // Fall through!

                                case RE.E_DIGIT:
                                    range.include('0', '9', include);
                                    break;
                            }

                            // Make last char invalid (can't be a range start)
                            last = CHAR_INVALID;
                            break;

                        default:

                            // Escape is simple so treat as a simple char
                            simpleChar = c;
                            break switchOnCharacter;
                    }
                }
                continue;

                case '-':

                    // Start a range if one isn't already started
                    if (definingRange)
                    {
                        syntaxError("Bad class range");
                    }
                    definingRange = true;

                    // If no last character, start of range is 0
                    rangeStart = (last == CHAR_INVALID ? 0 : last);

                    // Premature end of range. define up to Character.MAX_VALUE
                    if ((idx + 1) < len && pattern.charAt(++idx) == ']')
                    {
                        simpleChar = Character.MAX_VALUE;
                        break;
                    }
                    continue;

                default:
                    simpleChar = pattern.charAt(idx++);
                    break;
            }

            // Handle simple character simpleChar
            if (definingRange)
            {
                // if we are defining a range make it now
                rangeEnd = simpleChar;

                // Actually create a range if the range is ok
                if (rangeStart >= rangeEnd)
                {
                    syntaxError("Bad character class");
                }
                range.include(rangeStart, rangeEnd, include);

                // We are done defining the range
                last = CHAR_INVALID;
                definingRange = false;
            }
            else
            {
                // If simple character and not start of range, include it
                if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
                {
                    range.include(simpleChar, include);
                }
                last = simpleChar;
            }
        }

        // Shouldn't be out of input
        if (idx == len)
        {
            syntaxError("Unterminated character class");
        }

        // Absorb the ']' end of class marker
        idx++;

        // Emit character class definition
        instruction[ret + RE.offsetOpdata] = (char)range.num;
        for (int i = 0; i < range.num; i++)
        {
            emit((char)range.minRange[i]);
            emit((char)range.maxRange[i]);
        }
        return ret;
    }

    /**
     * Absorb an atomic character string.  This method is a little tricky because
     * it can un-include the last character of string if a closure operator follows.
     * This is correct because *+? have higher precedence than concatentation (thus
     * ABC* means AB(C*) and NOT (ABC)*).
     * @return Index of new atom node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int atom() throws RESyntaxException
    {
        // Create a string node
        int ret = node(RE.OP_ATOM, 0);

        // Length of atom
        int lenAtom = 0;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -