⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 recompiler.java

📁 java写的crawler
💻 JAVA
📖 第 1 页 / 共 4 页
字号:

        // Loop while we've got input

        atomLoop:

        while (idx < len)
        {
            // Is there a next char?
            if ((idx + 1) < len)
            {
                char c = pattern.charAt(idx + 1);

                // If the next 'char' is an escape, look past the whole escape
                if (pattern.charAt(idx) == '\\')
                {
                    int idxEscape = idx;
                    escape();
                    if (idx < len)
                    {
                        c = pattern.charAt(idx);
                    }
                    idx = idxEscape;
                }

                // Switch on next char
                switch (c)
                {
                    case '{':
                    case '?':
                    case '*':
                    case '+':

                        // If the next character is a closure operator and our atom is non-empty, the
                        // current character should bind to the closure operator rather than the atom
                        if (lenAtom != 0)
                        {
                            break atomLoop;
                        }
                }
            }

            // Switch on current char
            switch (pattern.charAt(idx))
            {
                case ']':
                case '^':
                case '$':
                case '.':
                case '[':
                case '(':
                case ')':
                case '|':
                    break atomLoop;

                case '{':
                case '?':
                case '*':
                case '+':

                    // We should have an atom by now
                    if (lenAtom == 0)
                    {
                        // No atom before closure
                        syntaxError("Missing operand to closure");
                    }
                    break atomLoop;

                case '\\':

                    {
                        // Get the escaped character (advances input automatically)
                        int idxBeforeEscape = idx;
                        char c = escape();

                        // Check if it's a simple escape (as opposed to, say, a backreference)
                        if ((c & ESC_MASK) == ESC_MASK)
                        {
                            // Not a simple escape, so backup to where we were before the escape.
                            idx = idxBeforeEscape;
                            break atomLoop;
                        }

                        // Add escaped char to atom
                        emit(c);
                        lenAtom++;
                    }
                    break;

                default:

                    // Add normal character to atom
                    emit(pattern.charAt(idx++));
                    lenAtom++;
                    break;
            }
        }

        // This "shouldn't" happen
        if (lenAtom == 0)
        {
            internalError();
        }

        // Emit the atom length into the program
        instruction[ret + RE.offsetOpdata] = (char)lenAtom;
        return ret;
    }

    /**
     * Match a terminal node.
     * @param flags Flags
     * @return Index of terminal node (closeable)
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int terminal(int[] flags) throws RESyntaxException
    {
        switch (pattern.charAt(idx))
        {
        case RE.OP_EOL:
        case RE.OP_BOL:
        case RE.OP_ANY:
            return node(pattern.charAt(idx++), 0);

        case '[':
            return characterClass();

        case '(':
            return expr(flags);

        case ')':
            syntaxError("Unexpected close paren");

        case '|':
            internalError();

        case ']':
            syntaxError("Mismatched class");

        case 0:
            syntaxError("Unexpected end of input");

        case '?':
        case '+':
        case '{':
        case '*':
            syntaxError("Missing operand to closure");

        case '\\':
            {
                // Don't forget, escape() advances the input stream!
                int idxBeforeEscape = idx;

                // Switch on escaped character
                switch (escape())
                {
                    case ESC_CLASS:
                    case ESC_COMPLEX:
                        flags[0] &= ~NODE_NULLABLE;
                        return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));

                    case ESC_BACKREF:
                        {
                            char backreference = (char)(pattern.charAt(idx - 1) - '0');
                            if (parens <= backreference)
                            {
                                syntaxError("Bad backreference");
                            }
                            flags[0] |= NODE_NULLABLE;
                            return node(RE.OP_BACKREF, backreference);
                        }

                    default:

                        // We had a simple escape and we want to have it end up in
                        // an atom, so we back up and fall though to the default handling
                        idx = idxBeforeEscape;
                        flags[0] &= ~NODE_NULLABLE;
                        break;
                }
            }
        }

        // Everything above either fails or returns.
        // If it wasn't one of the above, it must be the start of an atom.
        flags[0] &= ~NODE_NULLABLE;
        return atom();
    }

    /**
     * Compile a possibly closured terminal
     * @param flags Flags passed by reference
     * @return Index of closured node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int closure(int[] flags) throws RESyntaxException
    {
        // Before terminal
        int idxBeforeTerminal = idx;

        // Values to pass by reference to terminal()
        int[] terminalFlags = { NODE_NORMAL };

        // Get terminal symbol
        int ret = terminal(terminalFlags);

        // Or in flags from terminal symbol
        flags[0] |= terminalFlags[0];

        // Advance input, set NODE_NULLABLE flag and do sanity checks
        if (idx >= len)
        {
            return ret;
        }
        boolean greedy = true;
        char closureType = pattern.charAt(idx);
        switch (closureType)
        {
            case '?':
            case '*':

                // The current node can be null
                flags[0] |= NODE_NULLABLE;

            case '+':

                // Eat closure character
                idx++;

            case '{':

                // Don't allow blantant stupidity
                int opcode = instruction[ret + RE.offsetOpcode];
                if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
                {
                    syntaxError("Bad closure operand");
                }
                if ((terminalFlags[0] & NODE_NULLABLE) != 0)
                {
                    syntaxError("Closure operand can't be nullable");
                }
                break;
        }

        // If the next character is a '?', make the closure non-greedy (reluctant)
        if (idx < len && pattern.charAt(idx) == '?')
        {
            idx++;
            greedy = false;
        }

        if (greedy)
        {
            // Actually do the closure now
            switch (closureType)
            {
                case '{':
                {
                    // We look for our bracket in the list
                    boolean found = false;
                    int i;
                    allocBrackets();
                    for (i = 0; i < brackets; i++)
                    {
                        if (bracketStart[i] == idx)
                        {
                            found = true;
                            break;
                        }
                    }
                    
                    // If its not in the list we parse the {m,n}
                    if (!found)
                    {
                        if (brackets >= maxBrackets)
                        {
                            syntaxError("Too many bracketed closures (limit is 10)");
                        }
                        bracketStart[brackets] = idx;
                        bracket();
                        bracketEnd[brackets] = idx;
                        i = brackets++;
                    }
                    
                    // If there's a min, rewind stream and reparse
                    if (--bracketMin[i] > 0)
                    {
                        // Rewind stream and run it through again
                        idx = idxBeforeTerminal;
                        break;
                    }
                    
                    // Do the right thing for maximum ({m,})
                    if (bracketOpt[i] == bracketFinished)
                    {
                        // Drop through now and closure expression.
                        // We are done with the {m,} expr, so skip rest
                        closureType = '*';
                        bracketOpt[i] = 0;
                        idx = bracketEnd[i];
                    }
                    else
                        if (bracketOpt[i] == bracketUnbounded)
                        {
                            idx = idxBeforeTerminal;
                            bracketOpt[i] = bracketFinished;
                            break;
                        }
                        else
                            if (bracketOpt[i]-- > 0)
                            {
                                // Drop through to optionally close and then 'play it again sam!'
                                idx = idxBeforeTerminal;
                                closureType = '?';
                            }
                            else
                            {
                                // We are done. skip the rest of {m,n} expr
                                idx = bracketEnd[i];
                                break;
                            }
                }
                
                // Fall through!
                
                case '?':
                case '*':
                    
                    if (!greedy)
                    {
                        break;
                    }
                    
                    if (closureType == '?')
                    {
                        // X? is compiled as (X|)
                        nodeInsert(RE.OP_BRANCH, 0, ret);                 // branch before X
                        setNextOfEnd(ret, node (RE.OP_BRANCH, 0));        // inserted branch to option
                        int nothing = node (RE.OP_NOTHING, 0);            // which is OP_NOTHING
                        setNextOfEnd(ret, nothing);                       // point (second) branch to OP_NOTHING
                        setNextOfEnd(ret + RE.nodeSize, nothing);         // point the end of X to OP_NOTHING node
                    }
                    
                    if (closureType == '*')
                    {
                        // X* is compiled as (X{gotoX}|)
                        nodeInsert(RE.OP_BRANCH, 0, ret);                         // branch before X
                        setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0));   // end of X points to an option
                        setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0));     // to goto
                        setNextOfEnd(ret + RE.nodeSize, ret);                     // the start again
                        setNextOfEnd(ret, node(RE.OP_BRANCH, 0));                 // the other option is
                        setNextOfEnd(ret, node(RE.OP_NOTHING, 0));                // OP_NOTHING
                    }
                    break;
                    
                case '+':
                {
                    // X+ is compiled as X({gotoX}|)
                    int branch;
                    branch = node(RE.OP_BRANCH, 0);                   // a new branch
                    setNextOfEnd(ret, branch);                        // is added to the end of X
                    setNextOfEnd(node(RE.OP_GOTO, 0), ret);           // one option is to go back to the start
                    setNextOfEnd(branch, node(RE.OP_BRANCH, 0));      // the other option
                    setNextOfEnd(ret, node(RE.OP_NOTHING, 0));        // is OP_NOTHING
                }
                break;
            }
        }
        else
        {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -