characterclassconstructor.cpp

来自「linux下开源浏览器WebKit的源码,市面上的很多商用浏览器都是移植自WebKit」· C++ 代码 · 共 258 行
CPP
258 行
/*
 * Copyright (C) 2008, 2009 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "CharacterClassConstructor.h"

#if ENABLE(WREC)

#include "pcre_internal.h"
#include <wtf/ASCIICType.h>

using namespace WTF;

namespace JSC { namespace WREC {

// Inserts ch into the sorted vector 'matches', keeping it sorted.
// If ch is already present the vector is left untouched, so it never holds duplicates.
void CharacterClassConstructor::addSorted(Vector<UChar>& matches, UChar ch)
{
    unsigned pos = 0;
    unsigned range = matches.size();

    // Binary chop: [pos, pos + range) is the window that may still contain ch;
    // when the window is empty, pos is the insertion point.
    while (range) {
        unsigned index = range >> 1;

        // Both operands are promoted to int, so the difference is signed.
        int val = matches[pos + index] - ch;
        if (!val)
            return;

        if (val > 0)
            range = index;
        else {
            pos += (index + 1);
            range -= (index + 1);
        }
    }

    if (pos == matches.size())
        matches.append(ch);
    else
        matches.insert(pos, ch);
}

// Adds the inclusive range [lo, hi] to the sorted vector 'ranges'.
// The new range is merged with any existing range that it overlaps or directly
// adjoins; otherwise it is inserted at its sorted position.
void CharacterClassConstructor::addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
{
    unsigned end = ranges.size();

    // Simple linear scan - I doubt there are that many ranges anyway...
    // feel free to fix this with something faster (eg binary chop).
    for (unsigned i = 0; i < end; ++i) {
        // Does the new range fall before the current position in the array?
        if (hi < ranges[i].begin) {
            // Optional optimization: concatenate adjoining ranges? - may not be worthwhile.
            if (hi == (ranges[i].begin - 1)) {
                ranges[i].begin = lo;
                return;
            }
            CharacterRange r = {lo, hi};
            ranges.insert(i, r);
            return;
        }

        // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the
        // beginning of this one. If the new range starts at or before the end of this range, then they
        // overlap (if it starts one after the end of this range they concatenate, which is just as good).
        // The '+ 1' is evaluated as int, so it cannot wrap.
        if (lo <= (ranges[i].end + 1)) {
            // Found an intersect! We'll replace this entry in the array.
            ranges[i].begin = std::min(ranges[i].begin, lo);
            ranges[i].end = std::max(ranges[i].end, hi);

            // Now check if the new range can subsume any subsequent ranges.
            unsigned next = i + 1;
            // Each iteration of the loop we will either remove something from the list, or break the loop.
            while (next < ranges.size()) {
                if (ranges[next].begin > (ranges[i].end + 1))
                    break;

                // The next entry now overlaps / concatenates this one.
                ranges[i].end = std::max(ranges[i].end, ranges[next].end);
                ranges.remove(next);
            }

            return;
        }
    }

    // CharacterRange comes after all existing ranges.
    CharacterRange r = {lo, hi};
    ranges.append(r);
}

// Feeds one pattern character of a character class into the constructor.
// Characters are buffered rather than added immediately, because a following
// '-' may turn them into the start of a range (see the state description below).
void CharacterClassConstructor::put(UChar ch)
{
    // Parsing a regular expression like [a-z], we start in an initial empty state:
    //     ((m_charBuffer == -1) && !m_isPendingDash)
    // We buffer the 'a' since it may be (and is in this case) part of a range:
    //     ((m_charBuffer != -1) && !m_isPendingDash)
    // Having parsed the hyphen we then record that the dash is also pending:
    //     ((m_charBuffer != -1) && m_isPendingDash)
    // The next change will always take us back to the initial state - either because
    // a complete range has been parsed (such as [a-z]), or because a flush is forced,
    // due to an early end in the regexp ([a-]), or a character class escape being added
    // ([a-\s]).  The fourth permutation of m_charBuffer and m_isPendingDash is not permitted.
    ASSERT(!((m_charBuffer == -1) && m_isPendingDash));

    if (m_charBuffer != -1) {
        if (m_isPendingDash) {
            // EXAMPLE: parsing [-a-c], the 'c' reaches this case - we have buffered a previous character and seen a hyphen, so this is a range.
            UChar lo = m_charBuffer;
            UChar hi = ch;

            // Reset back to the initial state.
            m_charBuffer = -1;
            m_isPendingDash = false;

            // This is an error, detected lazily.  Do not proceed.
            if (lo > hi) {
                m_isUpsideDown = true;
                return;
            }

            // ASCII portion of the range (clamped to 0x7f) goes into m_ranges.
            if (lo <= 0x7f) {
                char asciiLo = lo;
                char asciiHi = std::min(hi, static_cast<UChar>(0x7f));
                addSortedRange(m_ranges, lo, asciiHi);

                if (m_isCaseInsensitive) {
                    // Mirror the part of the range that intersects [A-Z] into lower case, and vice versa.
                    if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
                        addSortedRange(m_ranges, std::max(asciiLo, 'A') + ('a' - 'A'), std::min(asciiHi, 'Z') + ('a' - 'A'));
                    if ((asciiLo <= 'z') && (asciiHi >= 'a'))
                        addSortedRange(m_ranges, std::max(asciiLo, 'a') + ('A' - 'a'), std::min(asciiHi, 'z') + ('A' - 'a'));
                }
            }

            // Non-ASCII portion of the range (from 0x80 up) goes into m_rangesUnicode.
            if (hi >= 0x80) {
                UChar unicodeCurr = std::max(lo, static_cast<UChar>(0x80));
                addSortedRange(m_rangesUnicode, unicodeCurr, hi);

                if (m_isCaseInsensitive) {
                    // We're going to scan along, updating the start of the range.
                    while (unicodeCurr <= hi) {
                        // Spin forwards over any characters that don't have two cases.
                        for (; jsc_pcre_ucp_othercase(unicodeCurr) == -1; ++unicodeCurr) {
                            // If this was the last character in the range, we're done.
                            if (unicodeCurr == hi)
                                return;
                        }
                        // If we fall through to here, unicodeCurr <= hi & has another case. Get the other case.
                        UChar rangeStart = unicodeCurr;
                        UChar otherCurr = jsc_pcre_ucp_othercase(unicodeCurr);

                        // If unicodeCurr is not yet hi, check the next char in the range.  If it also has another case,
                        // and if its other case value is one greater than the othercase value for the current last
                        // character included in the range, we can include next into the range.
                        while ((unicodeCurr < hi) && (jsc_pcre_ucp_othercase(unicodeCurr + 1) == (otherCurr + 1))) {
                            // Increment unicodeCurr; it points to the end of the range.
                            // Increment otherCurr; due to the check above, other for next must be 1 greater than the current other value.
                            ++unicodeCurr;
                            ++otherCurr;
                        }

                        // otherCurr is the last in the range of other case chars, calculate offset to get back to the start.
                        addSortedRange(m_rangesUnicode, otherCurr - (unicodeCurr - rangeStart), otherCurr);

                        // unicodeCurr has been added. Stop explicitly at hi rather than relying on the
                        // loop condition: incrementing a UChar equal to the maximum value would wrap
                        // to 0 and 'unicodeCurr <= hi' would stay true.
                        if (unicodeCurr == hi)
                            break;

                        // Move on to the next char.
                        ++unicodeCurr;
                    }
                }
            }
        } else if (ch == '-') {
            // EXAMPLE: parsing [-a-c], the second '-' reaches this case - the hyphen is treated as potentially indicating a range.
            m_isPendingDash = true;
        } else {
            // EXAMPLE: Parsing [-a-c], the 'a' reaches this case - we replace the previously buffered char with the 'a'.
            flush();
            m_charBuffer = ch;
        }
    } else {
        // EXAMPLE: Parsing [-a-c], the first hyphen reaches this case - there is no buffered character
        // (the hyphen not treated as a special character in this case, same handling for any char).
        m_charBuffer = ch;
    }
}

// When a character is added to the set we do not immediately add it to the arrays, in case it is actually defining a range.
// When we have determined the character is not used in specifying a range it is added, in a sorted fashion, to the appropriate
// array (either ascii or unicode).
// If the pattern is case insensitive we add entries for both cases.
// A pending dash, if any, is added as a literal '-' and the constructor returns to its initial state.
void CharacterClassConstructor::flush()
{
    if (m_charBuffer != -1) {
        if (m_charBuffer <= 0x7f) {
            if (m_isCaseInsensitive && isASCIILower(m_charBuffer))
                addSorted(m_matches, toASCIIUpper(m_charBuffer));
            addSorted(m_matches, m_charBuffer);
            if (m_isCaseInsensitive && isASCIIUpper(m_charBuffer))
                addSorted(m_matches, toASCIILower(m_charBuffer));
        } else {
            addSorted(m_matchesUnicode, m_charBuffer);
            if (m_isCaseInsensitive) {
                // -1 means the character has no other case.
                int other = jsc_pcre_ucp_othercase(m_charBuffer);
                if (other != -1)
                    addSorted(m_matchesUnicode, other);
            }
        }
        m_charBuffer = -1;
    }

    if (m_isPendingDash) {
        addSorted(m_matches, '-');
        m_isPendingDash = false;
    }
}

// Merges every match and range of the predefined class 'other' into this constructor,
// after flushing any buffered character / pending dash.
void CharacterClassConstructor::append(const CharacterClass& other)
{
    // [x-\s] will add, 'x', '-', and all unicode spaces to new class (same as [x\s-]).
    // Need to check the spec, really, but think this matches PCRE behaviour.
    flush();

    // Each loop is a no-op when the corresponding count is zero, so no separate guard is needed.
    for (size_t i = 0; i < other.numMatches; ++i)
        addSorted(m_matches, other.matches[i]);

    for (size_t i = 0; i < other.numRanges; ++i)
        addSortedRange(m_ranges, other.ranges[i].begin, other.ranges[i].end);

    for (size_t i = 0; i < other.numMatchesUnicode; ++i)
        addSorted(m_matchesUnicode, other.matchesUnicode[i]);

    for (size_t i = 0; i < other.numRangesUnicode; ++i)
        addSortedRange(m_rangesUnicode, other.rangesUnicode[i].begin, other.rangesUnicode[i].end);
}

} } // namespace JSC::WREC

#endif // ENABLE(WREC)
characterclassconstructor.cpp - 源码说明

本页面展示了「linux下开源浏览器WebKit的源码,市面上的很多商用浏览器都是移植自WebKit」中的 characterclassconstructor.cpp 源码文件，采用 C++ 编程语言编写，共 258 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WebKit相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?