sentencebreakdata.java

来自「《移动Agent技术》一书的所有章节源代码。」· Java 代码 · 共 330 行 · 第 1/2 页

JAVA
330
字号
        (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+2),
        (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+3),  STOP,
        (byte)(SI+2),  (byte)(SI+3),  (byte)(SI+2),  (byte)(SI+2),
        (byte)(SI+2),  (byte)(SI+1),  STOP,

        // 2
        (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+2),
        (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+3),  STOP,
        (byte)(SI+2),  (byte)(SI+3),  (byte)(SI+2),  (byte)(SI+2),
        STOP,          (byte)(SI+2),  STOP,

        // 3
        (byte)(SI+2),  (byte)(SI+4),  (byte)(SI+2),  (byte)(SI+2),
        (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+3),  STOP,
        (byte)(SI+3),  (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+2),
        STOP,          (byte)(SI+3),  STOP,

        // 4
        (byte)(SI+2),  (byte)(SI+4),  SI_STOP,       SI_STOP,
        (byte)(SI+2),  (byte)(SI+2),  (byte)(SI+3),  STOP,
        (byte)(SI+2),  (byte)(SI+3),  (byte)(SI+2),  (byte)(SI+2),
        STOP,          (byte)(SI+4),  STOP
    };

    /**
     * Break table built from {@code kSentenceBackwardData}, with
     * {@code COL_COUNT} passed as the first constructor argument.
     */
    private static final WordBreakTable kSentenceBackward =
        new WordBreakTable(COL_COUNT, kSentenceBackwardData);

    /**
     * Maps each general-category value (0-28) to one of this class's
     * sentence-break character classes. Entry order is significant: the
     * position of an entry is the category value named in its comment, and
     * those names/values follow the constants of {@code java.lang.Character}.
     * NOTE(review): presumably indexed by the result of
     * {@code Character.getType} -- confirm against UnicodeClassMapping.
     */
    private static final int kRawMapping[] =
    {
        other,        // UNASSIGNED             = 0
        upperCase,    // UPPERCASE_LETTER       = 1
        lowerCase,    // LOWERCASE_LETTER       = 2
        other,        // TITLECASE_LETTER       = 3
        other,        // MODIFIER_LETTER        = 4
        other,        // OTHER_LETTER           = 5
        nsm,          // NON_SPACING_MARK       = 6
        nsm,          // ENCLOSING_MARK         = 7
        other,        // COMBINING_SPACING_MARK = 8
        number,       // DECIMAL_DIGIT_NUMBER   = 9
        number,       // LETTER_NUMBER          = 10
        number,       // OTHER_NUMBER           = 11
        space,        // SPACE_SEPARATOR        = 12
        space,        // LINE_SEPARATOR         = 13
        // NOTE(review): the original author flagged the next entry as
        // uncertain; kExceptionChar maps PUNCTUATION_PARAGRAPH_SEPARATOR to
        // paragraphBreak rather than space.
        space,        // PARAGRAPH_SEPARATOR    = 14
        other,        // CONTROL                = 15
        other,        // FORMAT                 = 16
        other,        // (unused value)         = 17
        other,        // PRIVATE_USE            = 18
        other,        // SURROGATE              = 19
        other,        // DASH_PUNCTUATION       = 20
        openBracket,  // START_PUNCTUATION      = 21
        closeBracket, // END_PUNCTUATION        = 22
        other,        // CONNECTOR_PUNCTUATION  = 23
        other,        // OTHER_PUNCTUATION      = 24
        other,        // MATH_SYMBOL            = 25
        other,        // CURRENCY_SYMBOL        = 26
        other,        // MODIFIER_SYMBOL        = 27
        other,        // OTHER_SYMBOL           = 28
    };

    /**
     * Per-character and per-range overrides that assign specific characters
     * to a sentence-break class.
     *
     * Declared {@code final} like the other tables in this class so the
     * reference cannot be reassigned after class initialization.
     *
     * NOTE(review): the two-argument constructor presumably maps a single
     * character and the three-argument form an inclusive range -- confirm
     * against SpecialMapping.
     */
    private static final SpecialMapping kExceptionChar[] =
    {
        //note: the ranges in this table must be sorted in ascending order
        //as required by the UnicodeClassMapping class.

        // ASCII control characters
        new SpecialMapping(ASCII_HORIZONTAL_TABULATION, space),
        new SpecialMapping(ASCII_LINEFEED, space),
        new SpecialMapping(ASCII_FORM_FEED, terminator),
        new SpecialMapping(ASCII_CARRIAGE_RETURN, space),

        // ASCII punctuation
        new SpecialMapping(ASCII_EXCLAMATION_MARK, terminator),
        new SpecialMapping(ASCII_QUOTATION_MARK, quote),

        new SpecialMapping(ASCII_APOSTROPHE, quote),

        new SpecialMapping(ASCII_FULL_STOP, ambiguosTerm),
        new SpecialMapping(ASCII_QUESTION_MARK, terminator),
        new SpecialMapping(ASCII_NONBREAKING_SPACE, other),

        // Separators and ideographic full stop
        new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, space),
        new SpecialMapping(PUNCTUATION_PARAGRAPH_SEPARATOR, paragraphBreak),
        new SpecialMapping(PUNCTUATION_IDEOGRAPHIC_FULL_STOP, terminator),

        // Ranges assigned to the cjk class
        new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, cjk),
        new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
                           HIRAGANA_SEMIVOICED_SOUND_MARK, cjk),
        new SpecialMapping(KATAKANA_LETTER_SMALL_A, KATAKANA_LETTER_SMALL_KE,
                           cjk),
        new SpecialMapping(UNICODE_LOW_BOUND_HAN, UNICODE_HIGH_BOUND_HAN, cjk),
        new SpecialMapping(CJK_COMPATIBILITY_F900, CJK_COMPATIBILITY_FA2D,cjk),

        new SpecialMapping(UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, other),
        new SpecialMapping(END_OF_STRING, EOS)
    };

    private static final boolean SentenceExceptionFlags[] = {
        false,            // kNonCharacter         = 0,
        false,            // kUppercaseLetter      = 1,
        false,            // kLowercaseLetter      = 2,
        false,            // kTitlecaseLetter      = 3,
        false,            // kModifierLetter       = 4,
        true,             // kOtherLetter          = 5,
        true,             // kNonSpacingMark       = 6,
        false,            // kEnclosingMark        = 7,
        false,            // kCombiningSpacingMark = 8,
        false,            // kDecimalNumber        = 9,
        false,            // kLetterNumber         = 10,
        false,            // kOtherNumber          = 11,
        true,             // kSpaceSeparator       = 12,
        true,             // kLineSeparator        = 13,
        true,             // kParagraphSeparator   = 14,
        true,             // kControlCharacter     = 15,
        true,             // kFormatCharacter      = 16,
        false,            // UNDEFINED             = 17,
        false,            // kPrivateUseCharacter  = 18,
        false,            // kSurrogate            = 19,
        false,            // kDashPunctuation      = 20,
        false,            // kOpenPunctuation      = 21,
        false,            // kClosePunctuation     = 22,
        false,            // kConnectorPunctuation = 23,
        true,             // kOtherPunctuation     = 24,
        false,            // kMathSymbol           = 25,
        false,            // kCurrencySymbol       = 26,
        false,            // kModifierSymbol       = 27,
        false             // kOtherSymbol          = 28
    };

    private static final int kSentenceAsciiValues[] = {
        //  null    soh     stx     etx     eot     enq     ask     bell
            other,  other,  other,  other,  other,  other,  other,  other,
        //  bs      ht      lf     vt     ff          cr     so     si
            other,  space,  space, other, terminator, space, other, other,
        //  dle     dc1     dc2     dc3     dc4     nak     syn     etb
            other,  other,  other,  other,  other,  other,  other,  other,
        //  can     em      sub     esc     fs      gs      rs      us
            other,  other,  other,  other,  other,  other,  other,  other,
        //  sp      !           "      #      $      %      &      '
            space,  terminator, quote, other, other, other, other, quote,
        //  (            )             *      +      ,      -      .             /
            openBracket, closeBracket, other, other, other, other, ambiguosTerm, other,
        //  0       1       2       3       4       5       6       7
            number, number, number, number, number, number, number, number,
        //  8       9       :       ;       <       =       >       ?
            number, number, other,  other,  other,  other,  other,  terminator,
        //  @       A          B          C          D          E          F          G
            other,  upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
        //  H          I          J          K          L          M          N          O
            upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
        //  P          Q          R          S          T          U          V          W
            upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase, upperCase,
        //  X          Y          Z          [            \      ]             ^      _
            upperCase, upperCase, upperCase, openBracket, other, closeBracket, other, other,
        //  `       a          b          c          d          e          f          g
            other,  lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
        //  h          i          j          k          l          m          n          o
            lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
        //  p          q          r          s          t          u          v          w
            lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase, lowerCase,
        //  x          y          z          {            |      }             ~      del
            lowerCase, lowerCase, lowerCase, openBracket, other, closeBracket, other, other,
        //  ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl
            other,  other,  other,  other,  other,  other,  other,  other,
        //  ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl
            other,  other,  other,  other,  other,  other,  other,  other,
        //  ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl
            other,  other,  other,  other,  other,  other,  other,  other,
        //  ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl    ctrl
            other,  other,  other,  other,  other,  other,  other,  other,
        //  nbsp    

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?