📄 break.c
字号:
if (makes_hangul_syllable) break_op = BREAK_IF_SPACES; else break_op = BREAK_ALLOWED; break; case G_UNICODE_BREAK_AMBIGUOUS: /* FIXME: * we need to resolve the East Asian width * to decide what to do here */ case G_UNICODE_BREAK_COMPLEX_CONTEXT: /* FIXME: * language engines should handle this case... */ case G_UNICODE_BREAK_UNKNOWN: /* treat unknown, complex, and ambiguous like ALPHABETIC * for now */ break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC); break; default: g_assert (IN_BREAK_TABLE (break_type)); break_op = BREAK_OP (prev_break_type, break_type); break; } break; } if (break_op != BREAK_ALREADY_HANDLED) { switch (break_op) { case BREAK_PROHIBITED: /* can't break here */ attrs[i].is_char_break = FALSE; break; case BREAK_IF_SPACES: /* break if prev char was space */ if (prev_was_break_space) attrs[i].is_line_break = TRUE; break; case BREAK_ALLOWED: attrs[i].is_line_break = TRUE; break; default: g_assert_not_reached (); break; } } } if (break_type != G_UNICODE_BREAK_SPACE) { prev_break_type = break_type; prev_was_break_space = FALSE; prev_jamo = jamo; } else prev_was_break_space = TRUE; /* ---- Word breaks ---- */ /* default to not a word start/end */ attrs[i].is_word_start = FALSE; attrs[i].is_word_end = FALSE; if (current_word_type != WordNone) { /* Check for a word end */ switch (type) { case G_UNICODE_COMBINING_MARK: case G_UNICODE_ENCLOSING_MARK: case G_UNICODE_NON_SPACING_MARK: case G_UNICODE_FORMAT: /* nothing, we just eat these up as part of the word */ break; case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: if (current_word_type == WordLetters) { /* Japanese special cases for ending the word */ if (JAPANESE (last_word_letter) || JAPANESE (wc)) { if ((HIRAGANA (last_word_letter) && !HIRAGANA (wc)) || (KATAKANA (last_word_letter) && !(KATAKANA (wc) || HIRAGANA (wc))) || (KANJI (last_word_letter) && !(HIRAGANA (wc) || KANJI (wc))) || (JAPANESE (last_word_letter) && !JAPANESE (wc)) || (!JAPANESE (last_word_letter) && JAPANESE (wc))) attrs[i].is_word_end = TRUE; } } else { /* end the number word, start the letter word */ attrs[i].is_word_end = TRUE; attrs[i].is_word_start = TRUE; current_word_type = WordLetters; } last_word_letter = wc; break; case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: if (current_word_type != WordNumbers) { attrs[i].is_word_end = TRUE; attrs[i].is_word_start = TRUE; current_word_type = WordNumbers; } last_word_letter = wc; break; default: /* Punctuation, control/format chars, etc. all end a word. */ attrs[i].is_word_end = TRUE; current_word_type = WordNone; break; } } else { /* Check for a word start */ switch (type) { case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: current_word_type = WordLetters; last_word_letter = wc; attrs[i].is_word_start = TRUE; break; case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: current_word_type = WordNumbers; last_word_letter = wc; attrs[i].is_word_start = TRUE; break; default: /* No word here */ break; } } /* ---- Sentence breaks ---- */ /* The Unicode spec specifies sentence breakpoints, so that a piece of * text would be partitioned into sentences, and all characters would * be inside some sentence. This code implements that for is_sentence_boundary, * but tries to keep leading/trailing whitespace out of sentences for * the start/end flags */ /* The Unicode spec seems to say that one trailing line/para * separator can be tacked on to a sentence ending in ! or ?, * but not a sentence ending in period; I think they're on crack * so am allowing one to be tacked onto a sentence ending in period. */#define MAYBE_START_NEW_SENTENCE \ switch (type) \ { \ case G_UNICODE_LINE_SEPARATOR: \ case G_UNICODE_PARAGRAPH_SEPARATOR: \ case G_UNICODE_CONTROL: \ case G_UNICODE_FORMAT: \ case G_UNICODE_SPACE_SEPARATOR: \ sentence_state = STATE_SENTENCE_OUTSIDE; \ break; \ \ default: \ sentence_state = STATE_SENTENCE_BODY; \ attrs[i].is_sentence_start = TRUE; \ break; \ } /* No sentence break at the start of the text */ /* default to not a sentence breakpoint */ attrs[i].is_sentence_boundary = FALSE; attrs[i].is_sentence_start = FALSE; attrs[i].is_sentence_end = FALSE; /* FIXME the Unicode spec lumps control/format chars with * line/para separators in descriptive text, but not in the * character class specs, in table 5-6, so who knows whether you * are actually supposed to break on control/format * characters. Seems semi-broken to break on tabs... */ /* Break after line/para separators except carriage return * followed by newline */ switch (prev_type) { case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: case G_UNICODE_CONTROL: case G_UNICODE_FORMAT: if (wc == '\r') { if (next_wc != '\n') attrs[i].is_sentence_boundary = TRUE; } else attrs[i].is_sentence_boundary = TRUE; break; default: break; } /* break before para/line separators except newline following * carriage return */ switch (type) { case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: case G_UNICODE_CONTROL: case G_UNICODE_FORMAT: if (wc == '\n') { if (prev_wc != '\r') attrs[i].is_sentence_boundary = TRUE; } else attrs[i].is_sentence_boundary = TRUE; break; default: break; } switch (sentence_state) { case STATE_SENTENCE_OUTSIDE: /* Start sentence if we have non-whitespace/format/control */ switch (type) { case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: case G_UNICODE_CONTROL: case G_UNICODE_FORMAT: case G_UNICODE_SPACE_SEPARATOR: break; default: attrs[i].is_sentence_start = TRUE; sentence_state = STATE_SENTENCE_BODY; break; } break; case STATE_SENTENCE_BODY: /* If we already broke here due to separators, end the sentence. */ if (attrs[i].is_sentence_boundary) { attrs[i].is_sentence_end = TRUE; MAYBE_START_NEW_SENTENCE; } else { if (wc == '.') sentence_state = STATE_SENTENCE_DOT; else if (wc == '?' || wc == '!') sentence_state = STATE_SENTENCE_TERM; } break; case STATE_SENTENCE_TERM: /* End sentence on anything but close punctuation and some * loosely-specified OTHER_PUNCTUATION such as period, * comma, etc.; follow Unicode rules for breaks */ switch (type) { case G_UNICODE_OTHER_PUNCTUATION: case G_UNICODE_CLOSE_PUNCTUATION: if (type == G_UNICODE_CLOSE_PUNCTUATION || wc == '.' || wc == ',' || wc == '?' || wc == '!') sentence_state = STATE_SENTENCE_POST_TERM_CLOSE; else { attrs[i].is_sentence_end = TRUE; attrs[i].is_sentence_boundary = TRUE; MAYBE_START_NEW_SENTENCE; } break; case G_UNICODE_SPACE_SEPARATOR: attrs[i].is_sentence_end = TRUE; sentence_state = STATE_SENTENCE_POST_TERM_SPACE; break; case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: attrs[i].is_sentence_end = TRUE; sentence_state = STATE_SENTENCE_POST_TERM_SEP; break; default: attrs[i].is_sentence_end = TRUE; attrs[i].is_sentence_boundary = TRUE; MAYBE_START_NEW_SENTENCE; break; } break; case STATE_SENTENCE_POST_TERM_CLOSE: /* End sentence on anything besides more punctuation; follow * rules for breaks */ switch (type) { case G_UNICODE_OTHER_PUNCTUATION: case G_UNICODE_CLOSE_PUNCTUATION: if (type == G_UNICODE_CLOSE_PUNCTUATION || wc == '.' || wc == ',' || wc == '?' || wc == '!') /* continue in this state */ ; else { attrs[i].is_sentence_end = TRUE; attrs[i].is_sentence_boundary = TRUE; MAYBE_START_NEW_SENTENCE; } break; case G_UNICODE_SPACE_SEPARATOR: attrs[i].is_sentence_end = TRUE; sentence_state = STATE_SENTENCE_POST_TERM_SPACE; break; case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: attrs[i].is_sentence_end = TRUE; /* undo the unconditional break-at-all-line/para-separators * from above; I'm not sure this is what the Unicode spec * intends, but it seems right - we get to include * a single line/para separator in the sentence according * to their rules */ attrs[i].is_sentence_boundary = FALSE; sentence_state = STATE_SENTENCE_POST_TERM_SEP; break; default: attrs[i].is_sentence_end = TRUE; attrs[i].is_sentence_boundary = TRUE; MAYBE_START_NEW_SENTENCE; break; } break; case STATE_SENTENCE_POST_TERM_SPACE: /* Sentence is definitely already ended; to enter this state * we had to see a space, which ends the sentence. */ switch (type) { case G_UNICODE_SPACE_SEPARATOR: /* continue in this state */ break; case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: /* undo the unconditional break-at-all-line/para-separators * from above; I'm not sure this is what the Unicode spec * intends, but it seems right */ attrs[i].is_sentence_boundary = FALSE; sentence_state = STATE_SENTENCE_POST_TERM_SEP; break; default: attrs[i].is_sentence_boundary = TRUE; MAYBE_START_NEW_SENTENCE; break; } break; case STATE_SENTENCE_POST_TERM_SEP: /* Break is forced at this point, unless we're a newline * after a CR, then we will break after the newline on the * next iteration. Only a single Sep can be in the * sentence. */ if (!(prev_wc == '\r' && wc == '\n')) attrs[i].is_sentence_boundary = TRUE; MAYBE_START_NEW_SENTENCE; break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -