📄 break.c
字号:
(wc) == 0x0F84 || \
(wc) == 0x1039 || \
(wc) == 0x17D2)

/*
 * Script-range predicates.
 *
 * Each macro below tests whether the code point `wc` lies inside one or
 * more fixed, inclusive code point ranges.  They are plain range checks:
 * no Unicode property lookup is performed.  `wc` is expanded several
 * times, so the argument must be free of side effects.
 */

/* Types of Japanese characters */

/* 0x2F00..0x30FF: one contiguous span that contains the KANJI, HIRAGANA
 * and KATAKANA ranges below plus everything between them. */
#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
/* 0x2F00..0x2FDF.
 * NOTE(review): this range is the Kangxi Radicals block only; the main
 * CJK ideograph ranges are not included -- confirm that is intended. */
#define KANJI(wc) ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
/* 0x3040..0x309F */
#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
/* 0x30A0..0x30FF */
#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)

/* 0x0020..0x02AF or 0x1E00..0x1EFF.  Note that the first range starts at
 * the space character, so it also matches ASCII digits and punctuation. */
#define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
/* 0x0400..0x052F */
#define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
/* 0x0370..0x03FF or 0x1F00..0x1FFF */
#define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
/* 0x3040..0x30FF: exactly the union of the HIRAGANA and KATAKANA ranges */
#define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF)
/* 0xAC00..0xD7A3 */
#define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)

/* True when `wc` is in none of the LATIN, CYRILLIC, GREEK, KANA or HANGUL
 * ranges above.  pango_default_break() evaluates this on the tracked base
 * character at each cursor position to fill in the
 * backspace_deletes_character attribute, i.e. to decide whether backspace
 * deletes a single character or the whole grapheme cluster. */
#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))

/* p. 132-133 of Unicode spec table 5-6 will help understand this */
/* States of the sentence-boundary tracking in pango_default_break(); its
 * `sentence_state` local starts out as STATE_SENTENCE_OUTSIDE.  The
 * transition logic is not visible in this part of the file. */
typedef enum
{
  STATE_SENTENCE_OUTSIDE,
  STATE_SENTENCE_BODY,
  STATE_SENTENCE_TERM,
  STATE_SENTENCE_POST_TERM_CLOSE,
  STATE_SENTENCE_POST_TERM_SPACE,
  STATE_SENTENCE_POST_TERM_SEP,
  STATE_SENTENCE_DOT,
  STATE_SENTENCE_POST_DOT_CLOSE,
  STATE_SENTENCE_POST_DOT_SPACE,
  STATE_SENTENCE_POST_DOT_OPEN,
  /* never include line/para separators in a sentence for now */
  /* This isn't in the spec, but I can't figure out why they'd include
   * one line/para separator in lines ending with Term but not with
   * period-terminated lines, so I'm doing it for the dot lines also
   */
  STATE_SENTENCE_POST_DOT_SEP
} SentenceState;

/* We call "123" and "foobar" words, but "123foo" is two words;
 * the Unicode spec just calls "123" a non-word
 */
/* Kind of word currently being scanned; pango_default_break() initialises
 * its `current_word_type` local to WordNone. */
typedef enum
{
  WordNone,
  WordLetters,
  WordNumbers
} WordType;

/**
 * pango_default_break:
 * @text: text to break
 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
 * @analysis: a #PangoAnalysis for the @text
 * @attrs: logical attributes to fill in
 * @attrs_len: size of the array passed as @attrs
 *
 * This is the default break algorithm, used if no language
 * engine overrides it. Normally you should use pango_break()
 * instead.
Unlike pango_break(), * @analysis can be %NULL, but only do that if you know what * you're doing. If you need an analysis to pass to pango_break(), * you need to pango_itemize(). In most cases however you should * simply use pango_get_log_attrs(). **/voidpango_default_break (const gchar *text, gint length, PangoAnalysis *analysis, PangoLogAttr *attrs, int attrs_len){ /* The rationale for all this is in section 5.15 of the Unicode 3.0 book, * the line breaking stuff is also in TR14 on unicode.org */ /* This is a default break implementation that should work for nearly all * languages. Language engines can override it optionally. */ /* FIXME one cheesy optimization here would be to memset attrs to 0 * before we start, and then never assign %FALSE to anything */ const gchar *next; gint i; gunichar prev_wc; gunichar next_wc; JamoType prev_jamo; GUnicodeBreakType next_break_type; GUnicodeType prev_type; GUnicodeBreakType prev_break_type; /* skips spaces */ gboolean prev_was_break_space; WordType current_word_type = WordNone; gunichar last_word_letter = 0; gunichar base_character = 0; SentenceState sentence_state = STATE_SENTENCE_OUTSIDE; /* Tracks what will be the end of the sentence if a period is * determined to actually be a sentence-ending period. 
*/ gint possible_sentence_end = -1; /* possible sentence break before Open* after a period-ended sentence */ gint possible_sentence_boundary = -1; gboolean almost_done = FALSE; gboolean done = FALSE; g_return_if_fail (length == 0 || text != NULL); g_return_if_fail (attrs != NULL); next = text; prev_type = (GUnicodeType) -1; prev_break_type = G_UNICODE_BREAK_UNKNOWN; prev_was_break_space = FALSE; prev_wc = 0; prev_jamo = NO_JAMO; if (length == 0 || *text == '\0') { next_wc = PARAGRAPH_SEPARATOR; almost_done = TRUE; } else next_wc = g_utf8_get_char (next); next_break_type = g_unichar_break_type (next_wc); next_break_type = BREAK_TYPE_SAFE (next_break_type); for (i = 0; !done ; i++) { GUnicodeType type; gunichar wc; GUnicodeBreakType break_type; BreakOpportunity break_op; JamoType jamo; gboolean makes_hangul_syllable; wc = next_wc; break_type = next_break_type; if (almost_done) { /* * If we have already reached the end of @text g_utf8_next_char() * may not increment next */ next_wc = 0; next_break_type = G_UNICODE_BREAK_UNKNOWN; done = TRUE; } else { next = g_utf8_next_char (next); if ((length >= 0 && next >= text + length) || *next == '\0') { /* This is how we fill in the last element (end position) of the * attr array - assume there's a paragraph separators off the end * of @text. */ next_wc = PARAGRAPH_SEPARATOR; almost_done = TRUE; } else next_wc = g_utf8_get_char (next); next_break_type = g_unichar_break_type (next_wc); next_break_type = BREAK_TYPE_SAFE (next_break_type); } type = g_unichar_type (wc); jamo = JAMO_TYPE (break_type); /* Determine wheter this forms a Hangul syllable with prev. 
*/ if (jamo == NO_JAMO) makes_hangul_syllable = FALSE; else { JamoType prev_end = HangulJamoProps[prev_jamo].end ; JamoType this_start = HangulJamoProps[ jamo].start; /* See comments before IS_JAMO */ makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start); } /* Can't just use the type here since isspace() doesn't * correspond to a Unicode character type */ attrs[i].is_white = g_unichar_isspace (wc); /* Just few spaces have variable width. So explicitly mark them. */ attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc); /* ---- Cursor position breaks (Grapheme breaks) ---- */ if (wc == '\n') { /* Break before line feed unless prev char is a CR */ if (prev_wc != '\r') attrs[i].is_cursor_position = TRUE; else attrs[i].is_cursor_position = FALSE; } else if (i == 0 || prev_type == G_UNICODE_CONTROL || prev_type == G_UNICODE_FORMAT) { /* Break at first position (must be special cased, or if the * first char is say a combining mark there won't be a * cursor position at the start, which seems wrong to me * ???? - maybe it makes sense though, who knows) */ /* break after all format or control characters */ attrs[i].is_cursor_position = TRUE; } else { switch (type) { case G_UNICODE_CONTROL: case G_UNICODE_FORMAT: /* Break before all format or control characters */ attrs[i].is_cursor_position = TRUE; break; case G_UNICODE_COMBINING_MARK: case G_UNICODE_ENCLOSING_MARK: case G_UNICODE_NON_SPACING_MARK: /* Unicode spec includes "Combining marks plus Tibetan * subjoined characters" as joining chars, but lists the * Tibetan subjoined characters as combining marks, and * g_unichar_type() returns NON_SPACING_MARK for the Tibetan * subjoined characters. So who knows, beats me. 
*/ /* It's a joining character, break only if preceded by * control or format; we already handled the case where * it was preceded earlier, so here we know it wasn't, * don't break */ attrs[i].is_cursor_position = FALSE; break; case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: if (makes_hangul_syllable) attrs[i].is_cursor_position = FALSE; else { /* Handle non-Hangul-syllable non-combining chars */ /* Break before Jamo if they are in a broken sequence or * next to non-Jamo; break if preceded by Jamo; don't * break if a letter is preceded by a virama; break in * all other cases. No need to check whether we are or are * preceded by Jamo explicitly, since a Jamo is not * a virama, we just break in all cases where we * aren't a or preceded by a virama. Don't fool with * viramas if we aren't part of a script that uses them. */ if (VIRAMA_SCRIPT (wc)) { /* Check whether we're preceded by a virama; this * could use some optimization. 
*/ if (VIRAMA (prev_wc)) attrs[i].is_cursor_position = FALSE; else attrs[i].is_cursor_position = TRUE; } else { attrs[i].is_cursor_position = TRUE; } } break; default: /* Some weirdo char, just break here, why not */ attrs[i].is_cursor_position = TRUE; break; } } /* If this is a grapheme boundary, we have to decide if backspace * deletes a character or the whole grapheme cluster */ if (attrs[i].is_cursor_position) attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character); else attrs[i].backspace_deletes_character = FALSE; /* ---- Line breaking ---- */ break_op = BREAK_ALREADY_HANDLED; g_assert (prev_break_type != G_UNICODE_BREAK_SPACE); attrs[i].is_line_break = FALSE; attrs[i].is_mandatory_break = FALSE; if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary, * it's not a line break either */ { /* space followed by a combining mark is handled * specially; (rule 7a from TR 14) */ if (break_type == G_UNICODE_BREAK_SPACE && next_break_type == G_UNICODE_BREAK_COMBINING_MARK) break_type = G_UNICODE_BREAK_IDEOGRAPHIC; /* Unicode doesn't specify char wrap; we wrap around all chars * except where a line break is prohibited, which means we * effectively break everywhere except inside runs of spaces. */ attrs[i].is_char_break = TRUE; /* Make any necessary replacements first */ switch (prev_break_type) { case G_UNICODE_BREAK_HANGUL_L_JAMO: case G_UNICODE_BREAK_HANGUL_V_JAMO: case G_UNICODE_BREAK_HANGUL_T_JAMO: case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE: case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: /* treat Jamo as IDEOGRAPHIC from now */ prev_break_type = G_UNICODE_BREAK_IDEOGRAPHIC; break; case G_UNICODE_BREAK_AMBIGUOUS: /* FIXME * we need to resolve the East Asian width * to decide what to do here */ case G_UNICODE_BREAK_COMPLEX_CONTEXT: /* FIXME * language engines should handle this case... 
*/ case G_UNICODE_BREAK_UNKNOWN: /* convert unknown, complex, ambiguous to ALPHABETIC */ prev_break_type = G_UNICODE_BREAK_ALPHABETIC; break; default: ; } switch (prev_break_type) { case G_UNICODE_BREAK_MANDATORY: case G_UNICODE_BREAK_LINE_FEED: case G_UNICODE_BREAK_NEXT_LINE: attrs[i].is_line_break = TRUE; attrs[i].is_mandatory_break = TRUE; break; case G_UNICODE_BREAK_CARRIAGE_RETURN: if (wc != '\n') { attrs[i].is_line_break = TRUE; attrs[i].is_mandatory_break = TRUE; } break; case G_UNICODE_BREAK_CONTINGENT: /* can break after 0xFFFC by default, though we might want * to eventually have a PangoLayout setting or * PangoAttribute that disables this, if for some * application breaking after objects is not desired. */ break_op = BREAK_ALLOWED; break; case G_UNICODE_BREAK_SURROGATE: g_assert_not_reached (); break; default: g_assert (IN_BREAK_TABLE (prev_break_type)); /* Note that our table assumes that combining marks * are only applied to alphabetic characters; * tech report 14 explains how to remove this assumption * from the code, if anyone ever cares, but it shouldn't * be a problem. Also this issue sort of goes * away since we only look for breaks on grapheme * boundaries. */ switch (break_type) { case G_UNICODE_BREAK_MANDATORY: case G_UNICODE_BREAK_LINE_FEED: case G_UNICODE_BREAK_CARRIAGE_RETURN: case G_UNICODE_BREAK_NEXT_LINE: case G_UNICODE_BREAK_SPACE: /* These types all "pile up" at the end of lines and * get elided. */ break_op = BREAK_PROHIBITED; break; case G_UNICODE_BREAK_CONTINGENT: /* break before 0xFFFC by default, eventually * make this configurable? 
*/ break_op = BREAK_ALLOWED; break; case G_UNICODE_BREAK_SURROGATE: g_assert_not_reached (); break; /* Hangul additions are from Unicode 4.1 UAX#14 */ case G_UNICODE_BREAK_HANGUL_L_JAMO: case G_UNICODE_BREAK_HANGUL_V_JAMO: case G_UNICODE_BREAK_HANGUL_T_JAMO: case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE: case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE: /* treat Jamo as IDEOGRAPHIC from now */ break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -