📄 break.c

📁 Pango is a library for layout and rendering of text, with an emphasis on internationalization. Pang
💻 C
📖 第 1 页 / 共 4 页
字号:
		    (wc) == 0x0F84 || \
		    (wc) == 0x1039 || \
		    (wc) == 0x17D2)

/* Code-point range predicates.
 *
 * Each macro below is a plain inclusive range test on a Unicode code
 * point. All of them expand their argument more than once, so the
 * argument must be free of side effects.
 */

/* Types of Japanese characters */
/* JAPANESE spans 0x2F00..0x30FF, i.e. it covers the KANJI, HIRAGANA and
 * KATAKANA ranges below plus everything in between them.
 */
#define JAPANESE(wc) ((wc) >= 0x2F00 && (wc) <= 0x30FF)
/* NOTE(review): 0x2F00..0x2FDF looks like the Kangxi Radicals block rather
 * than the CJK Unified Ideographs - confirm this is the intended range.
 */
#define KANJI(wc)    ((wc) >= 0x2F00 && (wc) <= 0x2FDF)
#define HIRAGANA(wc) ((wc) >= 0x3040 && (wc) <= 0x309F)
#define KATAKANA(wc) ((wc) >= 0x30A0 && (wc) <= 0x30FF)

/* LATIN starts at 0x0020, so it also matches the ASCII space, digits and
 * punctuation, not only letters. The second range is 0x1E00..0x1EFF.
 */
#define LATIN(wc) (((wc) >= 0x0020 && (wc) <= 0x02AF) || ((wc) >= 0x1E00 && (wc) <= 0x1EFF))
#define CYRILLIC(wc) (((wc) >= 0x0400 && (wc) <= 0x052F))
/* The upper bound 0x3FF of the first range is the same value as 0x03FF. */
#define GREEK(wc) (((wc) >= 0x0370 && (wc) <= 0x3FF) || ((wc) >= 0x1F00 && (wc) <= 0x1FFF))
/* KANA is the union of the HIRAGANA and KATAKANA ranges (0x3040..0x30FF). */
#define KANA(wc) ((wc) >= 0x3040 && (wc) <= 0x30FF)
#define HANGUL(wc) ((wc) >= 0xAC00 && (wc) <= 0xD7A3)

/* Evaluates to true when wc is in none of the LATIN, CYRILLIC, GREEK, KANA
 * or HANGUL ranges. pango_default_break() stores the result, computed for
 * the base character, in the backspace_deletes_character attribute at
 * each cursor position.
 */
#define BACKSPACE_DELETES_CHARACTER(wc) (!LATIN (wc) && !CYRILLIC (wc) && !GREEK (wc) && !KANA(wc) && !HANGUL(wc))

/* States of the sentence-boundary state machine; pango_default_break()
 * starts in STATE_SENTENCE_OUTSIDE.
 */
/* p. 132-133 of Unicode spec table 5-6 will help understand this */
typedef enum
{
  STATE_SENTENCE_OUTSIDE,
  STATE_SENTENCE_BODY,
  STATE_SENTENCE_TERM,
  STATE_SENTENCE_POST_TERM_CLOSE,
  STATE_SENTENCE_POST_TERM_SPACE,
  STATE_SENTENCE_POST_TERM_SEP,
  STATE_SENTENCE_DOT,
  STATE_SENTENCE_POST_DOT_CLOSE,
  STATE_SENTENCE_POST_DOT_SPACE,
  STATE_SENTENCE_POST_DOT_OPEN,
  /* never include line/para separators in a sentence for now */
  /* This isn't in the spec, but I can't figure out why they'd include
   * one line/para separator in lines ending with Term but not with
   * period-terminated lines, so I'm doing it for the dot lines also
   */
  STATE_SENTENCE_POST_DOT_SEP
} SentenceState;

/* Kind of word currently being scanned; pango_default_break() starts
 * with WordNone.
 */
/* We call "123" and "foobar" words, but "123foo" is two words;
 * the Unicode spec just calls "123" a non-word
 */
typedef enum
{
  WordNone,
  WordLetters,
  WordNumbers
} WordType;

/**
 * pango_default_break:
 * @text: text to break
 * @length: length of text in bytes (may be -1 if @text is nul-terminated)
 * @analysis: a #PangoAnalysis for the @text
 * @attrs: logical attributes to fill in
 * @attrs_len: size of the array passed as @attrs
 *
 * This is the default break algorithm, used if no language
 * engine overrides it. 
Normally you should use pango_break() * instead. Unlike pango_break(), * @analysis can be %NULL, but only do that if you know what * you're doing. If you need an analysis to pass to pango_break(), * you need to pango_itemize().  In most cases however you should * simply use pango_get_log_attrs(). **/voidpango_default_break (const gchar   *text,		     gint           length,		     PangoAnalysis *analysis,		     PangoLogAttr  *attrs,		     int            attrs_len){  /* The rationale for all this is in section 5.15 of the Unicode 3.0 book,   * the line breaking stuff is also in TR14 on unicode.org   */  /* This is a default break implementation that should work for nearly all   * languages. Language engines can override it optionally.   */  /* FIXME one cheesy optimization here would be to memset attrs to 0   * before we start, and then never assign %FALSE to anything   */  const gchar *next;  gint i;  gunichar prev_wc;  gunichar next_wc;  JamoType prev_jamo;  GUnicodeBreakType next_break_type;  GUnicodeType prev_type;  GUnicodeBreakType prev_break_type; /* skips spaces */  gboolean prev_was_break_space;  WordType current_word_type = WordNone;  gunichar last_word_letter = 0;  gunichar base_character = 0;  SentenceState sentence_state = STATE_SENTENCE_OUTSIDE;  /* Tracks what will be the end of the sentence if a period is   * determined to actually be a sentence-ending period.   
*/  gint possible_sentence_end = -1;  /* possible sentence break before Open* after a period-ended sentence */  gint possible_sentence_boundary = -1;  gboolean almost_done = FALSE;  gboolean done = FALSE;  g_return_if_fail (length == 0 || text != NULL);  g_return_if_fail (attrs != NULL);  next = text;  prev_type = (GUnicodeType) -1;  prev_break_type = G_UNICODE_BREAK_UNKNOWN;  prev_was_break_space = FALSE;  prev_wc = 0;  prev_jamo = NO_JAMO;  if (length == 0 || *text == '\0')    {      next_wc = PARAGRAPH_SEPARATOR;      almost_done = TRUE;    }  else    next_wc = g_utf8_get_char (next);  next_break_type = g_unichar_break_type (next_wc);  next_break_type = BREAK_TYPE_SAFE (next_break_type);  for (i = 0; !done ; i++)    {      GUnicodeType type;      gunichar wc;      GUnicodeBreakType break_type;      BreakOpportunity break_op;      JamoType jamo;      gboolean makes_hangul_syllable;      wc = next_wc;      break_type = next_break_type;      if (almost_done)	{	  /*	   * If we have already reached the end of @text g_utf8_next_char()	   * may not increment next	   */	  next_wc = 0;	  next_break_type = G_UNICODE_BREAK_UNKNOWN;	  done = TRUE;	}      else	{	  next = g_utf8_next_char (next);	  if ((length >= 0 && next >= text + length) || *next == '\0')	    {	      /* This is how we fill in the last element (end position) of the	       * attr array - assume there's a paragraph separators off the end	       * of @text.	       */	      next_wc = PARAGRAPH_SEPARATOR;	      almost_done = TRUE;	    }	  else	    next_wc = g_utf8_get_char (next);	  next_break_type = g_unichar_break_type (next_wc);	  next_break_type = BREAK_TYPE_SAFE (next_break_type);	}      type = g_unichar_type (wc);      jamo = JAMO_TYPE (break_type);      /* Determine wheter this forms a Hangul syllable with prev. 
*/      if (jamo == NO_JAMO)	makes_hangul_syllable = FALSE;      else	{	  JamoType prev_end   = HangulJamoProps[prev_jamo].end  ;	  JamoType this_start = HangulJamoProps[     jamo].start;	  /* See comments before IS_JAMO */	  makes_hangul_syllable = (prev_end == this_start) || (prev_end + 1 == this_start);	}      /* Can't just use the type here since isspace() doesn't       * correspond to a Unicode character type       */      attrs[i].is_white = g_unichar_isspace (wc);      /* Just few spaces have variable width. So explicitly mark them.       */      attrs[i].is_expandable_space = (0x0020 == wc || 0x00A0 == wc);      /* ---- Cursor position breaks (Grapheme breaks) ---- */      if (wc == '\n')	{	  /* Break before line feed unless prev char is a CR */	  if (prev_wc != '\r')	    attrs[i].is_cursor_position = TRUE;	  else	    attrs[i].is_cursor_position = FALSE;	}      else if (i == 0 ||	       prev_type == G_UNICODE_CONTROL ||	       prev_type == G_UNICODE_FORMAT)	{	  /* Break at first position (must be special cased, or if the	   * first char is say a combining mark there won't be a	   * cursor position at the start, which seems wrong to me	   * ???? - maybe it makes sense though, who knows)	   */	  /* break after all format or control characters */	  attrs[i].is_cursor_position = TRUE;	}      else	{	  switch (type)	    {	    case G_UNICODE_CONTROL:	    case G_UNICODE_FORMAT:	      /* Break before all format or control characters */	      attrs[i].is_cursor_position = TRUE;	      break;	    case G_UNICODE_COMBINING_MARK:	    case G_UNICODE_ENCLOSING_MARK:	    case G_UNICODE_NON_SPACING_MARK:	      /* Unicode spec includes "Combining marks plus Tibetan	       * subjoined characters" as joining chars, but lists the	       * Tibetan subjoined characters as combining marks, and	       * g_unichar_type() returns NON_SPACING_MARK for the Tibetan	       * subjoined characters. So who knows, beats me.	       
*/	      /* It's a joining character, break only if preceded by	       * control or format; we already handled the case where	       * it was preceded earlier, so here we know it wasn't,	       * don't break	       */	      attrs[i].is_cursor_position = FALSE;	      break;	    case G_UNICODE_LOWERCASE_LETTER:	    case G_UNICODE_MODIFIER_LETTER:	    case G_UNICODE_OTHER_LETTER:	    case G_UNICODE_TITLECASE_LETTER:	    case G_UNICODE_UPPERCASE_LETTER:	      if (makes_hangul_syllable)		attrs[i].is_cursor_position = FALSE;	      else		{		  /* Handle non-Hangul-syllable non-combining chars */		  /* Break before Jamo if they are in a broken sequence or		   * next to non-Jamo; break if preceded by Jamo; don't		   * break if a letter is preceded by a virama; break in		   * all other cases. No need to check whether we are or are		   * preceded by Jamo explicitly, since a Jamo is not		   * a virama, we just break in all cases where we		   * aren't a or preceded by a virama.  Don't fool with		   * viramas if we aren't part of a script that uses them.		   */		  if (VIRAMA_SCRIPT (wc))		    {		      /* Check whether we're preceded by a virama; this		       * could use some optimization.		       
*/		      if (VIRAMA (prev_wc))			attrs[i].is_cursor_position = FALSE;		      else			attrs[i].is_cursor_position = TRUE;		    }		  else		    {		      attrs[i].is_cursor_position = TRUE;		    }		}	      break;	    default:	      /* Some weirdo char, just break here, why not */	      attrs[i].is_cursor_position = TRUE;	      break;	    }	}      /* If this is a grapheme boundary, we have to decide if backspace       * deletes a character or the whole grapheme cluster */      if (attrs[i].is_cursor_position)	attrs[i].backspace_deletes_character = BACKSPACE_DELETES_CHARACTER (base_character);      else	attrs[i].backspace_deletes_character = FALSE;      /* ---- Line breaking ---- */      break_op = BREAK_ALREADY_HANDLED;      g_assert (prev_break_type != G_UNICODE_BREAK_SPACE);      attrs[i].is_line_break = FALSE;      attrs[i].is_mandatory_break = FALSE;      if (attrs[i].is_cursor_position) /* If it's not a grapheme boundary,					* it's not a line break either					*/	{	  /* space followed by a combining mark is handled	   * specially; (rule 7a from TR 14)	   */	  if (break_type == G_UNICODE_BREAK_SPACE &&	      next_break_type == G_UNICODE_BREAK_COMBINING_MARK)	    break_type = G_UNICODE_BREAK_IDEOGRAPHIC;	  /* Unicode doesn't specify char wrap; we wrap around all chars	   * except where a line break is prohibited, which means we	   * effectively break everywhere except inside runs of spaces.	   
*/	  attrs[i].is_char_break = TRUE;	  /* Make any necessary replacements first */	  switch (prev_break_type)	    {	    case G_UNICODE_BREAK_HANGUL_L_JAMO:	    case G_UNICODE_BREAK_HANGUL_V_JAMO:	    case G_UNICODE_BREAK_HANGUL_T_JAMO:	    case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:	    case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:	      /* treat Jamo as IDEOGRAPHIC from now	       */	      prev_break_type = G_UNICODE_BREAK_IDEOGRAPHIC;	      break;	    case G_UNICODE_BREAK_AMBIGUOUS:	      /* FIXME	       * we need to resolve the East Asian width	       * to decide what to do here	       */	    case G_UNICODE_BREAK_COMPLEX_CONTEXT:	      /* FIXME	       * language engines should handle this case...	       */	    case G_UNICODE_BREAK_UNKNOWN:	      /* convert unknown, complex, ambiguous to ALPHABETIC	       */	      prev_break_type = G_UNICODE_BREAK_ALPHABETIC;	      break;	    default:	      ;	    }	  switch (prev_break_type)	    {	    case G_UNICODE_BREAK_MANDATORY:	    case G_UNICODE_BREAK_LINE_FEED:	    case G_UNICODE_BREAK_NEXT_LINE:	      attrs[i].is_line_break = TRUE;	      attrs[i].is_mandatory_break = TRUE;	      break;	    case G_UNICODE_BREAK_CARRIAGE_RETURN:	      if (wc != '\n')		{		  attrs[i].is_line_break = TRUE;		  attrs[i].is_mandatory_break = TRUE;		}	      break;	    case G_UNICODE_BREAK_CONTINGENT:	      /* can break after 0xFFFC by default, though we might want	       * to eventually have a PangoLayout setting or	       * PangoAttribute that disables this, if for some	       * application breaking after objects is not desired.	       
*/	      break_op = BREAK_ALLOWED;	      break;	    case G_UNICODE_BREAK_SURROGATE:	      g_assert_not_reached ();	      break;	    default:	      g_assert (IN_BREAK_TABLE (prev_break_type));	      /* Note that our table assumes that combining marks	       * are only applied to alphabetic characters;	       * tech report 14 explains how to remove this assumption	       * from the code, if anyone ever cares, but it shouldn't	       * be a problem. Also this issue sort of goes	       * away since we only look for breaks on grapheme	       * boundaries.	       */	      switch (break_type)		{		case G_UNICODE_BREAK_MANDATORY:		case G_UNICODE_BREAK_LINE_FEED:		case G_UNICODE_BREAK_CARRIAGE_RETURN:		case G_UNICODE_BREAK_NEXT_LINE:		case G_UNICODE_BREAK_SPACE:		  /* These types all "pile up" at the end of lines and		   * get elided.		   */		  break_op = BREAK_PROHIBITED;		  break;		case G_UNICODE_BREAK_CONTINGENT:		  /* break before 0xFFFC by default, eventually		   * make this configurable?		   */		  break_op = BREAK_ALLOWED;		  break;		case G_UNICODE_BREAK_SURROGATE:		  g_assert_not_reached ();		  break;		/* Hangul additions are from Unicode 4.1 UAX#14 */		case G_UNICODE_BREAK_HANGUL_L_JAMO:		case G_UNICODE_BREAK_HANGUL_V_JAMO:		case G_UNICODE_BREAK_HANGUL_T_JAMO:		case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:		case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:		  /* treat Jamo as IDEOGRAPHIC from now		   */		  break_type = G_UNICODE_BREAK_IDEOGRAPHIC;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -