📄 break.c

📁 Pango is a library for layout and rendering of text, with an emphasis on internationalization. Pango…
💻 C
📖 第 1 页 / 共 4 页
字号:
		  if (makes_hangul_syllable)		    break_op = BREAK_IF_SPACES;		  else		    break_op = BREAK_ALLOWED;		  break;		case G_UNICODE_BREAK_AMBIGUOUS:		  /* FIXME:		   * we need to resolve the East Asian width		   * to decide what to do here		   */		case G_UNICODE_BREAK_COMPLEX_CONTEXT:		  /* FIXME:		   * language engines should handle this case...		   */		case G_UNICODE_BREAK_UNKNOWN:		  /* treat unknown, complex, and ambiguous like ALPHABETIC		   * for now		   */		  break_op = BREAK_OP (prev_break_type, G_UNICODE_BREAK_ALPHABETIC);		  break;		default:		  g_assert (IN_BREAK_TABLE (break_type));		  break_op = BREAK_OP (prev_break_type, break_type);		  break;		}	      break;	    }	  if (break_op != BREAK_ALREADY_HANDLED)	    {	      switch (break_op)		{		case BREAK_PROHIBITED:		  /* can't break here */		  attrs[i].is_char_break = FALSE;		  break;		case BREAK_IF_SPACES:		  /* break if prev char was space */		  if (prev_was_break_space)		    attrs[i].is_line_break = TRUE;		  break;		case BREAK_ALLOWED:		  attrs[i].is_line_break = TRUE;		  break;		default:		  g_assert_not_reached ();		  break;		}	    }	}      if (break_type != G_UNICODE_BREAK_SPACE)	{	  prev_break_type = break_type;	  prev_was_break_space = FALSE;	  prev_jamo = jamo;	}      else	prev_was_break_space = TRUE;      /* ---- Word breaks ---- */      /* default to not a word start/end */      attrs[i].is_word_start = FALSE;      attrs[i].is_word_end = FALSE;      if (current_word_type != WordNone)	{	  /* Check for a word end */	  switch (type)	    {	    case G_UNICODE_COMBINING_MARK:	    case G_UNICODE_ENCLOSING_MARK:	    case G_UNICODE_NON_SPACING_MARK:	    case G_UNICODE_FORMAT:	      /* nothing, we just eat these up as part of the word */	      break;	    case G_UNICODE_LOWERCASE_LETTER:	    case G_UNICODE_MODIFIER_LETTER:	    case G_UNICODE_OTHER_LETTER:	    case G_UNICODE_TITLECASE_LETTER:	    case G_UNICODE_UPPERCASE_LETTER:	      if (current_word_type == WordLetters)		{		  /* Japanese special cases for 
ending the word */		  if (JAPANESE (last_word_letter) ||		      JAPANESE (wc))		    {		      if ((HIRAGANA (last_word_letter) &&			   !HIRAGANA (wc)) ||			  (KATAKANA (last_word_letter) &&			   !(KATAKANA (wc) || HIRAGANA (wc))) ||			  (KANJI (last_word_letter) &&			   !(HIRAGANA (wc) || KANJI (wc))) ||			  (JAPANESE (last_word_letter) &&			   !JAPANESE (wc)) ||			  (!JAPANESE (last_word_letter) &&			   JAPANESE (wc)))			attrs[i].is_word_end = TRUE;		    }		}	      else		{		  /* end the number word, start the letter word */		  attrs[i].is_word_end = TRUE;		  attrs[i].is_word_start = TRUE;		  current_word_type = WordLetters;		}	      last_word_letter = wc;	      break;	    case G_UNICODE_DECIMAL_NUMBER:	    case G_UNICODE_LETTER_NUMBER:	    case G_UNICODE_OTHER_NUMBER:	      if (current_word_type != WordNumbers)		{		  attrs[i].is_word_end = TRUE;		  attrs[i].is_word_start = TRUE;		  current_word_type = WordNumbers;		}	      last_word_letter = wc;	      break;	    default:	      /* Punctuation, control/format chars, etc. all end a word. 
*/	      attrs[i].is_word_end = TRUE;	      current_word_type = WordNone;	      break;	    }	}      else	{	  /* Check for a word start */	  switch (type)	    {	    case G_UNICODE_LOWERCASE_LETTER:	    case G_UNICODE_MODIFIER_LETTER:	    case G_UNICODE_OTHER_LETTER:	    case G_UNICODE_TITLECASE_LETTER:	    case G_UNICODE_UPPERCASE_LETTER:	      current_word_type = WordLetters;	      last_word_letter = wc;	      attrs[i].is_word_start = TRUE;	      break;	    case G_UNICODE_DECIMAL_NUMBER:	    case G_UNICODE_LETTER_NUMBER:	    case G_UNICODE_OTHER_NUMBER:	      current_word_type = WordNumbers;	      last_word_letter = wc;	      attrs[i].is_word_start = TRUE;	      break;	    default:	      /* No word here */	      break;	    }	}      /* ---- Sentence breaks ---- */      /* The Unicode spec specifies sentence breakpoints, so that a piece of       * text would be partitioned into sentences, and all characters would       * be inside some sentence. This code implements that for is_sentence_boundary,       * but tries to keep leading/trailing whitespace out of sentences for       * the start/end flags       */      /* The Unicode spec seems to say that one trailing line/para       * separator can be tacked on to a sentence ending in ! or ?,       * but not a sentence ending in period; I think they're on crack       * so am allowing one to be tacked onto a sentence ending in period.       
*/#define MAYBE_START_NEW_SENTENCE                                \	      switch (type)                                     \		{                                               \		case G_UNICODE_LINE_SEPARATOR:                  \		case G_UNICODE_PARAGRAPH_SEPARATOR:             \		case G_UNICODE_CONTROL:                         \		case G_UNICODE_FORMAT:                          \		case G_UNICODE_SPACE_SEPARATOR:                 \		  sentence_state = STATE_SENTENCE_OUTSIDE;      \		  break;                                        \								\		default:                                        \		  sentence_state = STATE_SENTENCE_BODY;         \		  attrs[i].is_sentence_start = TRUE;            \		  break;                                        \		}      /* No sentence break at the start of the text */      /* default to not a sentence breakpoint */      attrs[i].is_sentence_boundary = FALSE;      attrs[i].is_sentence_start = FALSE;      attrs[i].is_sentence_end = FALSE;      /* FIXME the Unicode spec lumps control/format chars with       * line/para separators in descriptive text, but not in the       * character class specs, in table 5-6, so who knows whether you       * are actually supposed to break on control/format       * characters. Seems semi-broken to break on tabs...       
*/      /* Break after line/para separators except carriage return       * followed by newline       */      switch (prev_type)	{	case G_UNICODE_LINE_SEPARATOR:	case G_UNICODE_PARAGRAPH_SEPARATOR:	case G_UNICODE_CONTROL:	case G_UNICODE_FORMAT:	  if (wc == '\r')	    {	      if (next_wc != '\n')		attrs[i].is_sentence_boundary = TRUE;	    }	  else	    attrs[i].is_sentence_boundary = TRUE;	  break;	default:	  break;	}      /* break before para/line separators except newline following       * carriage return       */      switch (type)	{	case G_UNICODE_LINE_SEPARATOR:	case G_UNICODE_PARAGRAPH_SEPARATOR:	case G_UNICODE_CONTROL:	case G_UNICODE_FORMAT:	  if (wc == '\n')	    {	      if (prev_wc != '\r')		attrs[i].is_sentence_boundary = TRUE;	    }	  else	    attrs[i].is_sentence_boundary = TRUE;	  break;	default:	  break;	}      switch (sentence_state)	{	case STATE_SENTENCE_OUTSIDE:	  /* Start sentence if we have non-whitespace/format/control */	  switch (type)	    {	    case G_UNICODE_LINE_SEPARATOR:	    case G_UNICODE_PARAGRAPH_SEPARATOR:	    case G_UNICODE_CONTROL:	    case G_UNICODE_FORMAT:	    case G_UNICODE_SPACE_SEPARATOR:	      break;	    default:	      attrs[i].is_sentence_start = TRUE;	      sentence_state = STATE_SENTENCE_BODY;	      break;	    }	  break;	case STATE_SENTENCE_BODY:	  /* If we already broke here due to separators, end the sentence. */	  if (attrs[i].is_sentence_boundary)	    {	      attrs[i].is_sentence_end = TRUE;	      MAYBE_START_NEW_SENTENCE;	    }	  else	    {	      if (wc == '.')		sentence_state = STATE_SENTENCE_DOT;	      else if (wc == '?' 
|| wc == '!')		sentence_state = STATE_SENTENCE_TERM;	    }	  break;	case STATE_SENTENCE_TERM:	  /* End sentence on anything but close punctuation and some	   * loosely-specified OTHER_PUNCTUATION such as period,	   * comma, etc.; follow Unicode rules for breaks	   */	  switch (type)	    {	    case G_UNICODE_OTHER_PUNCTUATION:	    case G_UNICODE_CLOSE_PUNCTUATION:	      if (type == G_UNICODE_CLOSE_PUNCTUATION ||		  wc == '.' ||		  wc == ',' ||		  wc == '?' ||		  wc == '!')		sentence_state = STATE_SENTENCE_POST_TERM_CLOSE;	      else		{		  attrs[i].is_sentence_end = TRUE;		  attrs[i].is_sentence_boundary = TRUE;		  MAYBE_START_NEW_SENTENCE;		}	      break;	    case G_UNICODE_SPACE_SEPARATOR:	      attrs[i].is_sentence_end = TRUE;	      sentence_state = STATE_SENTENCE_POST_TERM_SPACE;	      break;	    case G_UNICODE_LINE_SEPARATOR:	    case G_UNICODE_PARAGRAPH_SEPARATOR:	      attrs[i].is_sentence_end = TRUE;	      sentence_state = STATE_SENTENCE_POST_TERM_SEP;	      break;	    default:	      attrs[i].is_sentence_end = TRUE;	      attrs[i].is_sentence_boundary = TRUE;	      MAYBE_START_NEW_SENTENCE;	      break;	    }	  break;	case STATE_SENTENCE_POST_TERM_CLOSE:	  /* End sentence on anything besides more punctuation; follow	   * rules for breaks	   */	  switch (type)	    {	    case G_UNICODE_OTHER_PUNCTUATION:	    case G_UNICODE_CLOSE_PUNCTUATION:	      if (type == G_UNICODE_CLOSE_PUNCTUATION ||		  wc == '.' ||		  wc == ',' ||		  wc == '?' 
||		  wc == '!')		/* continue in this state */		;	      else		{		  attrs[i].is_sentence_end = TRUE;		  attrs[i].is_sentence_boundary = TRUE;		  MAYBE_START_NEW_SENTENCE;		}	      break;	    case G_UNICODE_SPACE_SEPARATOR:	      attrs[i].is_sentence_end = TRUE;	      sentence_state = STATE_SENTENCE_POST_TERM_SPACE;	      break;	    case G_UNICODE_LINE_SEPARATOR:	    case G_UNICODE_PARAGRAPH_SEPARATOR:	      attrs[i].is_sentence_end = TRUE;	      /* undo the unconditional break-at-all-line/para-separators	       * from above; I'm not sure this is what the Unicode spec	       * intends, but it seems right - we get to include	       * a single line/para separator in the sentence according	       * to their rules	       */	      attrs[i].is_sentence_boundary = FALSE;	      sentence_state = STATE_SENTENCE_POST_TERM_SEP;	      break;	    default:	      attrs[i].is_sentence_end = TRUE;	      attrs[i].is_sentence_boundary = TRUE;	      MAYBE_START_NEW_SENTENCE;	      break;	    }	  break;	case STATE_SENTENCE_POST_TERM_SPACE:	  /* Sentence is definitely already ended; to enter this state	   * we had to see a space, which ends the sentence.	   */	  switch (type)	    {	    case G_UNICODE_SPACE_SEPARATOR:	      /* continue in this state */	      break;	    case G_UNICODE_LINE_SEPARATOR:	    case G_UNICODE_PARAGRAPH_SEPARATOR:	      /* undo the unconditional break-at-all-line/para-separators	       * from above; I'm not sure this is what the Unicode spec	       * intends, but it seems right	       */	      attrs[i].is_sentence_boundary = FALSE;	      sentence_state = STATE_SENTENCE_POST_TERM_SEP;	      break;	    default:	      attrs[i].is_sentence_boundary = TRUE;	      MAYBE_START_NEW_SENTENCE;	      break;	    }	  break;	case STATE_SENTENCE_POST_TERM_SEP:	  /* Break is forced at this point, unless we're a newline	   * after a CR, then we will break after the newline on the	   * next iteration. Only a single Sep can be in the	   * sentence.	   
*/	  if (!(prev_wc == '\r' && wc == '\n'))	    attrs[i].is_sentence_boundary = TRUE;	  MAYBE_START_NEW_SENTENCE;	  break;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -