📄 ubrk.h
字号:
/** Copyright (C) 1996-2004, International Business Machines Corporation and others. All Rights Reserved.******************************************************************************************/#ifndef UBRK_H#define UBRK_H#include "unicode/utypes.h"#include "unicode/uloc.h"/** * A text-break iterator. * For usage in C programs. */#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR# define UBRK_TYPEDEF_UBREAK_ITERATOR /** * Opaque type representing an ICU Break iterator object. * @stable ICU 2.0 */ typedef void UBreakIterator;#endif#if !UCONFIG_NO_BREAK_ITERATION#include "unicode/parseerr.h"/** * \file * \brief C API: BreakIterator * * <h2> BreakIterator C API </h2> * * The BreakIterator C API defines methods for finding the location * of boundaries in text. Pointer to a UBreakIterator maintain a * current position and scan over text returning the index of characters * where boundaries occur. * <P> * Line boundary analysis determines where a text string can be broken * when line-wrapping. The mechanism correctly handles punctuation and * hyphenated words. * <P> * Sentence boundary analysis allows selection with correct * interpretation of periods within numbers and abbreviations, and * trailing punctuation marks such as quotation marks and parentheses. * <P> * Word boundary analysis is used by search and replace functions, as * well as within text editing applications that allow the user to * select words with a double click. Word selection provides correct * interpretation of punctuation marks within and following * words. Characters that are not part of a word, such as symbols or * punctuation marks, have word-breaks on both sides. * <P> * Character boundary analysis allows users to interact with * characters as they expect to, for example, when moving the cursor * through a text string. Character boundary analysis provides correct * navigation of through character strings, regardless of how the * character is stored. For example, an accented character might be * stored as a base character and a diacritical mark. What users * consider to be a character can differ between languages. * <P> * Title boundary analysis locates all positions, * typically starts of words, that should be set to Title Case * when title casing the text. * <P> * * This is the interface for all text boundaries. * <P> * Examples: * <P> * Helper function to output text * <pre> * \code * void printTextRange(UChar* str, int32_t start, int32_t end ) { * UChar* result; * UChar* temp; * const char* res; * temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1)); * result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1)); * u_strcpy(temp, &str[start]); * u_strncpy(result, temp, end-start); * res=(char*)malloc(sizeof(char) * (u_strlen(result)+1)); * u_austrcpy(res, result); * printf("%s\n", res); * } * \endcode * </pre> * Print each element in order: * <pre> * \code * void printEachForward( UBreakIterator* boundary, UChar* str) { * int32_t end; * int32_t start = ubrk_first(boundary); * for (end = ubrk_next(boundary)); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) { * printTextRange(str, start, end ); * } * } * \endcode * </pre> * Print each element in reverse order: * <pre> * \code * void printEachBackward( UBreakIterator* boundary, UChar* str) { * int32_t start; * int32_t end = ubrk_last(boundary); * for (start = ubrk_previous(boundary); start != UBRK_DONE; end = start, start =ubrk_previous(boundary)) { * printTextRange( str, start, end ); * } * } * \endcode * </pre> * Print first element * <pre> * \code * void printFirst(UBreakIterator* boundary, UChar* str) { * int32_t end; * int32_t start = ubrk_first(boundary); * end = ubrk_next(boundary); * printTextRange( str, start, end ); * } * \endcode * </pre> * Print last element * <pre> * \code * void printLast(UBreakIterator* boundary, UChar* str) { * int32_t start; * int32_t end = ubrk_last(boundary); * start = ubrk_previous(boundary); * printTextRange(str, start, end ); * } * \endcode * </pre> * Print the element at a specified position * <pre> * \code * void printAt(UBreakIterator* boundary, int32_t pos , UChar* str) { * int32_t start; * int32_t end = ubrk_following(boundary, pos); * start = ubrk_previous(boundary); * printTextRange(str, start, end ); * } * \endcode * </pre> * Creating and using text boundaries * <pre> * \code * void BreakIterator_Example( void ) { * UBreakIterator* boundary; * UChar *stringToExamine; * stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) ); * u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff."); * printf("Examining: "Aaa bbb ccc. Ddd eee fff."); * * //print each sentence in forward and reverse order * boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status); * printf("----- forward: -----------\n"); * printEachForward(boundary, stringToExamine); * printf("----- backward: ----------\n"); * printEachBackward(boundary, stringToExamine); * ubrk_close(boundary); * * //print each word in order * boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status); * printf("----- forward: -----------\n"); * printEachForward(boundary, stringToExamine); * printf("----- backward: ----------\n"); * printEachBackward(boundary, stringToExamine); * //print first element * printf("----- first: -------------\n"); * printFirst(boundary, stringToExamine); * //print last element * printf("----- last: --------------\n"); * printLast(boundary, stringToExamine); * //print word at charpos 10 * printf("----- at pos 10: ---------\n"); * printAt(boundary, 10 , stringToExamine); * * ubrk_close(boundary); * } * \endcode * </pre> *//** The possible types of text boundaries. @stable ICU 2.0 */typedef enum UBreakIteratorType { /** Character breaks @stable ICU 2.0 */ UBRK_CHARACTER, /** Word breaks @stable ICU 2.0 */ UBRK_WORD, /** Line breaks @stable ICU 2.0 */ UBRK_LINE, /** Sentence breaks @stable ICU 2.0 */ UBRK_SENTENCE,#ifndef U_HIDE_DEPRECATED_API /** * Title Case breaks * The iterator created using this type locates title boundaries as described for * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration, * please use Word Boundary iterator. * * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later. */ UBRK_TITLE#endif /* U_HIDE_DEPRECATED_API */} UBreakIteratorType;/** Value indicating all text boundaries have been returned. * @stable ICU 2.0 */#define UBRK_DONE ((int32_t) -1)/** * Enum constants for the word break tags returned by * getRuleStatus(). A range of values is defined for each category of * word, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * @stable ICU 2.2*/typedef enum UWordBreak { /** Tag value for "words" that do not fit into any of other categories. * Includes spaces and most punctuation. */ UBRK_WORD_NONE = 0, /** Upper bound for tags for uncategorized words. */ UBRK_WORD_NONE_LIMIT = 100, /** Tag value for words that appear to be numbers, lower limit. */ UBRK_WORD_NUMBER = 100, /** Tag value for words that appear to be numbers, upper limit. */ UBRK_WORD_NUMBER_LIMIT = 200, /** Tag value for words that contain letters, excluding * hiragana, katakana or ideographic characters, lower limit. */ UBRK_WORD_LETTER = 200, /** Tag value for words containing letters, upper limit */ UBRK_WORD_LETTER_LIMIT = 300, /** Tag value for words containing kana characters, lower limit */ UBRK_WORD_KANA = 300, /** Tag value for words containing kana characters, upper limit */ UBRK_WORD_KANA_LIMIT = 400, /** Tag value for words containing ideographic characters, lower limit */ UBRK_WORD_IDEO = 400, /** Tag value for words containing ideographic characters, upper limit */ UBRK_WORD_IDEO_LIMIT = 500} UWordBreak;/** * Enum constants for the line break tags returned by getRuleStatus(). * A range of values is defined for each category of * word, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * @draft ICU 2.8*/typedef enum ULineBreakTag { /** Tag value for soft line breaks, positions at which a line break * is acceptable but not required */ UBRK_LINE_SOFT = 0, /** Upper bound for soft line breaks. */ UBRK_LINE_SOFT_LIMIT = 100, /** Tag value for a hard, or mandatory line break */ UBRK_LINE_HARD = 100, /** Upper bound for hard line breaks. */ UBRK_LINE_HARD_LIMIT = 200} ULineBreakTag;/** * Enum constants for the sentence break tags returned by getRuleStatus(). * A range of values is defined for each category of * sentence, to allow for further subdivisions of a category in future releases. * Applications should check for tag values falling within the range, rather * than for single individual values. * @draft ICU 2.8*/typedef enum USentenceBreakTag { /** Tag value for for sentences ending with a sentence terminator * ('.', '?', '!', etc.) character, possibly followed by a
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -