📄 nfkc.java

📁 java 的gnu包
💻 JAVA
字号:
/** * Copyright (C) 2004, 2005, 2006, 2007  Free Software Foundation, Inc. * * Author: Oliver Hitz * * This file is part of GNU Libidn. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 * USA */package gnu.inet.encoding;public class NFKC{  /**   * Applies NFKC normalization to a string.   *   * @param in The string to normalize.   * @return An NFKC normalized string.   */  public static String normalizeNFKC(String in)  {    StringBuffer out = new StringBuffer();    for (int i = 0; i < in.length(); i++) {      char code = in.charAt(i);      // In Unicode 3.0, Hangul was defined as the block from U+AC00      // to U+D7A3, however, since Unicode 3.2 the block extends until      // U+D7AF. The decomposeHangul function only decomposes until      // U+D7A3. Should this be changed?      if (code >= 0xAC00 && code <= 0xD7AF) {	out.append(decomposeHangul(code));      } else {	int index = decomposeIndex(code);	if (index == -1) {	  out.append(code);	} else {	  out.append(DecompositionMappings.m[index]);	}      }    }    // Bring the stringbuffer into canonical order.    canonicalOrdering(out);    // Do the canonical composition.    int last_cc = 0;    int last_start = 0;    for (int i = 0; i < out.length(); i++) {      int cc = combiningClass(out.charAt(i));      if (i > 0 && (last_cc == 0 || last_cc != cc)) { 	// Try to combine characters 	char a = out.charAt(last_start); 	char b = out.charAt(i); 	int c = compose(a, b); 	if (c != -1) { 	  out.setCharAt(last_start, (char) c); 	  out.deleteCharAt(i); 	  i--;	  if (i == last_start) {	    last_cc = 0;	  } else {	    last_cc = combiningClass(out.charAt(i-1));	  }	  continue;	}      }      if (cc == 0) { 	last_start = i;      }      last_cc = cc;    }    return out.toString();  }  /**   * Returns the index inside the decomposition table, implemented   * using a binary search.   *   * @param c Character to look up.   * @return Index if found, -1 otherwise.   */  static int decomposeIndex(char c)  {    int start = 0;    int end = DecompositionKeys.k.length/2;    while (true) {      int half = (start + end) / 2;      int code = DecompositionKeys.k[half*2];      if (c == code) {	return DecompositionKeys.k[half*2 + 1];      }      if (half == start) {	// Character not found	return -1;      } else if (c > code) {	start = half;      } else {	end = half;      }    }  }  /**   * Returns the combining class of a given character.   *   * @param c The character.   * @return The combining class.   */  static int combiningClass(char c)  {    int h = c >> 8;    int l = c & 0xff;    int i = CombiningClass.i[h];    if (i > -1) {      return CombiningClass.c[i][l];    } else {      return 0;    }  }  /**   * Rearranges characters in a stringbuffer in order to respect the   * canonical ordering properties.   *   * @param The StringBuffer to rearrange.   */  static void canonicalOrdering(StringBuffer in)  {    boolean isOrdered = false;    while (!isOrdered) {      isOrdered = true;      int lastCC = combiningClass(in.charAt(0));      for (int i = 0; i < in.length()-1; i++) {	int nextCC = combiningClass(in.charAt(i+1));	if (nextCC != 0 && lastCC > nextCC) {	  for (int j = i+1; j > 0; j--) {	    if (combiningClass(in.charAt(j-1)) <= nextCC) {	      break;	    }	    char t = in.charAt(j);	    in.setCharAt(j, in.charAt(j-1));	    in.setCharAt(j-1, t);	    isOrdered = false;	  }	  nextCC = lastCC;	}	lastCC = nextCC;      }    }  }  /**   * Returns the index inside the composition table.   *   * @param a Character to look up.   * @return Index if found, -1 otherwise.   */  static int composeIndex(char a)  {    if (a>>8 >= Composition.composePage.length) {      return -1;    }    int ap = Composition.composePage[a>>8];    if (ap == -1) {      return -1;    }    return Composition.composeData[ap][a & 0xff];  }  /**   * Tries to compose two characters canonically.   *   * @param a First character.   * @param b Second character.   * @return The composed character or -1 if no composition could be   * found.   */  static int compose(char a, char b)  {    int h = composeHangul(a, b);    if (h != -1) {      return h;    }    int ai = composeIndex(a);    if (ai >= Composition.singleFirstStart && ai < Composition.singleSecondStart) {      if (b == Composition.singleFirst[ai - Composition.singleFirstStart][0]) {	return Composition.singleFirst[ai - Composition.singleFirstStart][1];      } else {	return -1;      }    }    int bi = composeIndex(b);    if (bi >= Composition.singleSecondStart) {      if (a == Composition.singleSecond[bi - Composition.singleSecondStart][0]) {	return Composition.singleSecond[bi - Composition.singleSecondStart][1];      } else {	return -1;      }    }    if (ai >= 0 && ai < Composition.multiSecondStart &&	bi >= Composition.multiSecondStart && bi < Composition.singleFirstStart) {      char[] f = Composition.multiFirst[ai];      if (bi - Composition.multiSecondStart < f.length) {	char r = f[bi - Composition.multiSecondStart];	if (r == 0) {	  return -1;	} else {	  return r;	}      }    }    return -1;  }  /**   * Entire hangul code copied from:   * http://www.unicode.org/unicode/reports/tr15/   *   * Several hangul specific constants   */  static final int SBase = 0xAC00;  static final int LBase = 0x1100;  static final int VBase = 0x1161;  static final int TBase = 0x11A7;  static final int LCount = 19;  static final int VCount = 21;  static final int TCount = 28;  static final int NCount = VCount * TCount;  static final int SCount = LCount * NCount;  /**   * Decomposes a hangul character.   *   * @param s A character to decompose.   * @return A string containing the hangul decomposition of the input   * character. If no hangul decomposition can be found, a string   * containing the character itself is returned.   */  static String decomposeHangul(char s)  {    int SIndex = s - SBase;    if (SIndex < 0 || SIndex >= SCount) {      return String.valueOf(s);    }    StringBuffer result = new StringBuffer();    int L = LBase + SIndex / NCount;    int V = VBase + (SIndex % NCount) / TCount;    int T = TBase + SIndex % TCount;    result.append((char)L);    result.append((char)V);    if (T != TBase) result.append((char)T);    return result.toString();  }  /**   * Composes two hangul characters.   *   * @param a First character.   * @param b Second character.   * @return Returns the composed character or -1 if the two   * characters cannot be composed.   */  static int composeHangul(char a, char b)  {    // 1. check to see if two current characters are L and V    int LIndex = a - LBase;    if (0 <= LIndex && LIndex < LCount) {      int VIndex = b - VBase;      if (0 <= VIndex && VIndex < VCount) {	// make syllable of form LV	return SBase + (LIndex * VCount + VIndex) * TCount;      }    }        // 2. check to see if two current characters are LV and T    int SIndex = a - SBase;    if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0) {      int TIndex = b - TBase;      if (0 <= TIndex && TIndex <= TCount) {	// make syllable of form LVT	return a+TIndex;      }    }    return -1;  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -