📄 draft-ietf-idn-amc-ace-m-00.txt
字号:
/* strlen() would return, which is why it is called output_size */ /* rather than output_length). The uppercase_flags array must */ /* hold input_length boolean values, where nonzero means the */ /* corresponding Unicode character should be forced to uppercase */ /* after being decoded, and zero means it is caseless or should */ /* be forced to lowercase. Alternatively, uppercase_flags may */ /* be a null pointer, which is equivalent to all zeros. The */ /* letters a-z and A-Z are always encoded literally, regardless */ /* of the corresponding flags. The encoder always outputs */ /* lowercase base-32 characters except when nonzero values */ /* of uppercase_flags require otherwise, so the encoder is */ /* compatible with any of the case models. The return value */ /* may be any of the amc_ace_status values defined above; if */ /* not amc_ace_success, then output_size and output may contain */ /* garbage. On success, the encoder will never need to write an */ /* output_size greater than input_length*5+6, because of how the */ /* encoding is defined. */int amc_ace_m_decode( enum case_sensitivity case_sensitivity, unsigned char *scratch_space, const unsigned char *input, unsigned int *output_length, u_code_point *output, unsigned char *uppercase_flags ); /* amc_ace_m_decode() converts AMC-ACE-M to Unicode. The input */ /* must be represented as null-terminated ASCII, and the output */ /* will be represented as an array of Unicode code points. */ /* The case_sensitivity argument influences the check on the */ /* well-formedness of the input string; it must be case_sensitive */ /* if case-sensitive comparisons are allowed on encoded strings, */ /* case_insensitive otherwise (see also section "Case sensitivity */ /* models" of the AMC-ACE-M specification). The scratch_space */ /* must point to space at least as large as the input, which will */ /* get overwritten (this allows the decoder to avoid calling */ /* malloc()). The output_length is an in/out argument: the */ /* caller must pass in the maximum number of code points that */ /* may be output, and on successful return it will contain the */ /* actual number of code points output. The uppercase_flags */ /* array must have room for at least output_length values, or it */ /* may be a null pointer if the case information is not needed. */ /* A nonzero flag indicates that the corresponding Unicode */ /* character should be forced to uppercase by the caller, while */ /* zero means it is caseless or should be forced to lowercase. */ /* The letters a-z and A-Z are output already in the proper case, */ /* but their flags will be set appropriately so that applying the */ /* flags would be harmless. The return value may be any of the */ /* amc_ace_status values defined above; if not amc_ace_success, */ /* then output_length, output, and uppercase_flags may contain */ /* garbage. On success, the decoder will never need to write */ /* an output_length greater than the length of the input (not */ /* counting the null terminator), because of how the encoding is */ /* defined. *//**********************************************************//* Implementation (would normally go in its own .c file): */#include <string.h>/* Character utilities: *//* is_ldh(codept) returns 1 if the code point represents an LDH *//* character (ASCII letter, digit, or hyphen-minus), 0 otherwise. */static int is_ldh(u_code_point codept){ if (codept == 45) return 1; if (codept < 48) return 0; if (codept <= 57) return 1; if (codept < 65) return 0; if (codept <= 90) return 1; if (codept < 97) return 0; if (codept <= 122) return 1; return 0;}/* is_AtoZ(c) returns 1 if c is an *//* uppercase ASCII letter, zero otherwise. */static unsigned char is_AtoZ(unsigned char c){ return c >= 65 && c <= 90;}/* special_row_offset[n] holds the offset of the *//* bottom of special row 0xD8 + n, where n is in 0..7. */static u_code_point special_row_offset[] = { 0x0020, 0x005B, 0x007B, 0x00A0, 0x00C0, 0x00DF, 0x0134, 0x0270 };/* base32[n] is the lowercase base-32 character representing *//* the number n from the range 0 to 31. Note that we cannot *//* use string literals for ASCII characters because an ANSI C *//* compiler does not necessarily use ASCII. */static const unsigned char base32[] = { 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, /* a-k */ 109, 110, /* m-n */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, /* p-z */ 50, 51, 52, 53, 54, 55, 56, 57 /* 2-9 */};/* base32_decode(c) returns the value of a base-32 character, in the *//* range 0 to 31, or the constant base32_invalid if c is not a valid *//* base-32 character. */enum { base32_invalid = 32 };static unsigned int base32_decode(unsigned char c){ if (c < 50) return base32_invalid; if (c <= 57) return c - 26; if (c < 97) c += 32; if (c < 97 || c == 108 || c == 111 || c > 122) return base32_invalid; return c - 97 - (c > 108) - (c > 111);}/* unequal(case_sensitivity,a1,a2,n) returns 0 if the arrays *//* a1 and a2 are equal in the first n positions, 1 otherwise. *//* If case_sensitivity is case_insensitive, then ASCII A-Z are *//* considered equal to a-z respectively. */static int unequal( enum case_sensitivity case_sensitivity, const unsigned char *a1, const unsigned char *a2, unsigned int n ){ const unsigned char *end; unsigned char c1, c2; if (case_sensitivity != case_insensitive) return memcmp(a1,a2,n); for (end = a1 + n; a1 < end; ++a1, ++a2) { c1 = *a1; c2 = *a2; if (c1 >= 65 && c1 <= 90) c1 += 32; if (c2 >= 65 && c2 <= 90) c2 += 32; if (c1 != c2) return 1; } return 0;}/* Encoder: */int amc_ace_m_encode( unsigned int input_length, const u_code_point *input, const unsigned char *uppercase_flags, unsigned int *output_size, unsigned char *output ){ unsigned int literal, wide; /* boolean */ u_code_point codept, n, diff, morebits; u_code_point A, B, C, offsetA, offsetB, offsetC, offset; const u_code_point *input_end, *p, *pp; unsigned int count, max, next_in, next_out, max_out, codelen, i; unsigned char c; input_end = input + input_length; /* 1) Verify that only valid code points appear: */ for (p = input; p < input_end; ++p) { if (*p >> 11 == 0x1B || *p > 0x10FFFF) return amc_ace_invalid_input; } /* 2) Determine the most populous row: B and offsetB */ /* first check the special rows: */ B = 0xD8; offsetB = special_row_offset[0]; max = 0; for (n = 0; n < 8; ++n) { offset = special_row_offset[n]; count = 0; for (p = input; p < input_end; ++p) { if (*p - offset <= 0xFF && !is_ldh(*p)) ++count; } if (count > max) { B = 0xD8 + n; offsetB = offset; max = count; } } /* now check the regular rows: */ for (pp = input; pp < input_end; ++pp) { n = *pp >> 8; count = 0; for (p = input; p < input_end; ++p) { if (*p >> 8 == n && !is_ldh(*p)) ++count; } if (count > max || (count == max && n < B)) { B = n; offsetB = n << 8; max = count; } } /* 3) Determine the most populous 16-window: A and offsetA */ A = 0; max = 0; for (n = 0; n <= 0x1F; ++n) { offset = ((offsetB >> 3) + n) << 3; count = 0; for (p = input; p < input_end; ++p) { if (*p - offset <= 0xF && !is_ldh(*p)) ++count; } if (count > max) { A = n; offsetA = offset; max = count; } } /* 4) Determine the most populous 20k-window: C */ C = 0; max = 0; for (pp = input; pp < input_end; ++pp) { count = 0; n = *pp >> 11; offset = n << 11; for (p = input; p < input_end; ++p) { if (*p - offset <= 0x4FFF && !is_ldh(*p)) ++count; if (count > max || (count == max && n < C)) { C = n; max = count; } } } /* 5) Determine the style to use: wide or narrow */ /* if narrow style were used: */ offsetC = (offsetB >> 12) << 12; count = 3 + (B > 0xFF); for (p = input; p < input_end; ++p) { if (is_ldh(*p)) { } else if (*p - offsetA <= 0xF) count += 1; else if (*p - offsetB <= 0xFF) count += 2; else if (*p - offsetC <= 0xFFF) count += 3; else if (*p <= 0xFFFF) count += 4; else count += 5; } max = count; /* if wide style were used: */ offsetC = C << 11; count = B <= 0xFF && C <= 0x1F ? 3 : 5; for (p = input; p < input_end; ++p) { if (is_ldh(*p)) { } else if (*p - offsetB <= 0xFF) count += 2; else if (*p - offsetC <= 0x4FFF) count += 3; else if (*p <= 0xFFFF) count += 4; else count += 5; } wide = (count < max); /* 6) Initialize offsetC, and encode the style and offsets: */ max_out = *output_size; next_out = 0; if (wide) { offsetC = C << 11; if (B <= 0xFF && C <= 0x1F) { if (max_out - next_out < 3) return amc_ace_output_too_big; output[next_out++] = base32[0x10 | (B >> 5)]; output[next_out++] = base32[B & 0x1F]; output[next_out++] = base32[C]; } else { if (max_out - next_out < 5) return amc_ace_output_too_big; output[next_out++] = base32[0x18 | (B >> 10)]; output[next_out++] = base32[(B >> 5) & 0x1F]; output[next_out++] = base32[B & 0x1F]; output[next_out++] = base32[C >> 5]; output[next_out++] = base32[C & 0x1F]; } } else { offsetC = (offsetB >> 12) << 12; if (B <= 0xFF) { if (max_out - next_out < 3) return amc_ace_output_too_big; output[next_out++] = base32[B >> 5]; output[next_out++] = base32[B & 0x1F]; } else { if (max_out - next_out < 4) return amc_ace_output_too_big; output[next_out++] = base32[8 | (B >> 10)]; output[next_out++] = base32[(B >> 5) & 0x1F]; output[next_out++] = base32[B & 0x1F]; } output[next_out++] = base32[A]; } /* 7) Main encoding loop: */ literal = 0; for (next_in = 0; next_in < input_length; ++next_in) { codept = input[next_in]; if (codept == 45 /* hyphen-minus */) { /* case 7.1 */ if (max_out - next_out < 2) return amc_ace_output_too_big; output[next_out++] = 45; output[next_out++] = 45; continue; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -