charset.c

来自「tcpmp.src.0.72RC1 优秀的多媒体播放器TCPMP的源代码」· C语言代码 · 共 532 行
532 行
/* * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org> *  * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. *  * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. *  * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA *//* * See the corresponding header file for a description of the functions * that this file provides. * * This was first written for Ogg Vorbis but could be of general use. * * The only deliberate assumption about data sizes is that a short has * at least 16 bits, but this code has only been tested on systems with * 8-bit char, 16-bit short and 32-bit int. */#ifdef HAVE_CONFIG_H#include <config.h>#endif#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */#include <stdlib.h>#include "charset.h"#include "charmaps.h"/* * This is like the standard strcasecmp, but it does not depend * on the locale. Locale-dependent functions can be dangerous: * we once had a bug involving strcasecmp("iso", "ISO") in a * Turkish locale! * * (I'm not really sure what the official standard says * about the sign of strcasecmp("Z", "["), but usually * we're only interested in whether it's zero.) */static int ascii_strcasecmp(const char *s1, const char *s2){  char c1, c2;  for (;; s1++, s2++) {    if (!*s1 || !*s1)      break;    if (*s1 == *s2)      continue;    c1 = *s1;    if ('a' <= c1 && c1 <= 'z')      c1 += 'A' - 'a';    c2 = *s2;    if ('a' <= c2 && c2 <= 'z')      c2 += 'A' - 'a';    if (c1 != c2)      break;  }  return (unsigned char)*s1 - (unsigned char)*s2;}/* * UTF-8 equivalents of the C library's wctomb() and mbtowc(). */int utf8_mbtowc(int *pwc, const char *s, size_t n){  unsigned char c;  int wc, i, k;  if (!n || !s)    return 0;  c = *s;  if (c < 0x80) {    if (pwc)      *pwc = c;    return c ? 1 : 0;  }  else if (c < 0xc2)    return -1;  else if (c < 0xe0) {    if (n >= 2 && (s[1] & 0xc0) == 0x80) {      if (pwc)	*pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);      return 2;    }    else      return -1;  }  else if (c < 0xf0)    k = 3;  else if (c < 0xf8)    k = 4;  else if (c < 0xfc)    k = 5;  else if (c < 0xfe)    k = 6;  else    return -1;  if (n < (size_t)k)    return -1;  wc = *s++ & ((1 << (7 - k)) - 1);  for (i = 1; i < k; i++) {    if ((*s & 0xc0) != 0x80)      return -1;    wc = (wc << 6) | (*s++ & 0x3f);  }  if (wc < (1 << (5 * k - 4)))    return -1;  if (pwc)    *pwc = wc;  return k;}int utf8_wctomb(char *s, int wc1){  unsigned int wc = wc1;  if (!s)    return 0;  if (wc < (1u << 7)) {    *s++ = wc;    return 1;  }  else if (wc < (1u << 11)) {    *s++ = 0xc0 | (wc >> 6);    *s++ = 0x80 | (wc & 0x3f);    return 2;  }  else if (wc < (1u << 16)) {    *s++ = 0xe0 | (wc >> 12);    *s++ = 0x80 | ((wc >> 6) & 0x3f);    *s++ = 0x80 | (wc & 0x3f);    return 3;  }  else if (wc < (1u << 21)) {    *s++ = 0xf0 | (wc >> 18);    *s++ = 0x80 | ((wc >> 12) & 0x3f);    *s++ = 0x80 | ((wc >> 6) & 0x3f);    *s++ = 0x80 | (wc & 0x3f);    return 4;  }  else if (wc < (1u << 26)) {    *s++ = 0xf8 | (wc >> 24);    *s++ = 0x80 | ((wc >> 18) & 0x3f);    *s++ = 0x80 | ((wc >> 12) & 0x3f);    *s++ = 0x80 | ((wc >> 6) & 0x3f);    *s++ = 0x80 | (wc & 0x3f);    return 5;  }  else if (wc < (1u << 31)) {    *s++ = 0xfc | (wc >> 30);    *s++ = 0x80 | ((wc >> 24) & 0x3f);    *s++ = 0x80 | ((wc >> 18) & 0x3f);    *s++ = 0x80 | ((wc >> 12) & 0x3f);    *s++ = 0x80 | ((wc >> 6) & 0x3f);    *s++ = 0x80 | (wc & 0x3f);    return 6;  }  else    return -1;}/* * The charset "object" and methods. */struct charset {  int max;  int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);  int (*wctomb)(void *table, char *s, int wc);  void *map;};int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n){  return (*charset->mbtowc)(charset->map, pwc, s, n);}int charset_wctomb(struct charset *charset, char *s, int wc){  return (*charset->wctomb)(charset->map, s, wc);}int charset_max(struct charset *charset){  return charset->max;}/* * Implementation of UTF-8. */static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n){  (void)map;  return utf8_mbtowc(pwc, s, n);}static int wctomb_utf8(void *map, char *s, int wc){  (void)map;  return utf8_wctomb(s, wc);}/* * Implementation of US-ASCII. * Probably on most architectures this compiles to less than 256 bytes * of code, so we can save space by not having a table for this one. */static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n){  int wc;  (void)map;  if (!n || !s)    return 0;  wc = (unsigned char)*s;  if (wc & ~0x7f)    return -1;  if (pwc)    *pwc = wc;  return wc ? 1 : 0;}static int wctomb_ascii(void *map, char *s, int wc){  (void)map;  if (!s)    return 0;  if (wc & ~0x7f)    return -1;  *s = wc;  return 1;}/* * Implementation of ISO-8859-1. * Probably on most architectures this compiles to less than 256 bytes * of code, so we can save space by not having a table for this one. */static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n){  int wc;  (void)map;  if (!n || !s)    return 0;  wc = (unsigned char)*s;  if (wc & ~0xff)    return -1;  if (pwc)    *pwc = wc;  return wc ? 1 : 0;}static int wctomb_iso1(void *map, char *s, int wc){  (void)map;  if (!s)    return 0;  if (wc & ~0xff)    return -1;  *s = wc;  return 1;}/* * Implementation of any 8-bit charset. */struct map {  const unsigned short *from;  struct inverse_map *to;};static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n){  struct map *map = map1;  unsigned short wc;  if (!n || !s)    return 0;  wc = map->from[(unsigned char)*s];  if (wc == 0xffff)    return -1;  if (pwc)    *pwc = (int)wc;  return wc ? 1 : 0;}/* * For the inverse map we use a hash table, which has the advantages * of small constant memory requirement and simple memory allocation, * but the disadvantage of slow conversion in the worst case. * If you need real-time performance while letting a potentially * malicious user define their own map, then the method used in * linux/drivers/char/consolemap.c would be more appropriate. */struct inverse_map {  unsigned char first[256];  unsigned char next[256];};/* * The simple hash is good enough for this application. * Use the alternative trivial hashes for testing. */#define HASH(i) ((i) & 0xff)/* #define HASH(i) 0 *//* #define HASH(i) 99 */static struct inverse_map *make_inverse_map(const unsigned short *from){  struct inverse_map *to;  char used[256];  int i, j, k;  to = (struct inverse_map *)malloc(sizeof(struct inverse_map));  if (!to)    return 0;  for (i = 0; i < 256; i++)    to->first[i] = to->next[i] = used[i] = 0;  for (i = 255; i >= 0; i--)    if (from[i] != 0xffff) {      k = HASH(from[i]);      to->next[i] = to->first[k];      to->first[k] = i;      used[k] = 1;    }  /* Point the empty buckets at an empty list. */  for (i = 0; i < 256; i++)    if (!to->next[i])      break;  if (i < 256)    for (j = 0; j < 256; j++)      if (!used[j])	to->first[j] = i;  return to;}int wctomb_8bit(void *map1, char *s, int wc1){  struct map *map = map1;  unsigned short wc = wc1;  int i;  if (!s)    return 0;  if (wc1 & ~0xffff)    return -1;  if (1) /* Change 1 to 0 to test the case where malloc fails. */    if (!map->to)      map->to = make_inverse_map(map->from);  if (map->to) {    /* Use the inverse map. */    i = map->to->first[HASH(wc)];    for (;;) {      if (map->from[i] == wc) {	*s = i;	return 1;      }      if (!(i = map->to->next[i]))	break;    }  }  else {    /* We don't have an inverse map, so do a linear search. */    for (i = 0; i < 256; i++)      if (map->from[i] == wc) {	*s = i;	return 1;      }  }  return -1;}/* * The "constructor" charset_find(). */struct charset charset_utf8 = {  6,  &mbtowc_utf8,  &wctomb_utf8,  0};struct charset charset_iso1 = {  1,  &mbtowc_iso1,  &wctomb_iso1,  0};struct charset charset_ascii = {  1,  &mbtowc_ascii,  &wctomb_ascii,  0};struct charset *charset_find(const char *code){  int i;  /* Find good (MIME) name. */  for (i = 0; names[i].bad; i++)    if (!ascii_strcasecmp(code, names[i].bad)) {      code = names[i].good;      break;    }  /* Recognise some charsets for which we avoid using a table. */  if (!ascii_strcasecmp(code, "UTF-8"))    return &charset_utf8;  if (!ascii_strcasecmp(code, "US-ASCII"))    return &charset_ascii;  if (!ascii_strcasecmp(code, "ISO-8859-1"))    return &charset_iso1;  /* Look for a mapping for a simple 8-bit encoding. */  for (i = 0; maps[i].name; i++)    if (!ascii_strcasecmp(code, maps[i].name)) {      if (!maps[i].charset) {	maps[i].charset = (struct charset *)malloc(sizeof(struct charset));	if (maps[i].charset) {	  struct map *map = (struct map *)malloc(sizeof(struct map));	  if (!map) {	    free(maps[i].charset);	    maps[i].charset = 0;	  }	  else {	    maps[i].charset->max = 1;	    maps[i].charset->mbtowc = &mbtowc_8bit;	    maps[i].charset->wctomb = &wctomb_8bit;	    maps[i].charset->map = map;	    map->from = maps[i].map;	    map->to = 0; /* inverse mapping is created when required */	  }	}      }      return maps[i].charset;    }  return 0;}/* * Function to convert a buffer from one encoding to another. * Invalid bytes are replaced by '#', and characters that are * not available in the target encoding are replaced by '?'. * Each of TO and TOLEN may be zero, if the result is not needed. * The output buffer is null-terminated, so it is all right to * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0). */int charset_convert(const char *fromcode, const char *tocode,		    const char *from, size_t fromlen,		    char **to, size_t *tolen){  int ret = 0;  struct charset *charset1, *charset2;  char *tobuf, *p, *newbuf;  int i, j, wc;  charset1 = charset_find(fromcode);  charset2 = charset_find(tocode);  if (!charset1 || !charset2 )    return -1;  tobuf = (char *)malloc(fromlen * charset2->max + 1);  if (!tobuf)    return -2;  for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {    i = charset_mbtowc(charset1, &wc, from, fromlen);    if (!i)      i = 1;    else if (i == -1) {      i  = 1;      wc = '#';      ret = 2;    }    j = charset_wctomb(charset2, p, wc);    if (j == -1) {      if (!ret)	ret = 1;      j = charset_wctomb(charset2, p, '?');      if (j == -1)	j = 0;    }  }  if (tolen)    *tolen = p - tobuf;  *p++ = '\0';  if (to) {    newbuf = realloc(tobuf, p - tobuf);    *to = newbuf ? newbuf : tobuf;  }  else    free(tobuf);  return ret;}#endif /* USE_CHARSET_ICONV */
charset.c - 源码说明

本页面展示了「tcpmp.src.0.72RC1 优秀的多媒体播放器TCPMP的源代码」中的 charset.c 源码文件，采用 C语言编程语言编写，共 532 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与tcpmp相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?