📄 charsets.c

📁 harvest是一个下载html网页得机器人
💻 C
字号:
#include <stdio.h>#include <string.h>#include <stdlib.h>#include "catdoc.h"char *charset_path=CHARSETPATH;char *source_csname=SOURCE_CHARSET, *dest_csname=TARGET_CHARSET;short int * source_charset;int unknown_as_hex=0;char bad_char[]={UNKNOWN_CHAR,0};CHARSET target_charset;/************************************************************************//* Converts char in input charset into unicode representation           *//* Should be converted to macro                                         *//************************************************************************/int to_unicode (short int *charset, int c) {  return charset[c];}/************************************************************************//* Search inverse charset record for given unicode char and returns     *//* 0-255 char value if found, -1 otherwise                              *//************************************************************************/int from_unicode (CHARSET charset, int u) {   short int *p;   if ((p=charset[(unsigned)u>>8])) {     return p[u & 0xff];   } else {     return -1;   }}/************************************************************************//*  Converts direct (charset -> unicode) to reverse map                 *//************************************************************************/CHARSET make_reverse_map(short int *charset) {   CHARSET newmap=calloc(sizeof(short int *), 256);   int i,j,k,l;   short int *p;      for (i=0;i<256;i++) {     k= charset[i];     j=  (unsigned)k>>8;     if (!newmap[j]) {       newmap[j] = malloc(sizeof(short int *)*256);       if (!newmap[j]) {           fprintf(stderr,"Insufficient memory for  charset\n");           exit(1);       }       for (l=0,p=newmap[j];l<256;l++,p++) *p=-1;     }     p=newmap[j];     p[k & 0xff]=i;   }   return newmap;}/************************************************************************//* Reads charset file (as got from ftp.unicode.org) and returns array of*//* 256 short ints (malloced) mapping from charset t unicode             *//************************************************************************/short int * read_charset(char *filename) {  char *path;  FILE *f;  short int *new=calloc(sizeof(short int),256);  int c;  long int uc;  path= find_file(stradd(filename,CHARSET_EXT),charset_path);  if (!path) {     fprintf(stderr,"Cannot load charset %s - file not found\n",filename);     return NULL;  }  f=fopen(path,"rb");  if (!f) {    perror(path);     return NULL;  }  if (input_buffer)     setvbuf(f,input_buffer,_IOFBF,FILE_BUFFER);  /* defaults */  for (c=0;c<32;c++) {    new[c]=c;  }  while (!feof(f)) {     if (fscanf(f,"%i %li",&c,&uc)==2) {       if (c<0||c>255||uc<0||(uc>0xFEFE&& uc!=0xFFFE)) {	  fprintf(stderr,"Invalid charset file %s\n",path);          fclose(f);          return NULL;       }       new[c]=uc;     }       while((fgetc(f)!='\n')&&!feof(f)) ;  }  fclose (f);  free(path);  return new;}/************************************************************************//* Reads 8-bit char and convers it from source charset                  *//************************************************************************/int get_8bit_char (FILE *f,long *offset){ int c = fgetc(f);  (*offset)++;    if (c==EOF) return c;   else return to_unicode(source_charset,c);}/************************************************************************//* Reads 16-bit unicode value. MS-Word runs on LSB-first machine only,  *//* so read lsb first always and don't care about proper bit order       *//************************************************************************/int get_utf16lsb (FILE *f,long *offset) {  int d,c = fgetc(f);  if (c == EOF) return EOF;  if ((d=fgetc(f))==EOF) return EOF;  c |= (d<<8);  (*offset)+=2;  if (c==EOF) return (int)0xFEFF;  return c;}  /************************************************************************//* Reads 16-bit unicode value written in MSB order. For processing  * non-word files            .                                          *//************************************************************************/int get_utf16msb (FILE *f,long *offset) {  int d,c = fgetc(f);  if (c == EOF) return EOF;  if ((d=fgetc(f))==EOF) return EOF;  c |= (d<<8);  (*offset)+=2;  if (c==EOF) return (int)0xFEFF;  return c;}int get_utf8 (FILE *f,long *offset) {   int c,d;   d=0;   if ((c=fgetc(f))==EOF) return EOF;   if (c<0x80)       return c;   if (c <0xC0)       return 0xfeff; /*skip corrupted sequebces*/   if (c <0xE0) {     return ((c & 0x1F)<<6 | (fgetc(f) & 0x3F));   }   if (c <0xF0) {     return ((c & 0x0F)<<12)|((fgetc(f) & 0x3f)<<6)|(fgetc(f) & 0x3f);   }     return 0xFEFF; }/**************************************************************************//*  Converts unicode char to output charset sequence. Coversion have      *//*  three steps: 1. Replacement map is searched for the character in case *//* it is not allowed for output format (% in TeX, < in HTML               *//* 2. target charset is searched for this unicode char, if it wasn't      *//*  replaced. If not found, then 3. Substitution map is searched          *//**************************************************************************/char *convert_char(int uc) {    static char plain_char[]="a"; /*placeholder for one-char sequences */    static char hexbuf[8];    char *mapped;    int c;    if ((mapped=map_subst(spec_chars,uc))) return mapped;    if (target_charset) { 	    c =from_unicode(target_charset,uc);	if (c>=0) {	    *plain_char=c;	    return plain_char;	}	if ((mapped = map_subst(replacements,uc))) return mapped;	if (unknown_as_hex) {	   sprintf(hexbuf,"\\x%04X",(unsigned)uc);	   /* This sprintf is safe, becouse uc is unicode character code,	      which cannot be greater than 0xFFFE. It is ensured by routines	      in reader.c	    */	   return hexbuf;	}   	return  bad_char;    } else {      /* NULL target charset means UTF-8 output */      return to_utf8(uc);    }  }char *to_utf8(unsigned int uc) {    static char utfbuffer[4]; /* it shouldn't overflow becouse we never deal			  with chars greater than 65535*/    int count=0;    if (uc< 0x80) {      utfbuffer[0]=uc;      count=1;    } else  {	if (uc < 0x800) {	  utfbuffer[count++]=0xC0 | (uc >> 6);	} else {	  utfbuffer[count++]=0xE0 | (uc >>12);	  utfbuffer[count++]=0x80 | ((uc >>6) &0x3F);	}	            utfbuffer[count++]=0x80 | (uc & 0x3F);    }      utfbuffer[count]=0;    return utfbuffer;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -