📄 reader.c

📁 harvest是一个下载HTML网页的机器人
💻 C
字号:
#include <string.h>#include <stdio.h>#include "catdoc.h"static unsigned short int buffer[PARAGRAPH_BUFFER];static unsigned char read_buf[256];static int buf_is_unicode;/**************************************************************************//* Just prints out content of input file. Called when file is not OLE     *//* stream                                                                 *//**************************************************************************/void copy_out (FILE *f,char *header) {    char *buf=(char *)buffer;    int count,i;    long offset;    if (get_unicode_char == get_word8_char) {	/* non-word file and -u specified. Trying to guess which kind of	 * unicode is used	 */	if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {	     get_unicode_char = get_utf16msb;	     fputs(convert_char(header[2]<<8|header[3]),stdout); 	     fputs(convert_char(header[4]<<8|header[5]),stdout); 	     fputs(convert_char(header[6]<<8|header[7]),stdout); 	} else if ((unsigned char)header[0]!=0xFF ||		    (unsigned char)header[1]!=0xFE) {	    int c,j,d;	    /* if it is not utf16, assume it is UTF8. 
We are told -u,	     * aren't we */	    get_unicode_char = get_utf8;	    i=0;	    while (i<8) {		c=(unsigned char)header[i++];				if (c >=0x80) {		    if ( c<0xE0) {			c=(c & 0x1F);			count =1;		    } else {			c=(c & 0xF);			count = 2;		    }		    for (j=0;j<count;j++) {			if (i<7) {			    d=(unsigned char) header[i++];			} else {			    d=fgetc(f);			}			c=c<<6 | (d & 0x3F);		    }		}		fputs (convert_char(c),stdout);	    }	} else {	     get_unicode_char = get_utf16lsb;	     fputs(convert_char(header[3]<<8|header[2]),stdout); 	     fputs(convert_char(header[5]<<8|header[4]),stdout); 	     fputs(convert_char(header[7]<<8|header[6]),stdout);         }	            while (!feof(f)) {	    i=get_unicode_char(f,&offset); 	    if (i!=EOF) fputs(convert_char(i),stdout);	}        } else {	/* Assuming 8-bit input text */	while ((count = fread(buf,1,PARAGRAPH_BUFFER,f))) {	    for (i=0;i<count;i++) {		fputs(convert_char(to_unicode(source_charset,			(unsigned char)buf[i])),stdout);	    }		       	}    } } /**************************************************************************//*  process_file - main process engine. 
Reads word file using function,   *//*  pointed by get_unicode_char, searches for things which looks like     *//*  paragraphs and print them out                                         *//**************************************************************************/void process_file(FILE *f,long stop) {    int bufptr;    int tabmode=0;    long offset=0;    unsigned short c;    /* Now we are starting to read with get_unicode_char */    while (!feof(f)&&offset<stop) {        bufptr = -1;        do {             c=get_unicode_char(f,&offset);	    /* Following symbols below 32 are allowed inside paragraph:               0x0002 - footnote mark	       0x0007 - table separator (converted to tabmode)               0x0009 - Horizontal tab ( printed as is)	       0x000B - hard return               0x000D - return - marks an end of paragraph               0x001E - IS2 for some reason means short defis in Word.	       0x001F - soft hyphen in Word               0x000B - means same as return             */	     if (tabmode) {		 tabmode=0;	         if (c==0x007) {		    buffer[++bufptr]=0x1E;		    continue;		 } else {		    buffer[++bufptr]=0x1C;		 }  	     }   	              if (c<32) {                switch (c) {		    case 0x007:			tabmode = 1;                        break;		    case 0x000D:                    case 0x000B:                        buffer[++bufptr]=0x000A;                        break;                    case 0x001E:                        buffer[++bufptr]='-';                        break;                    case 0x0002: break;		    case 0x001F:		        buffer[++bufptr]=0xAD;/* translate to Unicode			                         soft hyphen */                        break;						                      case 0x0009:                        buffer[++bufptr]=c;                        break;                    default:                        bufptr=-1; /* Any other control char - discard para*/                }	    } else if (c >=0xfeff) {                bufptr = -1;/* not a 
valid unicode - discard paragraph */            } else {                buffer[++bufptr]=c;            }        } while (bufptr<PARAGRAPH_BUFFER-2&&!feof(f) && buffer[bufptr]!=0x000a);	if (bufptr>0) {	   buffer[++bufptr]=0;           output_paragraph(buffer);        }    }}int get_word8_char(FILE *f,long *offset) {  int count,i,u;  char c;  if ((i=(*offset)%256) ==0) {     count=fread(read_buf,1,256,f);     memset(read_buf+count,0,256-count);     buf_is_unicode=0;     while (i<count) {       c=read_buf[i++];       if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {	   buf_is_unicode=1;	   break;       }       i++;     }        i=0;  }      if (buf_is_unicode) {      u=read_buf[i] | read_buf[i+1]<<8;      (*offset)+=2;  } else {      u=to_unicode(source_charset,read_buf[i]);      (*offset)++;  }  return u;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -