⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rtfparser.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/*RTF2HTML.c, Chuck Shotton - 6/21/93 *//************************************************************************ * This program takes a stab at converting RTF (Rich Text Format) files * into HTML. There are some limitations that keep RTF from being able to * easily represent things like in-line images and anchors as styles. In * particular, RTF styles apply to entire "paragraphs", so anchors or * images in the middle of a text stream can't easily be represented by * styles. The intent is to ultimately use something like embedded text * color changes to represent these constructs.  *  * In the meantime, you can take existing Word documents, apply the * correct style sheet, and convert them to HTML with this tool. * * AUTHOR: Chuck Shotton, UT-Houston Academic Computing, *         cshotton@oac.hsc.uth.tmc.edu *          *         Dmitry Potapov, CapitalSoft *         dpotapov@capitalsoft.com * * USAGE: rtf2html [rtf_filename]  * * BEHAVIOR: *        rtf2html will open the specified RTF input file or read from *        standard input, writing converted HTML to standard output. * * NOTES: *        The RTF document must be formatted with a style sheet that has *        style numberings that conform to the style_mappings table *        defined in this source file. * * MODIFICATIONS: *         6/21/93 : Chuck Shotton - created version 1.0. 
*        11/26/98 : Dmitry Potapov - version 1.1 beta * ************************************************************************//* Note, the source is formated with 4 character tabs */#include <stdlib.h>#include <stdio.h>#include <string.h>#include <ctype.h>#include <catdoc.h> #ifdef _MSC_VER#	define	strcasecmp _stricmp#endif#ifndef TRUE#define TRUE -1#define FALSE 0#endif#define MAX_LEVELS 40	/*defines the # of nested in-line styles (pairs of {})*/#define MAX_RTF_TOKEN 40#define MAX_INLINE_STYLES 5 /*defines # of in-line styles, bold, italic, etc.*/typedef struct tag_StyleState{	unsigned char s: MAX_INLINE_STYLES;} TStyleState;typedef enum { s_plain, s_bold, s_italic, s_underline, s_hidden, /*in-line styles*/	s_para,	s_br,	  /*pseudo style*/	s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6 /*heading styles*/} StyleState;char *styles[][2] = {		/*HTML Start and end tags for styles*/	{"", ""},	{"<b>", "</b>"},	{"<i>", "</i>"},	{"<u>", "</u>"},	{"<!-- ", " -->"},	{"<p>\n", ""},	{"<br>\n",""},	{"", ""},	{"<h1>", "</h1>"},	{"<h2>", "</h2>"},	{"<h3>", "</h3>"},	{"<h4>", "</h4>"},	{"<h5>", "</h5>"},	{"<h6>", "</h6>"}};/* style_mappings maps the style numbers in a RTF style sheet into one of the*//* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*//* Additional styles for lists, etc. should be added here. Style info        *//* ultimately should be read from some sort of config file into these tables.*/#define MAX_NAME_LEN 40char style_name[MAX_NAME_LEN];#define STYLE_NUMBER 7char *style_namings[STYLE_NUMBER] = {	"", "heading 1", "heading 2", "heading 3", "heading 4", "heading 5",	"heading 6"};char style_mappings[STYLE_NUMBER][MAX_RTF_TOKEN];char style_number[MAX_RTF_TOKEN];/* RTF tokens that mean something to the parser. All others are ignored. 
*/typedef enum {	t_start,	t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_ul, t_ulw, 	t_uld, t_uldb, t_i, t_v, t_plain, t_par, t_pict, t_tab, t_bullet, 	t_cell, t_row, t_line, t_endash, t_emdash, 	t_end} TokenIndex;char *tokens[] = {	"###", 	"fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "ulw", 	"uld", "uldb", "i", "v", "plain", "par", "pict", "tab", "bullet",	"cell", "row", "line", "endash", "emdash",	"###"};TStyleState style_state[MAX_LEVELS], curr_style;short curr_heading;void (*RTF_DoControl)(char*,char*);char isBody;char* title;FILE* f;short 	level,		/*current {} nesting level*/ 	skip_to_level,/*{} level to which parsing should skip (used to skip */ 	              /*  font tables, style sheets, color tables, etc.)    */ 	gobble,	/*Flag set to indicate all input should be discarded  */ 	ignore_styles;/*Set to ignore inline style expansions after style use*//**************************************/char RTF_GetChar(){	char ch;	do{		ch=fgetc(f);	} while ((ch=='\r')||(ch=='\n'));	return ch;}/**************************************/char RTF_UnGetChar(char ch){	return ungetc(ch,f);}/**************************************/void RTF_PutStr(char* s){	if (gobble) return;	fputs(s, stdout);}/**************************************//**************************************/void RTF_PutChar(char ch){	if (gobble) return;	if(!isBody)	{		isBody=TRUE;	}        # call catdoc function here 	}}/**************************************/void RTF_PlainStyle (TStyleState* s){	int i;	for(i=0;i<MAX_INLINE_STYLES;i++)	{		if(s->s & (1<<i))			RTF_PutStr(styles[i][1]);	}	s->s=0;}/**************************************/void RTF_SetStyle(TStyleState* s, StyleState style){	if( (!ignore_styles||(style==s_hidden)) && ((s->s&(1<<style))==0) )	{		RTF_PutStr(styles[style][0]);		s->s|=(1<<style);	}}/**************************************/void RTF_PushState(short* level){//	printf("<!--PushState=%X-->",curr_style.s);	if(*level>=MAX_LEVELS)	{		fprintf(stderr,"Exceed maximum level\n");		
exit(-1);	}	style_state[*level]=curr_style;	(*level)++;}/**************************************/void RTF_PopState(short* level){	int j;	TStyleState new_style;	//	printf("<!--PopState=%X-->",curr_style.s);	if(*level<1)	{		fprintf(stderr,"RTF parse error: unexpected '}'\n");		exit(-1);	}	new_style = style_state[*level-1];	/*close off any in-line styles*/	for (j=0;j<MAX_INLINE_STYLES;j++) 	{		if ( ((curr_style.s & (1<<j))!=0) && ((new_style.s & (1<<j))==0) )		{			curr_style.s &= ~(1<<j);			RTF_PutStr(styles[j][1]);		}	}		for (j=0;j<MAX_INLINE_STYLES;j++) 	{		if( ((curr_style.s & (1<<j))==0) && ((new_style.s & (1<<j))!=0) )			RTF_PutStr(styles[j][0]);	}	(*level)--;	curr_style = new_style;	if (*level == skip_to_level) {		skip_to_level = -1;		gobble = FALSE;	}}/**************************************//* Map a style number into a HTML heading */short RTF_MapStyle(char* s){	int i;	for (i=0;i<7;i++)		if (!strcmp(style_mappings[i], s))			return (i);	return (0);}/**************************************/void RTF_AddStyleMap(char* name, char* number){	int i, len;	len=strlen(name);	if( name[len-1]==';') name[--len]=0;	for(i=0;i<STYLE_NUMBER;i++)	{		if(!strcasecmp(name,style_namings[i]))		{			strcpy(style_mappings[i],number);			return;		}	}}/**************************************/void RTF_BuildName(char* token, char ch){	int len;	len = strlen(token);	if(len>=MAX_NAME_LEN-1)		return;	token[len]=ch;	token[len+1]=0;}/**************************************/void RTF_ClearName(char* token){	token[0]=0;}/**************************************/TokenIndex GetTokenIndex(char* control){	TokenIndex i;	for (i=t_start; i<t_end; i++) 	{		if(control[0]==tokens[i][0]) /* Added for fast compare */			if (!strcmp(control, tokens[i]))				break;	}	return i;}/**************************************/void RTF_DoStyleControl (char* control, char* arg){	if(GetTokenIndex(control)==t_s)	{		strcpy(style_number,arg);	}}/**************************************/int chartoi(char ch){	if((ch>='0')&&(ch<='9'))		return 
ch-'0';	if((ch>='A')&&(ch<='Z'))		return ch-'A'+10;	if((ch>='a')&&(ch<='z'))		return ch-'a'+10;	return -1;}/**************************************/void RTF_BuildArg (char ch, char* arg){	int i=0;	if(feof(f))	{		arg[0]=0;		return;	}	if(ch=='-')	{		arg[i++]='-';		ch=RTF_GetChar();		if(feof(f))		{			arg[0]=0;			return;		}	}	for(;isdigit(ch);i++)	{		arg[i]=ch;		if(i>=MAX_RTF_TOKEN-1)		{			arg[MAX_RTF_TOKEN-1]=0;			while(isdigit(ch)) {				ch=RTF_GetChar();				if(feof(f))					return;			} 			break;		}		ch=RTF_GetChar();		if(feof(f))		{			arg[i+1]=0;			return;		}	}	arg[i]=0;	if(!isspace(ch))		RTF_UnGetChar(ch);}	/**************************************/void RTF_BuildToken (char ch){	int i;		for(i=1;;i++)	{		char token[MAX_RTF_TOKEN], arg[MAX_RTF_TOKEN];		token[i-1]=ch;		if(i>=MAX_RTF_TOKEN-1)		{			do {				ch=RTF_GetChar();				if(feof(f))					return;			} while (isalpha(ch)); 				RTF_BuildArg(ch,arg);			return;		}		ch=RTF_GetChar();		if(feof(f))		{			token[i]=0;			RTF_DoControl(token,"");			return;		}		if( !isalpha(ch) )		{			token[i]=0;			RTF_BuildArg(ch,arg);			RTF_DoControl(token,arg);			return;		}	}}/**************************************/void RTF_backslash(char* pch, char* pf){	char ch;	*pf=FALSE;	ch=RTF_GetChar();	if(feof(f))	{		fprintf(stderr,"Unexpected end of file\n");		return;	}	switch (ch) 	{		case '\\':		case '{':		case '}':			*pch=ch; *pf=TRUE;			break;		case '*':			gobble = TRUE;	/*perform no output, ignore commands 'til level-1*/			if(skip_to_level>level-1||skip_to_level==-1) 				skip_to_level = level-1;			break;		case '\'':		{			char ch1, ch2;			ch1 = RTF_GetChar();			ch2 = RTF_GetChar();			if(!feof(f)) 			{				if(isxdigit(ch1)&&isxdigit(ch2))				{					ch = chartoi(ch1)*16+chartoi(ch2);					*pch=ch; *pf=TRUE;				} else {					fprintf(stderr,"RTF Error: unexpected '%c%c' after \\\'\n",ch1,ch2);				}			}			break;		}		default:			if (isalpha(ch)) 			{				RTF_BuildToken(ch);			} else {				fprintf(stderr, "\nRTF Error: unexpected '%c' after \\.\n", ch);			}			break;	
}}/**************************************/void RTF_ParseStyle(){	char ch, pf;	int level0;	void (*PrevDoControl)(char*,char*);		level0=level;	PrevDoControl=RTF_DoControl;	RTF_DoControl=RTF_DoStyleControl;		RTF_ClearName(style_name);	style_number[0]=0;	while (1) 	{		ch = RTF_GetChar();		if(feof(f))			break;		switch (ch) 		{			case '\\':				RTF_backslash(&ch,&pf);				if(pf)				{					RTF_BuildName(style_name,ch);				} else {					RTF_ClearName(style_name);				}				break;						case '{':				level++;				RTF_ClearName(style_name);				break;						case '}':				if(level0+1==level)				{					if(style_number[0]!=0)					{						RTF_AddStyleMap(style_name,style_number);						style_number[0]=0;					}				} else if(level0==level) {					RTF_DoControl=PrevDoControl;					RTF_UnGetChar(ch);					return;				}				level--;				RTF_ClearName(style_name); 				break;							default:				RTF_BuildName(style_name,ch);				break;		}	} /* while */}/**************************************//* Perform actions for RTF control words */void RTF_DoBodyControl (char* control,char* arg){	short style;	if (gobble) return;	switch (GetTokenIndex(control)) 	{		case t_stylesheet:			gobble = TRUE;	/*perform no output, ignore commands 'til level-1*/			skip_to_level = level-1;			RTF_ParseStyle();			break;		case t_fonttbl:	/*skip all of these and their contents!*/		case t_colortbl:		case t_info:			gobble = TRUE;	/*perform no output, ignore commands 'til level-1*/			skip_to_level = level-1;			break;		case t_pict:			gobble = TRUE;	/*perform no output, ignore commands 'til level-1*/			if(skip_to_level>=level || skip_to_level==-1) 				skip_to_level = level-1;			break;								case t_s: /*Style*/			if (!curr_heading) 			{				style = RTF_MapStyle (arg);				if(style)				{					curr_heading = s_h0 + style;					RTF_PutStr(styles[curr_heading][0]);					ignore_styles = TRUE;				}			}			break;					case t_b: /*Bold*/				RTF_SetStyle(&curr_style,s_bold);			break;					case t_ulw:		case t_uld:		case t_uldb:		case t_ul: /*Underline, maps to 
"emphasis" HTML style*/				RTF_SetStyle(&curr_style,s_underline);			break;					case t_i: /*Italic*/				RTF_SetStyle(&curr_style,s_italic);			break;					case t_v: /* Hidden*/				RTF_SetStyle(&curr_style,s_hidden);			break;					case t_par: /*Paragraph*/			if (curr_heading!=s_plain) {				RTF_PutStr(styles[curr_heading][1]);				curr_heading = s_plain;			} else {				RTF_PutStr(styles[s_para][0]);			}			ignore_styles = FALSE;			break;					case t_plain: /*reset inline styles*/			RTF_PlainStyle(&curr_style);			break;		case t_cell:		case t_tab:			RTF_PutChar('\t');			break;		case t_endash:			RTF_PutChar(2013);		case t_emdash:			RTF_PutChar(2014);			break;		case t_line:		case t_row:		        #RTF_PutStr(styles[s_br][0]);			break;		case t_bullet:			RTF_PutChar(2022);			break;		case t_start:		case t_end:			break;	}			}/**************************************//* RTF_Parse is a crude, ugly state machine that understands enough of *//* the RTF syntax to be dangerous.                                     */void RTF_ParseBody(){	char ch, pf;        Initialize();	RTF_DoControl=RTF_DoBodyControl;	level = 0;	skip_to_level = -1;	gobble = FALSE;	ignore_styles = FALSE;		while (1) 	{		ch = RTF_GetChar();		if(feof(f))			break;		switch (ch) 		{			case '\\':				RTF_backslash(&ch,&pf);				if(pf) RTF_PutChar(to_unicode(ch));				break;						case '{':				RTF_PushState(&level);				break;						case '}':				RTF_PopState(&level);				break;							default:				RTF_PutChar(ch);				break;		}	}/*while*/}/**************************************//**************************************/void Initialize(){	int i;	for (i=0;i<MAX_LEVELS;i++)			style_state[i].s=s_plain;	curr_style.s=s_plain;	curr_heading = s_plain;	// Set default styles maping	style_mappings[0][0]=0;	for(i=1;i<=6;i++)			sprintf(style_mappings[i],"%d",256-i);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -