⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rtf2html.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
/************************************************************************ * This program takes a stab at converting RTF (Rich Text Format) files * into HTML. There are some limitations that keep RTF from being able to * easily represent things like in-line images and anchors as styles. In * particular, RTF styles apply to entire "paragraphs", so anchors or * images in the middle of a text stream can't easily be represented by * styles. The intent is to ultimately use something like embedded text * color changes to represent these constructs. * * In the meantime, you can take existing Word documents, apply the * correct style sheet, and convert them to HTML with this tool. * * AUTHOR: Chuck Shotton, UT-Houston Academic Computing, *         cshotton@oac.hsc.uth.tmc.edu * *         Dmitry Potapov, CapitalSoft *         dpotapov@capitalsoft.com * * USAGE: rtf2html [rtf_filename] * * BEHAVIOR: *        rtf2html will open the specified RTF input file or read from *        standard input, writing converted HTML to standard output. * * NOTES: *        The RTF document must be formatted with a style sheet that has *        style numberings that conform to the style_mappings table *        defined in this source file. * * MODIFICATIONS: *         6/21/93 : Chuck Shotton - created version 1.0. 
*        11/26/98 : Dmitry Potapov - version 1.1 beta *         8/20/02 : Dmitry Potapov - version 1.2 *                - read title from rtf document *                - field HYPERLINK support *                - automatic detect hyperlinks in text * ************************************************************************/#include <stdlib.h>#include <stdio.h>#include <string.h>#include <ctype.h>#ifdef _MSC_VER#	define	strcasecmp _stricmp#endif#ifndef TRUE#define TRUE -1#define FALSE 0#endif#define MAX_LEVELS 40	/*defines the # of nested in-line styles (pairs of {})*/#define MAX_RTF_TOKEN 40#define MAX_URL_LEN 256 /* defines the maximal URL length */#define MAX_INLINE_STYLES 5 /*defines # of in-line styles, bold, italic, etc.*/typedef struct tag_StyleState{	unsigned char s: MAX_INLINE_STYLES;} TStyleState;typedef enum { s_plain, s_bold, s_italic, s_underline, s_hidden, /*in-line styles*/	s_para,	s_br,	  /*pseudo style*/	s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6 /*heading styles*/} StyleState;char *styles[][2] = {		/*HTML Start and end tags for styles*/	{"", ""},	{"<b>", "</b>"},	{"<i>", "</i>"},	{"<u>", "</u>"},	{"<!-- ", " -->"},	{"<p>\n", ""},	{"<br>\n",""},	{"", ""},	{"<h1>", "</h1>\n"},	{"<h2>", "</h2>\n"},	{"<h3>", "</h3>\n"},	{"<h4>", "</h4>\n"},	{"<h5>", "</h5>\n"},	{"<h6>", "</h6>\n"}};/* style_mappings maps the style numbers in a RTF style sheet into one of the*//* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*//* Additional styles for lists, etc. should be added here. Style info        *//* ultimately should be read from some sort of config file into these tables.*/#define MAX_NAME_LEN 40char style_name[MAX_NAME_LEN];#define STYLE_NUMBER 7char *style_namings[STYLE_NUMBER] = {	"", "heading 1", "heading 2", "heading 3", "heading 4", "heading 5",	"heading 6"};char style_mappings[STYLE_NUMBER][MAX_RTF_TOKEN];char style_number[MAX_RTF_TOKEN];/* RTF tokens that mean something to the parser. All others are ignored. 
*/typedef enum {	t_start,	t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_ul, t_ulw,	t_uld, t_uldb, t_i, t_v, t_plain, t_par, t_pict, t_tab, t_bullet,	t_cell, t_row, t_line, t_endash, t_emdash,	t_field, t_fldinst,	t_end} TokenIndex;char *tokens[] = {	"###",	"fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "ulw",	"uld", "uldb", "i", "v", "plain", "par", "pict", "tab", "bullet",	"cell", "row", "line", "endash", "emdash",	"field", "fldinst",	"###"};TStyleState style_state[MAX_LEVELS], curr_style;short curr_heading;void (*RTF_DoControl)(char*,char*);char isBody, isHead;FILE* f;short 	level,		/*current {} nesting level*/	skip_to_level,/*{} level to which parsing should skip (used to skip */	              /*  font tables, style sheets, color tables, etc.)    */	gobble,	/*Flag set to indicate all input should be discarded  */	ignore_styles;/*Set to ignore inline style expansions after style use*//**************************************/char RTF_GetChar(){	char ch;	do{		ch=fgetc(f);	} while ((ch=='\r')||(ch=='\n'));	return ch;}/**************************************/char RTF_UnGetChar(char ch){	return ungetc(ch,f);}/**************************************/void RTF_StartHead(void){	fputs("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n<HTML>\n", stdout);	fputs("<HEAD>\n", stdout);	isHead=TRUE;}/**************************************/void RTF_StartBody(void){	if(!isHead) RTF_StartHead();	fputs("</HEAD>\n<BODY>\n", stdout);	isBody=TRUE;}/**************************************//* The list of keywords, RTF_KeyWord will be called if one of them is met. * NOTE: we are looking for keywords at the beginning of each word, * i.e. at the first alphabetic character or digit after any other. 
*/const char* keyWords[] = {	"http://", "mailto:", "ftp://"};#define NUM_KEYWORDS (sizeof(keyWords)/sizeof(keyWords[0]))int wordIndex=0; /* current keyword */int charIndex=0; /* position in keyword or -1 when skipping		    to the next word */void RTF_KeyWord(const char* keyword);void RTF_FlushBuffer(void);/**************************************/void RTF_PutStr(const char* s){	if (gobble) return;	if(!isBody) RTF_StartBody();	if(charIndex>0) RTF_FlushBuffer();	charIndex=0;	fputs(s, stdout);}/**************************************/void RTF_PutChar(char ch){	if(charIndex>0) RTF_FlushBuffer();	if (gobble) return;	if(!isBody) RTF_StartBody();	switch (ch) {		case '<':			RTF_PutStr("&lt;");			break;		case '>':			RTF_PutStr("&gt;");			break;		case '&':			RTF_PutStr("&amp;");			break;		default:			fputc(ch, stdout);	}}/**************************************/void RTF_FlushBuffer(void){	int i, len, wi;	len = charIndex;	wi = wordIndex;	wordIndex=charIndex=0;	for(i=0;i<len;i++)		RTF_PutChar(keyWords[wi][i]);	return;}/**************************************/void RTF_PutCharEx(char ch){	if (gobble) return;	if(charIndex<0)	{		if(!isalnum(ch))			charIndex=0;		RTF_PutChar(ch);		return;	}	if(keyWords[wordIndex][charIndex]!=ch)	{		int j;		for(j=wordIndex+1;j<NUM_KEYWORDS;j++)		{			if(		(charIndex==0||memcmp(keyWords+wordIndex, keyWords+j, charIndex)==0)		&& keyWords[j][charIndex]==ch				)				break;		}		if(j<NUM_KEYWORDS)		{			wordIndex=j;		} else {			/* not found */			RTF_PutChar(ch);			if(isalnum(ch))				charIndex=-1;			else				charIndex=0;			return;		}	}	charIndex++;	if(charIndex>0)	{		if(keyWords[wordIndex][charIndex]==0)		{			const char *kw;			kw = keyWords[wordIndex];			wordIndex=charIndex=0;			RTF_KeyWord(kw);		}		return;	}}/**************************************/void RTF_KeyWord(const char* keyword){	char buffer[MAX_URL_LEN];	int i, len;	char ch;#define URL_STOP "\\<>{}" /* defines characters which cannot be part of URL */	/* read URL */	len = 0;	while(1)	{		ch=RTF_GetChar();		
if(feof(f))			break;		if(strchr(URL_STOP,ch)!=NULL)		{			RTF_UnGetChar(ch);			break;		}		buffer[len++]=ch;		if(len==sizeof(buffer))			break;	}	/* output URL */	RTF_PutStr("<a href=\"");	RTF_PutStr(keyword);	for(i=0;i<len;i++)		if(ch=='"')			RTF_PutStr("&#34;");		else			RTF_PutChar(buffer[i]);	RTF_PutStr("\">");	for(i=0;i<len;i++)		if(ch=='"')			RTF_PutStr("&#34;");		else			RTF_PutChar(buffer[i]);	RTF_PutStr("</a>");}/**************************************/void RTF_PlainStyle (TStyleState* s){	int i;	for(i=0;i<MAX_INLINE_STYLES;i++)	{		if(s->s & (1<<i))			RTF_PutStr(styles[i][1]);	}	s->s=0;}/**************************************/void RTF_SetStyle(TStyleState* s, StyleState style){	if( (!ignore_styles||(style==s_hidden)) && ((s->s&(1<<style))==0) )	{		RTF_PutStr(styles[style][0]);		s->s|=(1<<style);	}}/**************************************/void RTF_PushState(short* level){//	printf("<!--PushState=%X-->",curr_style.s);	if(*level>=MAX_LEVELS)	{		fprintf(stderr,"Exceed maximum level\n");		exit(-1);	}	style_state[*level]=curr_style;	(*level)++;}/**************************************/void RTF_PopState(short* level){	int j;	TStyleState new_style;//	printf("<!--PopState=%X-->",curr_style.s);	if(*level<1)	{		fprintf(stderr,"RTF parse error: unexpected '}'\n");		exit(-1);	}	new_style = style_state[*level-1];	/*close off any in-line styles*/	for (j=0;j<MAX_INLINE_STYLES;j++)	{		if ( ((curr_style.s & (1<<j))!=0) && ((new_style.s & (1<<j))==0) )		{			curr_style.s &= ~(1<<j);			RTF_PutStr(styles[j][1]);		}	}	for (j=0;j<MAX_INLINE_STYLES;j++)	{		if( ((curr_style.s & (1<<j))==0) && ((new_style.s & (1<<j))!=0) )			RTF_PutStr(styles[j][0]);	}	(*level)--;	curr_style = new_style;	if (*level == skip_to_level) {		skip_to_level = -1;		gobble = FALSE;	}}/**************************************//* Map a style number into a HTML heading */short RTF_MapStyle(char* s){	int i;	for (i=0;i<7;i++)		if (!strcmp(style_mappings[i], s))			return (i);	return 
(0);}/**************************************/void RTF_AddStyleMap(char* name, char* number){	int i, len;	len=strlen(name);	if( name[len-1]==';') name[--len]=0;	for(i=0;i<STYLE_NUMBER;i++)	{		if(!strcasecmp(name,style_namings[i]))		{			strcpy(style_mappings[i],number);			return;		}	}}/**************************************/void RTF_BuildName(char* token, char ch){	int len;	len = strlen(token);	if(len>=MAX_NAME_LEN-1)		return;	token[len]=ch;	token[len+1]=0;}/**************************************/void RTF_ClearName(char* token){	token[0]=0;}/**************************************/TokenIndex GetTokenIndex(char* control){	TokenIndex i;	for (i=t_start; i<t_end; i++)	{		if(control[0]==tokens[i][0]) /* Added for fast compare */			if (!strcmp(control, tokens[i]))				break;	}	return i;}/**************************************/void RTF_DoStyleControl (char* control, char* arg){	if(GetTokenIndex(control)==t_s)	{		strcpy(style_number,arg);	}}/**************************************/int chartoi(char ch){	if((ch>='0')&&(ch<='9'))		return ch-'0';	if((ch>='A')&&(ch<='Z'))		return ch-'A'+10;	if((ch>='a')&&(ch<='z'))		return ch-'a'+10;	return -1;}/**************************************/void RTF_BuildArg (char ch, char* arg){	int i=0;	if(feof(f))	{		arg[0]=0;		return;	}	if(ch=='-')	{		arg[i++]='-';		ch=RTF_GetChar();		if(feof(f))		{			arg[0]=0;			return;		}	}	for(;isdigit(ch);i++)	{		arg[i]=ch;		if(i>=MAX_RTF_TOKEN-1)		{			arg[MAX_RTF_TOKEN-1]=0;			while(isdigit(ch)) {				ch=RTF_GetChar();				if(feof(f))					return;			}			break;		}		ch=RTF_GetChar();		if(feof(f))		{			arg[i+1]=0;			return;		}	}	arg[i]=0;	if(!isspace(ch))		RTF_UnGetChar(ch);}/**************************************/void RTF_BuildToken (char ch){	int i;	for(i=1;;i++)	{		char token[MAX_RTF_TOKEN], arg[MAX_RTF_TOKEN];		token[i-1]=ch;		if(i>=MAX_RTF_TOKEN-1)		{			do {				ch=RTF_GetChar();				if(feof(f))					return;			} while (isalpha(ch));			RTF_BuildArg(ch,arg);			return;		}		ch=RTF_GetChar();		if(feof(f))		{			token[i]=0;		
	RTF_DoControl(token,"");			return;		}		if( !isalpha(ch) )		{			token[i]=0;			RTF_BuildArg(ch,arg);			RTF_DoControl(token,arg);			return;		}	}}/**************************************/void RTF_backslash(char* pch, char* pf){	char ch;	*pf=FALSE;	ch=RTF_GetChar();

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -