📄 rtf2html.c
字号:
/************************************************************************
 * This program takes a stab at converting RTF (Rich Text Format) files
 * into HTML. There are some limitations that keep RTF from being able to
 * easily represent things like in-line images and anchors as styles. In
 * particular, RTF styles apply to entire "paragraphs", so anchors or
 * images in the middle of a text stream can't easily be represented by
 * styles. The intent is to ultimately use something like embedded text
 * color changes to represent these constructs.
 *
 * In the meantime, you can take existing Word documents, apply the
 * correct style sheet, and convert them to HTML with this tool.
 *
 * AUTHOR: Chuck Shotton, UT-Houston Academic Computing,
 *         cshotton@oac.hsc.uth.tmc.edu
 *
 *         Dmitry Potapov, CapitalSoft
 *         dpotapov@capitalsoft.com
 *
 * USAGE: rtf2html [rtf_filename]
 *
 * BEHAVIOR:
 *   rtf2html will open the specified RTF input file or read from
 *   standard input, writing converted HTML to standard output.
 *
 * NOTES:
 *   The RTF document must be formatted with a style sheet that has
 *   style numberings that conform to the style_mappings table
 *   defined in this source file.
 *
 * MODIFICATIONS:
 *   6/21/93  : Chuck Shotton - created version 1.0.
 *   11/26/98 : Dmitry Potapov - version 1.1 beta
 *   8/20/02  : Dmitry Potapov - version 1.2
 *              - read title from rtf document
 *              - field HYPERLINK support
 *              - automatic detect hyperlinks in text
 *
 ************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* strcasecmp: MSVC spells it _stricmp; elsewhere it is declared by the
 * POSIX header <strings.h>. */
#ifdef _MSC_VER
# define strcasecmp _stricmp
#else
# include <strings.h>
#endif

#ifndef TRUE
#define TRUE (-1)
#define FALSE 0
#endif

#define MAX_LEVELS 40        /*defines the # of nested in-line styles (pairs of {})*/
#define MAX_RTF_TOKEN 40
#define MAX_URL_LEN 256      /* defines the maximal URL length */
#define MAX_INLINE_STYLES 5  /*defines # of in-line styles, bold, italic, etc.*/

/* Set of active in-line styles: bit i is set when in-line style i (the first
 * MAX_INLINE_STYLES members of StyleState) is currently open. */
typedef struct tag_StyleState
{
    unsigned char s: MAX_INLINE_STYLES;
} TStyleState;

/* Style identifiers; each value is also the row index into styles[]. */
typedef enum {
    s_plain, s_bold, s_italic, s_underline, s_hidden,   /*in-line styles*/
    s_para, s_br,                                       /*pseudo style*/
    s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6            /*heading styles*/
} StyleState;

/* HTML start ([n][0]) and end ([n][1]) tags for styles, indexed by StyleState */
char *styles[][2] = {
    {"", ""},
    {"<b>", "</b>"},
    {"<i>", "</i>"},
    {"<u>", "</u>"},
    {"<!-- ", " -->"},
    {"<p>\n", ""},
    {"<br>\n",""},
    {"", ""},
    {"<h1>", "</h1>\n"},
    {"<h2>", "</h2>\n"},
    {"<h3>", "</h3>\n"},
    {"<h4>", "</h4>\n"},
    {"<h5>", "</h5>\n"},
    {"<h6>", "</h6>\n"}
};

/* style_mappings maps the style numbers in a RTF style sheet into one of the*/
/* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*/
/* Additional styles for lists, etc. should be added here. Style info        */
/* ultimately should be read from some sort of config file into these tables.*/
#define MAX_NAME_LEN 40
char style_name[MAX_NAME_LEN];

#define STYLE_NUMBER 7
/* Style-sheet names recognised by RTF_AddStyleMap; entry i corresponds to
 * style_mappings[i]. */
char *style_namings[STYLE_NUMBER] = {
    "", "heading 1", "heading 2", "heading 3", "heading 4", "heading 5",
    "heading 6"
};
/* style_mappings[i] holds the RTF style number (as text) recorded for
 * style_namings[i]. */
char style_mappings[STYLE_NUMBER][MAX_RTF_TOKEN];
/* Argument of the most recent \s control word seen by RTF_DoStyleControl. */
char style_number[MAX_RTF_TOKEN];

/* RTF tokens that mean something to the parser. All others are ignored.
*/typedef enum { t_start, t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_ul, t_ulw, t_uld, t_uldb, t_i, t_v, t_plain, t_par, t_pict, t_tab, t_bullet, t_cell, t_row, t_line, t_endash, t_emdash, t_field, t_fldinst, t_end} TokenIndex;char *tokens[] = { "###", "fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "ulw", "uld", "uldb", "i", "v", "plain", "par", "pict", "tab", "bullet", "cell", "row", "line", "endash", "emdash", "field", "fldinst", "###"};TStyleState style_state[MAX_LEVELS], curr_style;short curr_heading;void (*RTF_DoControl)(char*,char*);char isBody, isHead;FILE* f;short level, /*current {} nesting level*/ skip_to_level,/*{} level to which parsing should skip (used to skip */ /* font tables, style sheets, color tables, etc.) */ gobble, /*Flag set to indicate all input should be discarded */ ignore_styles;/*Set to ignore inline style expansions after style use*//**************************************/char RTF_GetChar(){ char ch; do{ ch=fgetc(f); } while ((ch=='\r')||(ch=='\n')); return ch;}/**************************************/char RTF_UnGetChar(char ch){ return ungetc(ch,f);}/**************************************/void RTF_StartHead(void){ fputs("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n<HTML>\n", stdout); fputs("<HEAD>\n", stdout); isHead=TRUE;}/**************************************/void RTF_StartBody(void){ if(!isHead) RTF_StartHead(); fputs("</HEAD>\n<BODY>\n", stdout); isBody=TRUE;}/**************************************//* The list of keywords, RTF_KeyWord will be called if one of them is met. * NOTE: we are looking for keywords at the beginning of each word, * i.e. at the first alphabetic character or digit after any other. 
*/const char* keyWords[] = { "http://", "mailto:", "ftp://"};#define NUM_KEYWORDS (sizeof(keyWords)/sizeof(keyWords[0]))int wordIndex=0; /* current keyword */int charIndex=0; /* position in keyword or -1 when skipping to the next word */void RTF_KeyWord(const char* keyword);void RTF_FlushBuffer(void);/**************************************/void RTF_PutStr(const char* s){ if (gobble) return; if(!isBody) RTF_StartBody(); if(charIndex>0) RTF_FlushBuffer(); charIndex=0; fputs(s, stdout);}/**************************************/void RTF_PutChar(char ch){ if(charIndex>0) RTF_FlushBuffer(); if (gobble) return; if(!isBody) RTF_StartBody(); switch (ch) { case '<': RTF_PutStr("<"); break; case '>': RTF_PutStr(">"); break; case '&': RTF_PutStr("&"); break; default: fputc(ch, stdout); }}/**************************************/void RTF_FlushBuffer(void){ int i, len, wi; len = charIndex; wi = wordIndex; wordIndex=charIndex=0; for(i=0;i<len;i++) RTF_PutChar(keyWords[wi][i]); return;}/**************************************/void RTF_PutCharEx(char ch){ if (gobble) return; if(charIndex<0) { if(!isalnum(ch)) charIndex=0; RTF_PutChar(ch); return; } if(keyWords[wordIndex][charIndex]!=ch) { int j; for(j=wordIndex+1;j<NUM_KEYWORDS;j++) { if( (charIndex==0||memcmp(keyWords+wordIndex, keyWords+j, charIndex)==0) && keyWords[j][charIndex]==ch ) break; } if(j<NUM_KEYWORDS) { wordIndex=j; } else { /* not found */ RTF_PutChar(ch); if(isalnum(ch)) charIndex=-1; else charIndex=0; return; } } charIndex++; if(charIndex>0) { if(keyWords[wordIndex][charIndex]==0) { const char *kw; kw = keyWords[wordIndex]; wordIndex=charIndex=0; RTF_KeyWord(kw); } return; }}/**************************************/void RTF_KeyWord(const char* keyword){ char buffer[MAX_URL_LEN]; int i, len; char ch;#define URL_STOP "\\<>{}" /* defines characters which cannot be part of URL */ /* read URL */ len = 0; while(1) { ch=RTF_GetChar(); if(feof(f)) break; if(strchr(URL_STOP,ch)!=NULL) { RTF_UnGetChar(ch); break; } 
buffer[len++]=ch; if(len==sizeof(buffer)) break; } /* output URL */ RTF_PutStr("<a href=\""); RTF_PutStr(keyword); for(i=0;i<len;i++) if(ch=='"') RTF_PutStr("""); else RTF_PutChar(buffer[i]); RTF_PutStr("\">"); for(i=0;i<len;i++) if(ch=='"') RTF_PutStr("""); else RTF_PutChar(buffer[i]); RTF_PutStr("</a>");}/**************************************/void RTF_PlainStyle (TStyleState* s){ int i; for(i=0;i<MAX_INLINE_STYLES;i++) { if(s->s & (1<<i)) RTF_PutStr(styles[i][1]); } s->s=0;}/**************************************/void RTF_SetStyle(TStyleState* s, StyleState style){ if( (!ignore_styles||(style==s_hidden)) && ((s->s&(1<<style))==0) ) { RTF_PutStr(styles[style][0]); s->s|=(1<<style); }}/**************************************/void RTF_PushState(short* level){// printf("<!--PushState=%X-->",curr_style.s); if(*level>=MAX_LEVELS) { fprintf(stderr,"Exceed maximum level\n"); exit(-1); } style_state[*level]=curr_style; (*level)++;}/**************************************/void RTF_PopState(short* level){ int j; TStyleState new_style;// printf("<!--PopState=%X-->",curr_style.s); if(*level<1) { fprintf(stderr,"RTF parse error: unexpected '}'\n"); exit(-1); } new_style = style_state[*level-1]; /*close off any in-line styles*/ for (j=0;j<MAX_INLINE_STYLES;j++) { if ( ((curr_style.s & (1<<j))!=0) && ((new_style.s & (1<<j))==0) ) { curr_style.s &= ~(1<<j); RTF_PutStr(styles[j][1]); } } for (j=0;j<MAX_INLINE_STYLES;j++) { if( ((curr_style.s & (1<<j))==0) && ((new_style.s & (1<<j))!=0) ) RTF_PutStr(styles[j][0]); } (*level)--; curr_style = new_style; if (*level == skip_to_level) { skip_to_level = -1; gobble = FALSE; }}/**************************************//* Map a style number into a HTML heading */short RTF_MapStyle(char* s){ int i; for (i=0;i<7;i++) if (!strcmp(style_mappings[i], s)) return (i); return (0);}/**************************************/void RTF_AddStyleMap(char* name, char* number){ int i, len; len=strlen(name); if( name[len-1]==';') name[--len]=0; 
for(i=0;i<STYLE_NUMBER;i++) { if(!strcasecmp(name,style_namings[i])) { strcpy(style_mappings[i],number); return; } }}/**************************************/void RTF_BuildName(char* token, char ch){ int len; len = strlen(token); if(len>=MAX_NAME_LEN-1) return; token[len]=ch; token[len+1]=0;}/**************************************/void RTF_ClearName(char* token){ token[0]=0;}/**************************************/TokenIndex GetTokenIndex(char* control){ TokenIndex i; for (i=t_start; i<t_end; i++) { if(control[0]==tokens[i][0]) /* Added for fast compare */ if (!strcmp(control, tokens[i])) break; } return i;}/**************************************/void RTF_DoStyleControl (char* control, char* arg){ if(GetTokenIndex(control)==t_s) { strcpy(style_number,arg); }}/**************************************/int chartoi(char ch){ if((ch>='0')&&(ch<='9')) return ch-'0'; if((ch>='A')&&(ch<='Z')) return ch-'A'+10; if((ch>='a')&&(ch<='z')) return ch-'a'+10; return -1;}/**************************************/void RTF_BuildArg (char ch, char* arg){ int i=0; if(feof(f)) { arg[0]=0; return; } if(ch=='-') { arg[i++]='-'; ch=RTF_GetChar(); if(feof(f)) { arg[0]=0; return; } } for(;isdigit(ch);i++) { arg[i]=ch; if(i>=MAX_RTF_TOKEN-1) { arg[MAX_RTF_TOKEN-1]=0; while(isdigit(ch)) { ch=RTF_GetChar(); if(feof(f)) return; } break; } ch=RTF_GetChar(); if(feof(f)) { arg[i+1]=0; return; } } arg[i]=0; if(!isspace(ch)) RTF_UnGetChar(ch);}/**************************************/void RTF_BuildToken (char ch){ int i; for(i=1;;i++) { char token[MAX_RTF_TOKEN], arg[MAX_RTF_TOKEN]; token[i-1]=ch; if(i>=MAX_RTF_TOKEN-1) { do { ch=RTF_GetChar(); if(feof(f)) return; } while (isalpha(ch)); RTF_BuildArg(ch,arg); return; } ch=RTF_GetChar(); if(feof(f)) { token[i]=0; RTF_DoControl(token,""); return; } if( !isalpha(ch) ) { token[i]=0; RTF_BuildArg(ch,arg); RTF_DoControl(token,arg); return; } }}/**************************************/void RTF_backslash(char* pch, char* pf){ char ch; *pf=FALSE; ch=RTF_GetChar();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -