📄 rtfclass.php
字号:
<?php
// The full open tag is used because the short form is only recognised when
// the short_open_tag ini setting is enabled.
// use tabstop=4
/*
Rich Text Format - Parsing Class
================================
(c) 2000 Markus Fischer
<mfischer@josefine.ben.tuwien.ac.at>
http://josefine.ben.tuwien.ac.at/~mfischer/
Latest versions of this class can always be found at
http://josefine.ben.tuwien.ac.at/~mfischer/developing/php/rtf/rtfclass.phps
Testing suite is available at
http://josefine.ben.tuwien.ac.at/~mfischer/developing/php/rtf/
License: GPLv2
Specification:
http://msdn.microsoft.com/library/default.asp?URL=/library/specs/rtfspec.htm
General Notes:
==============
Unknown or unsupported control symbols are silently ignored
Group stacking is still not supported :(
group stack logic implemented; however not really used yet
Example on how to use this class:
=================================
$r = new rtf( stripslashes( $rtf));
$r->output( "xml");
$r->parse();
if( count( $r->err) == 0) // no errors detected
echo $r->out;
History:
========
Sat Nov 25 09:52:12 CET 2000 mfischer
First version which has usable but only well-formed XML output; the RTF
data structure is only logically rebuilt, no real parsing yet
Mon Nov 27 16:17:18 CET 2000 mfischer
Wrote handler for \plain control word (thanks to Peter Kursawe for this
one)
Tue Nov 28 02:22:16 CET 2000 mfischer
Implemented alignment (left, center, right) with HTML <DIV .. tags
Also implemented translation for < and > character when outputting html or xml
Remarks:
========
This class and all work done here is dedicated to Tatjana.
*/
/*
    Experimental holder for character formatting state (bold, italic,
    underlined). The original author noted that it was only a spontaneous
    idea and that it may never actually be used.
*/
class rtfState {
    var $bold, $italic, $underlined;
}
class rtf {
var $rtf; // raw RTF input stream, stored exactly as passed to the constructor
var $len; // byte length of $rtf, cached once via strlen() so it does not have to be recomputed
var $err = array(); // list of error messages; stays empty when no error was detected
var $wantXML; // true when XML output was requested
var $wantHTML; // true when HTML output was requested
// $out is the only variable which should be accessed from the outside
var $out; // output data stream (content depends on which $wantXXX flag is set to true)
var $outstyles; // HTML-ified styles (generated after parsing if $wantHTML is set)
var $styles; // if $wantHTML is set, stylesheet definitions are collected in here
// internal parser variables --------------------------------
// control word variables
var $cword; // holds the current (or last) control word, depending on $cw
var $cw; // are we currently parsing a control word?
var $cfirst; // could this be the first character after a control start? if so, watch out for control symbols
var $flags = array(); // parser flags (defaults come from parserInit(), updates from parseControl())
var $queue; // buffer for 'plain' text: every character that is neither a special character nor part of a control word/symbol
var $stack = array(); // group stack
/*
    Keywords which don't follow the specification (used by Word '97 - 2000).
    Each entry is a regular-expression fragment; most allow an optional
    signed numeric parameter.
    Not yet used.

    The three row-width entries are spelled "trwWidth..." to match the
    keywords \trwWidth, \trwWidthA and \trwWidthB; the earlier "trwWith..."
    spelling could never match them.
*/
var $control_exception = array(
    "clFitText",
    "clftsWidth(-?[0-9]+)?",
    "clNoWrap(-?[0-9]+)?",
    "clwWidth(-?[0-9]+)?",
    "tdfrmtxtBottom(-?[0-9]+)?",
    "tdfrmtxtLeft(-?[0-9]+)?",
    "tdfrmtxtRight(-?[0-9]+)?",
    "tdfrmtxtTop(-?[0-9]+)?",
    "trftsWidthA(-?[0-9]+)?",
    "trftsWidthB(-?[0-9]+)?",
    "trftsWidth(-?[0-9]+)?",
    "trwWidthA(-?[0-9]+)?",
    "trwWidthB(-?[0-9]+)?",
    "trwWidth(-?[0-9]+)?",
    "spectspecifygen(-?[0-9]+)?"
);
/*
    Maps the numeric parameter of the \fcharset control word to a readable
    character set name.

    PC 437 is keyed as 254: previously both "PC 437" and "OEM" used the key
    255, so the later entry silently replaced the earlier one and PC 437
    could never be looked up.
*/
var $charset_table = array(
    "0" => "ANSI",
    "1" => "Default",
    "2" => "Symbol",
    "77" => "Mac",
    "128" => "Shift Jis",
    "129" => "Hangul",
    "130" => "Johab",
    "134" => "GB2312",
    "136" => "Big5",
    "161" => "Greek",
    "162" => "Turkish",
    "163" => "Vietnamese",
    "177" => "Hebrew",
    "178" => "Arabic",
    "179" => "Arabic Traditional",
    "180" => "Arabic user",
    "181" => "Hebrew user",
    "186" => "Baltic",
    "204" => "Russian",
    "222" => "Thai",
    "238" => "Eastern European",
    "254" => "PC 437",
    "255" => "OEM"
);
/*
	Note: the only conversion table actually used.
	The keys are the names of the font modifier flags that parseControl()
	stores in $this->flags; the values are presumably the HTML element names
	emitted for them (the consuming code is not in this part of the file --
	verify there).
*/
var $fontmodifier_table = array(
"bold" => "b",
"italic" => "i",
"underlined" => "u",
"strikethru" => "strike"
);
/*
    Class Constructor:
    Takes as argument the raw RTF stream
    (Note: under certain circumstances the stream has to be stripslash'ed
    before it is handed over)
    Initialises the class-global variables and records an error in
    $this->err when the stream is empty.

    Implemented as __construct() because a method named after the class is
    no longer treated as a constructor by PHP 8; without it "new rtf( $data)"
    would leave the object uninitialised there.

    NOTE(review): $this->text is assigned here but is not declared among the
    class properties visible in this part of the file.
*/
function __construct( $data) {
    $this->rtf = $data;
    $this->len = strlen( $data);
    $this->wantXML = false;
    $this->wantHTML = false;
    $this->out = "";
    $this->outstyles = "";
    $this->styles = array();
    $this->text = "";
    if( $this->len == 0) {
        $this->err[] = "No data in stream found";
    }
}

/*
    Old-style (PHP 4) constructor name.
    Kept so that existing explicit calls such as parent::rtf( $data) keep
    working, and so that interpreters which only know class-named
    constructors still initialise the object; it simply delegates to
    __construct().
*/
function rtf( $data) {
    $this->__construct( $data);
}
/*
    Resets the parser flags to their start-of-document defaults
    (default values according to the specs): a font size of 24 and the
    marker that a new paragraph begins. Any previously stored flag is
    discarded because the whole array is replaced.
*/
function parserInit() {
    $defaults = array();
    $defaults["fontsize"] = 24;
    $defaults["beginparagraph"] = true;
    $this->flags = $defaults;
}
/*
    Sets the output type.
    "xml" switches on $this->wantXML, "html" switches on $this->wantHTML;
    any other value is ignored. A flag is only ever switched on here, never
    off, so calling this method with both values leaves both flags set.
*/
function output( $typ) {
    // loose comparison on purpose: it mirrors what a switch statement does
    if( $typ == "xml") {
        $this->wantXML = true;
    } elseif( $typ == "html") {
        $this->wantHTML = true;
    }
}
/*
    Applies one control word to the parser state held in $this->flags.

    $control    name of the control word, without the leading backslash
    $parameter  parameter of the control word; for the on/off style
                modifiers a parameter equal to "0" switches the modifier
                off, every other value switches it on

    Control words not listed here are silently ignored. The only direct
    output is the closing </div> written for "par" when HTML is wanted.
*/
function parseControl( $control, $parameter) {
    switch( $control) {
        // font table definition start
        case "fonttbl":
            // signal the font table control words that they are allowed to behave as expected
            $this->flags["fonttbl"] = true;
            break;

        // define or set font
        case "f":
            // If the flag is set, the font table definition is written to;
            // otherwise it is read from. !empty() is used because "fonttbl"
            // is not among the defaults set by parserInit(), so a plain
            // index read would raise an undefined-index notice.
            if( !empty( $this->flags["fonttbl"])) {
                $this->flags["fonttbl_current_write"] = $parameter;
            } else {
                $this->flags["fonttbl_current_read"] = $parameter;
            }
            break;

        case "fcharset":
            // original author's note: this prepares flushQueue, which then
            // moves the queue to the font table instead of the formatted output
            $this->flags["fonttbl_want_fcharset"] = $parameter;
            break;

        case "fs":
            // sets the current font size; used by the stylesheets (which are
            // therefore generated on the fly)
            $this->flags["fontsize"] = $parameter;
            break;

        // handle alignment
        case "ql":
            // left alignment uses the same empty value that "pard" resets to,
            // so a previous "qc"/"qr" can be switched off again
            $this->flags["alignment"] = "";
            break;
        case "qc":
            $this->flags["alignment"] = "center";
            break;
        case "qr":
            $this->flags["alignment"] = "right";
            break;

        // reset paragraph settings (only alignment)
        case "pard":
            $this->flags["alignment"] = "";
            break;

        // define new paragraph (for now, that's a simple break in html)
        case "par":
            // begin new line
            $this->flags["beginparagraph"] = true;
            if( $this->wantHTML) {
                $this->out .= "</div>";
            }
            break;

        // bold
        case "bnone":
            $parameter = "0";
            // intentional fall through
        case "b":
            // the (string) cast keeps this a comparison against the string "0"
            $this->flags["bold"] = ( (string)$parameter != "0");
            break;

        // underlined
        case "ulnone":
            $parameter = "0";
            // intentional fall through
        case "ul":
            $this->flags["underlined"] = ( (string)$parameter != "0");
            break;

        // italic
        case "inone":
            $parameter = "0";
            // intentional fall through
        case "i":
            $this->flags["italic"] = ( (string)$parameter != "0");
            break;

        // strikethru
        case "strikenone":
            $parameter = "0";
            // intentional fall through
        case "strike":
            $this->flags["strikethru"] = ( (string)$parameter != "0");
            break;

        // reset all font modifiers and the font size
        case "plain":
            $this->flags["bold"] = false;
            $this->flags["italic"] = false;
            $this->flags["underlined"] = false;
            $this->flags["strikethru"] = false;
            // "fontsize" holds the raw "fs" parameter, which is in half-points:
            // 12 point is therefore 24, the same default parserInit() uses
            $this->flags["fontsize"] = 24;
            $this->flags["subscription"] = false;
            $this->flags["superscription"] = false;
            break;

        // sub and superscription
        case "subnone":
            $parameter = "0";
            // intentional fall through
        case "sub":
            $this->flags["subscription"] = ( (string)$parameter != "0");
            break;

        case "supernone":
            $parameter = "0";
            // intentional fall through
        case "super":
            $this->flags["superscription"] = ( (string)$parameter != "0");
            break;
    }
}
/*
Dispatch the control word to the output stream
*/
function flushControl() {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -