Version: 0.2
Type: Class
Category: Other
License: GNU General Public License
Description: This class is in the *very early* stages of development and knows only *a limited* number of RTF control words. It can convert them to HTML or XML (no DTD, just for debugging). An up-to-date version can be found at http://josefine.ben.tuwien.ac.at/~mfischer/developing/php/rtf/rtfclass.phps , and a test form is available at http://josefine.ben.tuwien.ac.at/~mfischer/developing/php/rtf/ . Note that you can't just copy/paste WinWord-generated RTF code yet, but the class is able to understand a subset of, e.g., the RTF code generated by Visual Basic's RichTextFormat control (that's what I need it for 😉).
[email protected] , 27.11.2000
<?php
// use tabstop=4

/*
    Rich Text Format - Parsing Class
    ================================

    (c) 2000 Markus Fischer
    <[email protected]>
    http://josefine.ben.tuwien.ac.at/~mfischer/

    Latest versions of this class can always be found at
        http://josefine.ben.tuwien.ac.at/~mfischer/developing/php/rtf/rtfclass.phps
    Testing suite is available at
        http://josefine.ben.tuwien.ac.at/~mfischer/developing/php/rtf/

    License: GPLv2

    Specification:
        http://msdn.microsoft.com/library/default.asp?URL=/library/specs/rtfspec.htm

    General Notes:
    ==============
    Unknown or unsupported control symbols are silently ignored

    Group stacking is still not supported :(
        group stack logic implemented; however not really used yet

    Example on how to use this class:
    =================================

        $r = new rtf( stripslashes( $rtf));
        $r->output( "xml");
        $r->parse();
        if( count( $r->err) == 0) // no errors detected
            echo $r->out;

    History:
    ========
    Sat Nov 25 09:52:12 CET 2000    mfischer
        First version which has useable but only well-formed xml output; rtf
        data structure is only logically rebuild, no real parsing yet

    Mon Nov 27 16:17:18 CET 2000    mfischer
        Wrote handler for plain control word (thanks to Peter Kursawe for this
        one)

    Tue Nov 28 02:22:16 CET 2000    mfischer
        Implemented alignment (left, center, right) with HTML <DIV .. tags
        Also implemented translation for < and > character when outputting
        html or xml

    Remarks:
    ========
    This class and all work done here is dedicated to Tatjana.
*/

/* was just a brainlag suggestion of my inner link; don't know if I'll use it */
class rtfState
{
    public $bold;
    public $italic;
    public $underlined;
}

/**
 * Minimal RTF parser that turns a subset of RTF into a debugging XML stream
 * or into simple HTML (<div>/<span> plus a generated stylesheet).
 *
 * Usage: construct with the raw RTF, choose the output type with output(),
 * call parse(), then read $out (and $outstyles for HTML). Problems are
 * collected in $err. HTML text is only emitted once a font has been selected
 * with \fN outside of the font table.
 */
class rtf
{
    /** Default font size in half-points (12pt), as used by \fs parameters. */
    const DEFAULT_FONTSIZE = 24;

    public $rtf;                // rtf core stream
    public $len;                // length in bytes of the stream (avoids calling strlen every time)
    public $err = [];           // array of error messages, empty when no error occurred

    public $wantXML;            // convert to XML
    public $wantHTML;           // convert to HTML

    // the only variable which should be accessed from the outside
    public $out;                // output data stream (depends on which $wantXXXXX is set to true)
    public $outstyles;          // htmlified styles (generated after parsing if wantHTML)
    public $styles;             // if wantHTML, stylesheet definitions are put in here

    public $text;               // initialised by the constructor; not used by the parser itself
    public $fonttable = [];     // font names collected from the font table, see flushQueue()
    public $last_flags = [];    // flags of the most recently closed group

    // internal parser variables --------------------------------
    // control word variables
    public $cword;              // holds the current (or last) control word, depending on $cw
    public $cw;                 // are we currently parsing a control word ?
    public $cfirst;             // could this be the first character ? so watch out for control symbols

    public $flags = [];         // parser flags

    public $queue;              // every character which is no special char and does not belong to a
                                // control word/symbol; is generally considered being 'plain'

    public $stack = [];         // group stack

    /* keywords which don't follow the specification (used by Word '97 - 2000) */
    // not yet used
    public $control_exception = [
        "clFitText",
        "clftsWidth(-?[0-9]+)?",
        "clNoWrap(-?[0-9]+)?",
        "clwWidth(-?[0-9]+)?",
        "tdfrmtxtBottom(-?[0-9]+)?",
        "tdfrmtxtLeft(-?[0-9]+)?",
        "tdfrmtxtRight(-?[0-9]+)?",
        "tdfrmtxtTop(-?[0-9]+)?",
        "trftsWidthA(-?[0-9]+)?",
        "trftsWidthB(-?[0-9]+)?",
        "trftsWidth(-?[0-9]+)?",
        "trwWithA(-?[0-9]+)?",
        "trwWithB(-?[0-9]+)?",
        "trwWith(-?[0-9]+)?",
        "spectspecifygen(-?[0-9]+)?",
    ];

    // \fcharset values; "PC 437" used to share key 255 with "OEM" and was
    // therefore silently overwritten - the RTF specification lists it as 254.
    public $charset_table = [
        "0" => "ANSI",
        "1" => "Default",
        "2" => "Symbol",
        "77" => "Mac",
        "128" => "Shift Jis",
        "129" => "Hangul",
        "130" => "Johab",
        "134" => "GB2312",
        "136" => "Big5",
        "161" => "Greek",
        "162" => "Turkish",
        "163" => "Vietnamese",
        "177" => "Hebrew",
        "178" => "Arabic",
        "179" => "Arabic Traditional",
        "180" => "Arabic user",
        "181" => "Hebrew user",
        "186" => "Baltic",
        "204" => "Russion",
        "222" => "Thai",
        "238" => "Eastern European",
        "254" => "PC 437",
        "255" => "OEM",
    ];

    /* note: the only conversion table used */
    public $fontmodifier_table = [
        "bold" => "b",
        "italic" => "i",
        "underlined" => "u",
        "strikethru" => "strike",
    ];

    /**
     * Class constructor.
     *
     * Takes the raw RTF stream (note: under certain circumstances the stream
     * has to be stripslash'ed before handing it over) and initialises the
     * class-global variables. An empty stream is recorded in $err.
     *
     * @param string $data raw RTF stream
     */
    public function __construct($data)
    {
        $data = (string) $data;

        $this->len = strlen($data);
        $this->rtf = $data;

        $this->wantXML = false;
        $this->wantHTML = false;

        $this->out = "";
        $this->outstyles = "";
        $this->styles = [];
        $this->text = "";

        if ($this->len == 0) {
            array_push($this->err, "No data in stream found");
        }
    }

    /**
     * PHP 4 style constructor name, kept as a plain method so that existing
     * code calling rtf() explicitly (e.g. parent::rtf()) keeps working.
     *
     * @param string $data raw RTF stream
     */
    public function rtf($data)
    {
        $this->__construct($data);
    }

    /**
     * Resets the parser flags to their defaults; called at the start of parse().
     */
    public function parserInit()
    {
        $this->flags = $this->defaultFlags();
    }

    /**
     * Default flag values. Every flag the parser reads is listed here so that
     * no lookup hits an undefined key.
     *
     * @return array
     */
    private function defaultFlags()
    {
        return [
            "fontsize" => self::DEFAULT_FONTSIZE,
            "beginparagraph" => true,
            "alignment" => "",
            "bold" => false,
            "italic" => false,
            "underlined" => false,
            "strikethru" => false,
            "fonttbl" => false,
            "fonttbl_current_write" => "",
            "fonttbl_current_read" => "",
            "fonttbl_want_fcharset" => "",
        ];
    }

    /**
     * Tells whether $value consists of decimal digits only.
     *
     * @param mixed $value
     * @return bool
     */
    private function isNumericId($value)
    {
        return preg_match('/^[0-9]+$/D', (string) $value) === 1;
    }

    /**
     * Sets the output type.
     *
     * @param string $typ "xml" or "html"; anything else is ignored
     */
    public function output($typ)
    {
        switch ($typ) {
            case "xml":
                $this->wantXML = true;
                break;
            case "html":
                $this->wantHTML = true;
                break;
            default:
                break;
        }
    }

    /**
     * Applies a single control word to the parser flags (and, for \par in
     * HTML mode, closes the current <div>). Unknown control words are ignored.
     *
     * @param string $control   control word without the leading backslash
     * @param string $parameter numeric parameter as found in the stream, "" if absent
     */
    public function parseControl($control, $parameter)
    {
        switch ($control) {
            // font table definition start
            case "fonttbl":
                // signal fonttable control words they are allowed to behave as expected
                $this->flags["fonttbl"] = true;
                break;

            // define or set font
            case "f":
                // inside the font table the id is written to, otherwise it is read from
                if (!empty($this->flags["fonttbl"])) {
                    $this->flags["fonttbl_current_write"] = $parameter;
                } else {
                    $this->flags["fonttbl_current_read"] = $parameter;
                }
                break;

            case "fcharset":
                // this is for preparing flushQueue; it then moves the queue to
                // $this->fonttable .. instead of to the formatted output
                $this->flags["fonttbl_want_fcharset"] = $parameter;
                break;

            case "fs":
                // sets the current fontsize; is used by stylesheets (which are
                // therefore generated on the fly)
                $this->flags["fontsize"] = $parameter;
                break;

            // handle alignment
            case "ql":
                // \ql was missing, so a paragraph could never switch back to left
                $this->flags["alignment"] = "left";
                break;
            case "qc":
                $this->flags["alignment"] = "center";
                break;
            case "qr":
                $this->flags["alignment"] = "right";
                break;

            // reset paragraph settings (only alignment)
            case "pard":
                $this->flags["alignment"] = "";
                break;

            // define new paragraph (for now, that's a simple break in html)
            case "par":
                // begin new line
                $this->flags["beginparagraph"] = true;
                if ($this->wantHTML) {
                    $this->out .= "</div>";
                }
                break;

            // bold
            case "bnone":
                $parameter = "0";
                // intentional fall-through
            case "b":
                // the parameter arrives as a string, hence the string comparison
                $this->flags["bold"] = ((string) $parameter != "0");
                break;

            // underlined
            case "ulnone":
                $parameter = "0";
                // intentional fall-through
            case "ul":
                $this->flags["underlined"] = ((string) $parameter != "0");
                break;

            // italic
            case "inone":
                $parameter = "0";
                // intentional fall-through
            case "i":
                $this->flags["italic"] = ((string) $parameter != "0");
                break;

            // strikethru
            case "strikenone":
                $parameter = "0";
                // intentional fall-through
            case "strike":
                $this->flags["strikethru"] = ((string) $parameter != "0");
                break;

            // reset all font modifiers and the fontsize to its default (12pt)
            case "plain":
                $this->flags["bold"] = false;
                $this->flags["italic"] = false;
                $this->flags["underlined"] = false;
                $this->flags["strikethru"] = false;
                // sizes are kept in half-points like \fs parameters, so 12pt is 24
                $this->flags["fontsize"] = self::DEFAULT_FONTSIZE;
                break;
        }
    }

    /**
     * Dispatches the collected control word: applies it via parseControl()
     * and, in XML mode, writes a <control/> element to the output stream.
     */
    public function flushControl()
    {
        if (preg_match('/^([A-Za-z]+)(-?[0-9]*) ?$/D', (string) $this->cword, $match)) {
            $this->parseControl($match[1], $match[2]);
            if ($this->wantXML) {
                $this->out .= "<control word=\"" . $match[1] . "\"";
                if (strlen($match[2]) > 0) {
                    $this->out .= " param=\"" . $match[2] . "\"";
                }
                $this->out .= "/>";
            }
        }
    }

    /**
     * If the output stream supports comments, dispatches one.
     *
     * @param string $comment comment text
     */
    public function flushComment($comment)
    {
        if ($this->wantXML || $this->wantHTML) {
            $this->out .= "<!-- " . $comment . " -->";
        }
    }

    /**
     * Dispatches start/end of logical rtf groups (not every output type needs
     * it; merely debugging purpose) and saves/restores the flags on the stack.
     *
     * @param string $state "open" or "close"
     */
    public function flushGroup($state)
    {
        if ($state == "open") {
            /* push onto the stack */
            array_push($this->stack, $this->flags);

            if ($this->wantXML) {
                $this->out .= "<group>";
            }
        }
        if ($state == "close") {
            /* pop from the stack */
            $this->last_flags = $this->flags;
            $restored = array_pop($this->stack);
            // an unbalanced '}' leaves nothing to restore; fall back to the
            // defaults instead of turning the flags into null
            $this->flags = is_array($restored) ? $restored : $this->defaultFlags();

            // on group close, no more fontdefinition will be written to this id
            // this is not really the right way to do it !
            // of course a '}' does not necessarily denote a fonttable end; a fonttable
            // group at least *can* contain sub-groups
            // therefore a stacked approach is heavily needed
            $this->flags["fonttbl_current_write"] = "";
            // no matter what you do, if a group closes, its fonttbl definition is closed too
            $this->flags["fonttbl"] = false;

            if ($this->wantXML) {
                $this->out .= "</group>";
            }
        }
    }

    /**
     * Writes the document start (XML only).
     */
    public function flushHead()
    {
        if ($this->wantXML) {
            $this->out .= "<rtf>";
        }
    }

    /**
     * Writes the document end (XML only).
     */
    public function flushBottom()
    {
        if ($this->wantXML) {
            $this->out .= "</rtf>";
        }
    }

    /**
     * Writes the opening or closing HTML tags for every active font modifier.
     *
     * Closing tags are written in reverse order so that the tags nest
     * properly (<b><i>..</i></b>).
     *
     * @param string $command "start" opens the tags, anything else closes them
     */
    public function checkHtmlSpanContent($command)
    {
        $opening = ($command == "start");
        $modifiers = $opening
            ? $this->fontmodifier_table
            : array_reverse($this->fontmodifier_table, true);

        foreach ($modifiers as $rtf => $html) {
            if (!empty($this->flags[$rtf])) {
                $this->out .= ($opening ? "<" : "</") . $html . ">";
            }
        }
    }

    /**
     * Flushes the text in the queue: either stores it as a font table entry
     * (when a \fcharset was just seen) or writes it to the output stream.
     */
    public function flushQueue()
    {
        if (!strlen((string) $this->queue)) {
            return;
        }

        // processing logic
        $charset = isset($this->flags["fonttbl_want_fcharset"]) ? $this->flags["fonttbl_want_fcharset"] : "";
        if ($this->isNumericId($charset)) {
            // NOTE(review): the entry is keyed by the \fcharset parameter, not by the
            // font id, while the lookup below uses the font id - confirm intent
            $this->fonttable[$charset]["charset"] = $this->queue;
            $this->flags["fonttbl_want_fcharset"] = "";
            $this->queue = "";
        }

        // output logic
        if (!strlen($this->queue)) {
            return;
        }

        /*
            Everything which passes this is (or, at least, *should*) be only
            outputted plaintext. That's why we can safely add the
            css-stylesheet when using wantHTML
        */
        if ($this->wantXML) {
            $this->out .= "<plain>" . $this->queue . "</plain>";
        }

        if ($this->wantHTML) {
            $font = isset($this->flags["fonttbl_current_read"]) ? $this->flags["fonttbl_current_read"] : "";
            // only output html if a valid (for now, just numeric;) fonttable is given
            if ($this->isNumericId($font)) {
                if (!empty($this->flags["beginparagraph"])) {
                    $this->flags["beginparagraph"] = false;
                    $this->out .= "<div align=\"";
                    $alignment = isset($this->flags["alignment"]) ? $this->flags["alignment"] : "";
                    switch ($alignment) {
                        case "right":
                            $this->out .= "right";
                            break;
                        case "center":
                            $this->out .= "center";
                            break;
                        case "left":
                        default:
                            $this->out .= "left";
                    }
                    $this->out .= "\">";
                }

                $fontsize = isset($this->flags["fontsize"]) ? $this->flags["fontsize"] : self::DEFAULT_FONTSIZE;
                $family = isset($this->fonttable[$font]["charset"]) ? $this->fonttable[$font]["charset"] : "";
                $class = "f" . $font . "s" . $fontsize;

                /* define new style for that span */
                // no ';' is added after the family: the stored font table text is
                // presumably the raw entry including its terminating ';' - TODO confirm
                $this->styles[$class] = "font-family:" . $family . " font-size:" . $fontsize . ";";

                /* write span start */
                $this->out .= "<span class=\"" . $class . "\">";

                /* check if the span content has a modifier */
                $this->checkHtmlSpanContent("start");
                /* write span content */
                $this->out .= $this->queue;
                /* close modifiers */
                $this->checkHtmlSpanContent("stop");
                /* close span (the tag used to be a bare string and was never written) */
                $this->out .= "</span>";
            }
        }
        $this->queue = "";
    }

    /**
     * Handles special characters like \'ef (XML output only).
     *
     * @param string $special the two characters following \'
     */
    public function flushSpecial($special)
    {
        if (strlen((string) $special) == 2) {
            if ($this->wantXML) {
                $this->out .= "<special value=\"" . $special . "\"/>";
            }
        }
    }

    /**
     * Outputs the collected errors at the end (XML output only).
     */
    public function flushErrors()
    {
        if (count($this->err) > 0) {
            if ($this->wantXML) {
                $this->out .= "<errors>";
                foreach ($this->err as $value) {
                    $this->out .= "<message>" . $value . "</message>";
                }
                $this->out .= "</errors>";
            }
        }
    }

    /**
     * Builds the <style> block in $outstyles from the styles collected
     * during parsing.
     */
    public function makeStyles()
    {
        $this->outstyles = "<style type=\"text/css\"><!--\n";
        foreach ($this->styles as $stylename => $styleattrib) {
            $this->outstyles .= "." . $stylename . " { " . $styleattrib . " }\n";
        }
        $this->outstyles .= "--></style>\n";
    }

    /*
        finally ..

        How this parser (is supposed) to work:
        ======================================
        This parser simply starts at the beginning of the rtf core stream,
        catches every controlling character {, } and \, automatically builds
        control words and control symbols during its lifetime, and trashes
        every other character into the plain text queue
    */
    public function parse()
    {
        $this->parserInit();

        $i = 0;
        $this->cw = false;      // flag if control word is currently parsed
        $this->cfirst = false;  // first control character ?
        $this->cword = "";      // last or current control word (depends on $this->cw)

        $this->queue = "";      // plain text data found during parsing

        $this->flushHead();

        while ($i < $this->len) {
            $ch = $this->rtf[$i];

            switch ($ch) {
                case "{":
                case "}":
                    // '\{' and '\}' are literal braces, not group delimiters; the
                    // previous character is checked as well because swallowed line
                    // breaks leave $cfirst set
                    if ($this->cfirst && $i > 0 && $this->rtf[$i - 1] === "\\") {
                        $this->queue .= $ch;
                        $this->cfirst = false;
                        $this->cw = false;
                        break;
                    }
                    if ($this->cw) {
                        $this->flushControl();
                        $this->cw = false;
                        $this->cfirst = false;
                    } else {
                        $this->flushQueue();
                    }
                    $this->flushGroup($ch === "{" ? "open" : "close");
                    break;

                case "\\":
                    if ($this->cfirst) {    // catches '\\'
                        // debug marker kept from the original implementation
                        $this->flushComment("true, " . $i);
                        $this->queue .= '\\';
                        $this->cfirst = false;
                        $this->cw = false;
                        break;
                    }
                    if ($this->cw) {
                        $this->flushControl();
                    } else {
                        $this->flushQueue();
                    }
                    $this->cw = true;
                    $this->cfirst = true;
                    $this->cword = "";
                    break;

                default:
                    if ($ch === "\n" || $ch === "\r") {
                        break; // eat line breaks
                    }
                    if ($this->cw) {    // active control word ?
                        /*
                            Watch the RE: there's an optional space at the end
                            which IS part of the control word (but actually it
                            is ignored by flushControl)
                        */
                        if (preg_match('/^[a-zA-Z0-9-]$/D', $ch)) { // continue parsing
                            $this->cword .= $ch;
                            $this->cfirst = false;
                        } else {
                            /*
                                Control word could be a 'control symbol', like
                                \~ or \* etc.
                            */
                            $specialmatch = false;
                            if ($this->cfirst) {
                                if ($ch === '\'') { // expect to get some special chars
                                    $this->flushQueue();
                                    // substr() cannot read past the end of a truncated stream
                                    $this->flushSpecial((string) substr($this->rtf, $i + 1, 2));
                                    $i += 2;
                                    $specialmatch = true;
                                    $this->cw = false;
                                    $this->cfirst = false;
                                    $this->cword = "";
                                } elseif ($ch === '*') {
                                    // '{' and '}' never get here, they are caught above
                                    $this->flushComment("control symbols not yet handled");
                                    $specialmatch = true;
                                }
                                $this->cfirst = false;
                            } else {
                                if ($ch === ' ') {
                                    // space delimits control words, so just
                                    // discard it and flush the control word
                                    $this->cw = false;
                                    $this->flushControl();
                                    break;
                                }
                            }
                            if (!$specialmatch) {
                                $this->flushControl();
                                $this->cw = false;
                                $this->cfirst = false;
                                /*
                                    The current character is a delimiter, but
                                    is NOT part of the control word so we hop
                                    one step back in the stream and process it
                                    again
                                */
                                $i--;
                            }
                        }
                    } else {
                        // <, > and & need translation before putting into the
                        // queue when XML or HTML is wanted
                        if ($this->wantHTML || $this->wantXML) {
                            switch ($ch) {
                                case "<":
                                    $this->queue .= "&lt;";
                                    break;
                                case ">":
                                    $this->queue .= "&gt;";
                                    break;
                                case "&":
                                    $this->queue .= "&amp;";
                                    break;
                                default:
                                    $this->queue .= $ch;
                                    break;
                            }
                        } else {
                            $this->queue .= $ch;
                        }
                    }
            }
            $i++;
        }

        $this->flushQueue();
        $this->flushErrors();
        $this->flushBottom();

        if ($this->wantHTML) {
            $this->makeStyles();
        }
    }
}
?>