📄 rtfparser.java
字号:
/*
* $Id: RtfParser.java 3580 2008-08-06 15:52:00Z howard_s $
*
* Copyright 2007 by Howard Shank (hgshank@yahoo.com)
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the License.
*
* The Original Code is 'iText, a free JAVA-PDF library'.
*
* The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
* the Initial Developer are Copyright (C) 1999-2006 by Bruno Lowagie.
* All Rights Reserved.
* Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
* are Copyright (C) 2000-2006 by Paulo Soares. All Rights Reserved.
*
* Contributor(s): all the names of the contributors are added in the source code
* where applicable.
*
* Alternatively, the contents of this file may be used under the terms of the
* LGPL license (the ?GNU LIBRARY GENERAL PUBLIC LICENSE?), in which case the
* provisions of LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the LGPL
* License and not to allow others to use your version of this file under
* the MPL, indicate your decision by deleting the provisions above and
* replace them with the notice and other provisions required by the LGPL.
* If you do not delete the provisions above, a recipient may use your version
* of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the MPL as stated above or under the terms of the GNU
* Library General Public License as published by the Free Software Foundation;
* either version 2 of the License, or any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
* details.
*
* If you didn't download this code from the following link, you should check if
* you aren't using an obsolete version:
* http://www.lowagie.com/iText/
*/
package com.lowagie.text.rtf.parser;
import java.awt.Color;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.EventListener;
import java.util.Iterator;
import java.util.Stack;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Element;
import com.lowagie.text.List;
import com.lowagie.text.rtf.direct.RtfDirectContent;
import com.lowagie.text.rtf.document.RtfDocument;
import com.lowagie.text.rtf.parser.ctrlwords.RtfCtrlWordData;
import com.lowagie.text.rtf.parser.ctrlwords.RtfCtrlWordListener;
import com.lowagie.text.rtf.parser.ctrlwords.RtfCtrlWordMgr;
import com.lowagie.text.rtf.parser.destinations.RtfDestination;
import com.lowagie.text.rtf.parser.destinations.RtfDestinationMgr;
/**
* The RtfParser allows the importing of RTF documents or
* RTF document fragments. The RTF document or fragment is tokenised,
* font and color definitions corrected and then added to
* the document being written.
*
* @author Mark Hall (Mark.Hall@mail.room3b.eu)
* @author Howard Shank (hgshank@yahoo.com)
* @since 2.0.8
*/
public class RtfParser {
/**
* Debugging flag. It is a compile-time constant and is set to <code>false</code>.
*/
private static final boolean debugParser = false; // DEBUG Files are unlikely to be read by any reader!
// Logging settings.
// NOTE(review): the code that uses these three fields is not visible in this
// part of the file. Presumably they hold the log file name, whether logging is
// enabled, and whether the log file is appended to -- confirm against their users.
private String logFile = null;
private boolean logging = false;
private boolean logAppend = false;
/**
* The iText element to add the RTF document to.
* @since 2.1.3
*/
private Element elem = null;
/**
* The iText document to add the RTF document to.
*/
private Document document = null;
/**
* The RtfDocument to add the RTF document or fragment to.
*/
private RtfDocument rtfDoc = null;
/**
* The RtfCtrlWordMgr that creates and handles the control words (keywords)
* that are implemented.
*/
private RtfCtrlWordMgr rtfKeywordMgr = null;
/**
* The RtfImportMgr to store imported font and color mappings in.
*/
private RtfImportMgr importMgr = null;
/**
* The RtfDestinationMgr object to manage destinations.
*/
private RtfDestinationMgr destinationMgr = null;
/**
* Stack for saving states for groups.
* NOTE(review): this is a raw Stack; presumably it holds RtfParserState objects -- confirm.
*/
private Stack stackState = null;
/**
* The current parser state.
*/
private RtfParserState currentState = null;
/**
* The pushback input stream used to read the input.
*/
private PushbackInputStream pbReader = null;
/**
* Conversion type. Identifies whether we are doing an import or a convert.
* Initialised to TYPE_IMPORT_FULL.
*/
private int conversionType = TYPE_IMPORT_FULL;
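/*
* Bitmapping of the parser state values (32-bit int).
* The top 4 bits (nibble) select the major state, the low 28 bits hold the value:
*
* 0x0xxxxxxx = In Header
* 0x1xxxxxxx = not used by the PARSER_* constants declared here
* 0x2xxxxxxx = In Document
* 0x4xxxxxxx = Other
* 0x8xxxxxxx = Errors
* 0x8FFFFFFF = Unknown state (error nibble with all 28 value bits set)
*/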
/*
* Header state values (major state nibble 0x0).
* Note: the low values 0x00000D and 0x000011 are not assigned to any
* constant in this list.
*/
/**
* Currently the RTF document header is being parsed.
*/
public static final int PARSER_IN_HEADER = (0x0 << 28) | 0x000000;
/**
* Currently the RTF charset is being parsed.
*/
public static final int PARSER_IN_CHARSET = PARSER_IN_HEADER | 0x000001;
/**
* Currently the RTF deffont is being parsed.
*/
public static final int PARSER_IN_DEFFONT = PARSER_IN_HEADER | 0x000002;
/**
* Currently the RTF font table is being parsed.
*/
public static final int PARSER_IN_FONT_TABLE = PARSER_IN_HEADER | 0x000003;
/**
* Currently an RTF font table info element is being parsed.
*/
public static final int PARSER_IN_FONT_TABLE_INFO = PARSER_IN_HEADER | 0x000004;
/**
* Currently the RTF filetbl is being parsed.
*/
public static final int PARSER_IN_FILE_TABLE = PARSER_IN_HEADER | 0x000005;
/**
* Currently the RTF color table is being parsed.
*/
public static final int PARSER_IN_COLOR_TABLE = PARSER_IN_HEADER | 0x000006;
/**
* Currently the RTF stylesheet is being parsed.
*/
public static final int PARSER_IN_STYLESHEET = PARSER_IN_HEADER | 0x000007;
/**
* Currently the RTF list table is being parsed.
*/
public static final int PARSER_IN_LIST_TABLE = PARSER_IN_HEADER | 0x000008;
/**
* Currently the RTF list override table is being parsed.
*/
public static final int PARSER_IN_LISTOVERRIDE_TABLE = PARSER_IN_HEADER | 0x000009;
/**
* Currently the RTF revtbl is being parsed.
*/
public static final int PARSER_IN_REV_TABLE = PARSER_IN_HEADER | 0x00000A;
/**
* Currently the RTF rsidtable is being parsed.
*/
public static final int PARSER_IN_RSID_TABLE = PARSER_IN_HEADER | 0x0000B;
/**
* Currently the RTF generator is being parsed.
*/
public static final int PARSER_IN_GENERATOR = PARSER_IN_HEADER | 0x00000C;
/**
* Currently the RTF paragraph group properties table (Word 2002) is being parsed.
*/
public static final int PARSER_IN_PARAGRAPH_TABLE = PARSER_IN_HEADER | 0x00000E;
/**
* Currently RTF old properties are being parsed.
* NOTE(review): presumably the old character properties (oldcprops control word) -- confirm.
*/
public static final int PARSER_IN_OLDCPROPS = PARSER_IN_HEADER | 0x00000F;
/**
* Currently RTF old properties are being parsed.
* NOTE(review): presumably the old paragraph properties (oldpprops control word) -- confirm.
*/
public static final int PARSER_IN_OLDPPROPS = PARSER_IN_HEADER | 0x000010;
/**
* Currently RTF old properties are being parsed.
* NOTE(review): presumably the old table properties (oldtprops control word) -- confirm.
*/
public static final int PARSER_IN_OLDTPROPS = PARSER_IN_HEADER | 0x000012;
/**
* Currently RTF old properties are being parsed.
* NOTE(review): presumably the old section properties (oldsprops control word) -- confirm.
*/
public static final int PARSER_IN_OLDSPROPS = PARSER_IN_HEADER | 0x000013;
/**
* Currently the RTF user protection information is being parsed.
*/
public static final int PARSER_IN_PROT_USER_TABLE = PARSER_IN_HEADER | 0x000014;
/**
* Currently the latent style and formatting usage restrictions are being parsed.
*/
public static final int PARSER_IN_LATENTSTYLES = PARSER_IN_HEADER | 0x000015;
/**
* Header state with value 0x000016.
* NOTE(review): not documented in the original; judging by the name this marks
* the paragraph group properties being parsed -- confirm.
*/
public static final int PARSER_IN_PARAGRAPH_GROUP_PROPERTIES =PARSER_IN_HEADER | 0x000016;
/*
* Document state values (major state nibble 0x2).
*/
/**
* Currently the RTF document content is being parsed.
*/
public static final int PARSER_IN_DOCUMENT = (0x2 << 28 ) | 0x000000;
/**
* Currently the RTF info group is being parsed.
*/
public static final int PARSER_IN_INFO_GROUP = PARSER_IN_DOCUMENT | 0x000001;
/**
* Document state with value 0x000002.
* NOTE(review): not documented in the original; judging by the name this marks
* the upr control word being parsed -- confirm.
*/
public static final int PARSER_IN_UPR = PARSER_IN_DOCUMENT | 0x000002;
/**
* Currently a shppict control word is being parsed.
*/
public static final int PARSER_IN_SHPPICT = PARSER_IN_DOCUMENT | 0x000010; //16
/**
* Currently a pict control word is being parsed.
*/
public static final int PARSER_IN_PICT = PARSER_IN_DOCUMENT | 0x000011; //17
/**
* Currently a picprop control word is being parsed.
*/
public static final int PARSER_IN_PICPROP = PARSER_IN_DOCUMENT | 0x000012; //18
/**
* Currently a blipuid control word is being parsed.
*/
public static final int PARSER_IN_BLIPUID = PARSER_IN_DOCUMENT | 0x000013; //19
/* Other states (major state nibble 0x4). */
/**
* The parser is at the beginning or the end of the file.
*/
public static final int PARSER_STARTSTOP = (0x4 << 28)| 0x0001;
/* Error states (major state nibble 0x8). */
/**
* Currently the parser is in an error state.
* The error nibble is the sign bit of the int, so this value and every
* value derived from it with a bitwise OR is negative.
*/
public static final int PARSER_ERROR = (0x8 << 28) | 0x0000;
/**
* The parser reached the end of the file.
*/
public static final int PARSER_ERROR_EOF = PARSER_ERROR | 0x0001;
/**
* Currently the parser is in an unknown state.
* The value is PARSER_ERROR with all 28 value bits set (0x8FFFFFFF).
*/
public static final int PARSER_IN_UNKNOWN = PARSER_ERROR | 0x0FFFFFFF;
/**
* Conversion type is unknown.
*/
public static final int TYPE_UNIDENTIFIED = -1;
/**
* Conversion type is an import of a full document. Uses direct content to add everything.
* This is what the original import does.
*/
public static final int TYPE_IMPORT_FULL = 0;
/**
* Conversion type is an import of a partial file/fragment. Uses direct content to add everything.
*/
public static final int TYPE_IMPORT_FRAGMENT = 1;
/**
* Conversion type is a conversion. This uses the document (not rtfDoc) to add
* all the elements, so the kind of document produced depends on the writer used.
*/
public static final int TYPE_CONVERT = 2;
/**
* Conversion type to import a document into an element, e.g. Chapter, Section, Table Cell, etc.
* @since 2.1.4
*/
public static final int TYPE_IMPORT_INTO_ELEMENT = 3;
/**
* Destination is normal. Text is processed.
*/
public static final int DESTINATION_NORMAL = 0;
/**
* Destination is skipping. Text is ignored.
*/
public static final int DESTINATION_SKIP = 1;
//////////////////////////////////// TOKENISE VARIABLES ///////////////////
/*
* State flags use a 4/28 bitmask.
* The first 4 bits (nibble) indicate the major state; used for unknown and error.
* The last 28 bits indicate the value.
*/
/**
* The RtfTokeniser is in its ground state. Any token may follow.
*/
public static final int TOKENISER_NORMAL = 0x00000000;
/**
* The RtfTokeniser is currently skipping bytes.
* The number of bytes to skip is kept in binSkipByteCount.
*/
public static final int TOKENISER_SKIP_BYTES = 0x00000001;
/**
* The RtfTokeniser is currently skipping to the next group.
* The group to return to is kept in skipGroupLevel.
*/
public static final int TOKENISER_SKIP_GROUP = 0x00000002;
/**
* The RtfTokeniser is currently reading a binary stream.
*/
public static final int TOKENISER_BINARY= 0x00000003;
/**
* The RtfTokeniser is currently reading hex data.
*/
public static final int TOKENISER_HEX= 0x00000004;
/**
* The RtfTokeniser ignores the result.
*/
public static final int TOKENISER_IGNORE_RESULT= 0x00000005;
/**
* The RtfTokeniser is currently in an error state.
* Only the most significant bit is set.
*/
public static final int TOKENISER_STATE_IN_ERROR = 0x80000000; // 1000 0000 0000 0000 0000 0000 0000 0000
/**
* The RtfTokeniser is currently in an unknown state.
* The value sets the top 8 bits, not only the top nibble, so it also
* contains the TOKENISER_STATE_IN_ERROR bit.
* The misspelling "UNKOWN" is part of the public constant name and is kept as is.
*/
public static final int TOKENISER_STATE_IN_UNKOWN = 0xFF000000; // 1111 1111 0000 0000 0000 0000 0000 0000
/**
* The current group nesting level.
*/
private int groupLevel = 0;
/**
* The current document group nesting level. Used for fragments.
*/
private int docGroupLevel = 0;
/**
* Byte count used when the tokeniser is in binary state.
* NOTE(review): presumably the number of binary bytes still to be read -- confirm.
*/
private long binByteCount = 0;
/**
* When the tokeniser is set to skip bytes, binSkipByteCount is the number of bytes to skip.
*/
private long binSkipByteCount = 0;
/**
* When the tokeniser is set to skip to the next group, this is the group identifier to return to.
*/
private int skipGroupLevel = 0;
// RTF parser error codes: errOK is 0, every failure code is a distinct negative value.
public static final int errOK =0; // Everything's fine!
public static final int errStackUnderflow = -1; // Unmatched '}'
public static final int errStackOverflow = -2; // Too many '{' -- memory exhausted
public static final int errUnmatchedBrace = -3; // RTF ended during an open group.
public static final int errInvalidHex = -4; // Invalid hex character found in data
public static final int errBadTable = -5; // RTF table (sym or prop) invalid
public static final int errAssertion = -6; // Assertion failure
public static final int errEndOfFile = -7; // End of file reached while reading RTF
public static final int errCtrlWordNotFound = -8; // Control word was not found
//////////////////////////////////// END TOKENISE VARIABLES ///////////////////
//////////////////////////////////// STATS VARIABLES ///////////////////
/**
* Total bytes read.
* NOTE(review): the code that updates this counter is not visible in this part of the file.
*/
private long byteCount = 0;
/**
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -