WordDocument.java
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2003 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache POI" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache POI", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */
package org.apache.poi.hdf.extractor;

import org.apache.poi.hdf.extractor.util.*;
import org.apache.poi.hdf.extractor.data.*;
import java.util.*;
import java.io.*;
import javax.swing.*;
import java.awt.*;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSDocument;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.util.LittleEndian;

/**
 * This class contains the main functionality for the Word file "reader". Much
 * of the code in this class is based on the Word 97 document file format.
 * Only works for non-complex files.
 *
 * @author Ryan Ackley
 */
public class WordDocument
{
  /** byte buffer containing the main Document stream*/
  byte[] _header;
  /** contains all style information for this document; see Word 97 Doc spec*/
  StyleSheet _styleSheet;
  /** contains all list information for this document*/
  ListTables _listTables;
  /** contains global Document properties for this document*/
  DOP _docProps = new DOP();

  int _currentList = -1;
  int _tableSize;
  int _sectionCounter = 1;
  /** fonts available for this document*/
  FontTable _fonts;

  /** document's text blocks*/
  BTreeSet _text = new BTreeSet();
  /** document's character runs*/
  BTreeSet _characterTable = new BTreeSet();
  /** document's paragraphs*/
  BTreeSet _paragraphTable = new BTreeSet();
  /** document's sections*/
  BTreeSet _sectionTable = new BTreeSet();

  /** used for XSL-FO conversion*/
  StringBuffer _headerBuffer = new StringBuffer();
  /** used for XSL-FO conversion*/
  StringBuffer _bodyBuffer = new StringBuffer();
  /** used for XSL-FO table conversion*/
  StringBuffer _cellBuffer;
  /** used for XSL-FO table conversion*/
  ArrayList _cells;
  /** used for XSL-FO table conversion*/
  ArrayList _table;

  /** document's header and footer information*/
  byte[] _plcfHdd;

  /** starting position of text in main document stream*/
  int _fcMin;
  /** length of main document text stream*/
  int _ccpText;
  /** length of footnotes text*/
  int _ccpFtn;

  /** The name of the file to write to*/
  private static String _outName;

  /** OLE stuff*/
  private InputStream istream;
  /** OLE stuff*/
  private POIFSFileSystem filesystem;

  //used internally
  private static int HEADER_EVEN_INDEX = 0;
  private static int HEADER_ODD_INDEX = 1;
  private static int FOOTER_EVEN_INDEX = 2;
  private static int FOOTER_ODD_INDEX = 3;
  private static int HEADER_FIRST_INDEX = 4;
  private static int FOOTER_FIRST_INDEX = 5;

  /**
   * Takes two parameters: the Word file to read (args[0]) and the path to
   * write the resulting XSL-FO document to (args[1]).
   */
  public static void main(String args[])
  {
    /*try
    {
      WordDocument file = new WordDocument(args[0], "r");
      Writer out = new BufferedWriter(new FileWriter(args[1]));
      file.writeAllText(out);
      out.flush();
      out.close();
    }
    catch(Throwable t)
    {
      t.printStackTrace();
    }*/
    try
    {
      _outName = args[1];
      WordDocument file = new WordDocument(args[0]);
      file.closeDoc();
    }
    catch(Exception e)
    {
      e.printStackTrace();
    }
    System.exit(0);
  }

  /**
   * Spits out the document text.
   *
   * @param out The Writer to write the text to.
   * @throws IOException if there is a problem while reading from the file or
   *         writing out the text.
   */
  public void writeAllText(Writer out) throws IOException
  {
    int textStart = Utils.convertBytesToInt(_header, 0x18);
    int textEnd = Utils.convertBytesToInt(_header, 0x1c);
    ArrayList textPieces = findProperties(textStart, textEnd, _text.root);
    int size = textPieces.size();

    for(int x = 0; x < size; x++)
    {
      TextPiece nextPiece = (TextPiece)textPieces.get(x);
      int start = nextPiece.getStart();
      int end = nextPiece.getEnd();
      boolean unicode = nextPiece.usesUnicode();
      int add = 1;

      if(unicode)
      {
        add = 2;
        char ch;
        for(int y = start; y < end; y += add)
        {
          ch = (char)Utils.convertBytesToShort(_header, y);
          out.write(ch);
        }
      }
      else
      {
        String sText = new String(_header, start, end - start);
        out.write(sText);
      }
    }
  }

  /**
   * Constructs a Word document from fileName. Parses the document and places
   * all the important stuff into data structures.
   *
   * @param fileName The name of the file to read.
   * @throws IOException if there is a problem while parsing the document.
   */
  public WordDocument(String fileName) throws IOException
  {
    this(new FileInputStream(fileName));
  }

  public WordDocument(InputStream inputStream) throws IOException
  {
    //do OLE stuff
    istream = inputStream;
    filesystem = new POIFSFileSystem(istream);

    //get important stuff from the Header block and parse all the
    //data structures
    readFIB();

    //get the SEPs for the main document text
    ArrayList sections = findProperties(_fcMin, _fcMin + _ccpText, _sectionTable.root);

    //iterate through sections, paragraphs, and character runs doing what
    //you will with the data.
    int size = sections.size();
    for(int x = 0; x < size; x++)
    {
      SepxNode node = (SepxNode)sections.get(x);
      int start = node.getStart();
      int end = node.getEnd();
      SEP sep = (SEP)StyleSheet.uncompressProperty(node.getSepx(), new SEP(), _styleSheet);
      writeSection(Math.max(_fcMin, start), Math.min(_fcMin + _ccpText, end), sep,
                   _text, _paragraphTable, _characterTable, _styleSheet);
    }

    //finish
    istream.close();
  }

  /**
   * Extracts the main document stream from the POI file, then hands off to other
   * functions that parse other areas.
   *
   * @throws IOException
   */
  private void readFIB() throws IOException
  {
    //get the main document stream
    DocumentEntry headerProps =
      (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");

    //I call it the header but it's also the main document stream
    _header = new byte[headerProps.getSize()];
    filesystem.createDocumentInputStream("WordDocument").read(_header);

    //get the information we need from the header
    int info = LittleEndian.getShort(_header, 0xa);
    _fcMin = LittleEndian.getInt(_header, 0x18);
    _ccpText = LittleEndian.getInt(_header, 0x4c);
    _ccpFtn = LittleEndian.getInt(_header, 0x50);
    int charPLC = LittleEndian.getInt(_header, 0xfa);
    int charPlcSize = LittleEndian.getInt(_header, 0xfe);
    int parPLC = LittleEndian.getInt(_header, 0x102);
    int parPlcSize = LittleEndian.getInt(_header, 0x106);
    boolean useTable1 = (info & 0x200) != 0;

    //process the text and formatting properties
    processComplexFile(useTable1, charPLC, charPlcSize, parPLC, parPlcSize);
  }

  /**
   * Extracts the correct Table stream from the POI filesystem, then hands off to
   * other functions to process text and formatting info. The name is based on
   * the fact that in Word 8 (97) all text (not character or paragraph formatting)
   * is stored in complex format.
   *
   * @param useTable1 boolean that specifies if we should use table1 or table0
   * @param charTable offset in table stream of character property bin table
   * @param charPlcSize size of character property bin table
   * @param parTable offset in table stream of paragraph property bin table
   * @param parPlcSize size of paragraph property bin table
   * @throws IOException
   */
  private void processComplexFile(boolean useTable1, int charTable,
                                  int charPlcSize, int parTable, int parPlcSize)
    throws IOException
  {
    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(_header, 0x1a2);

    String tablename = null;
    DocumentEntry tableEntry = null;
    if(useTable1)
    {
      tablename = "1Table";
    }
    else
    {
      tablename = "0Table";
    }
    tableEntry = (DocumentEntry)filesystem.getRoot().getEntry(tablename);

    //load the table stream into a buffer
    int size = tableEntry.getSize();
    byte[] tableStream = new byte[size];
    filesystem.createDocumentInputStream(tablename).read(tableStream);

    //init the DOP for this document
    initDocProperties(tableStream);
    //load the header/footer raw data for this document
    initPclfHdd(tableStream);
    //parse out the text locations
    findText(tableStream, complexOffset);
    //parse out text formatting
    findFormatting(tableStream, charTable, charPlcSize, parTable, parPlcSize);
  }

  /**
   * Goes through the piece table and parses out the info regarding the text
   * blocks. For Word 97 and greater all text is stored in the "complex" way
   * because of unicode.
   *
   * @param tableStream buffer containing the main table stream.
   * @param complexOffset beginning of the complex data.
   * @throws IOException
   */
  private void findText(byte[] tableStream, int complexOffset) throws IOException
  {
    //actual text
    int pos = complexOffset;

    //skip through the prms before we reach the piece table. These contain data
    //for actual fast-saved files.
    while(tableStream[pos] == 1)
    {
      pos++;
      int skip = LittleEndian.getShort(tableStream, pos);
      pos += 2 + skip;
    }
    if(tableStream[pos] != 2)
    {
      throw new IOException("corrupted Word file");
    }
    else
    {
      //parse out the text pieces
      int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
      pos += 4;
      int pieces = (pieceTableSize - 4) / 12;
      for (int x = 0; x < pieces; x++)
      {
        int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x * 8) + 2);
        boolean unicode = false;
        if ((filePos & 0x40000000) == 0)
        {
          unicode = true;
        }
        else
        {
          unicode = false;
          filePos &= ~(0x40000000); //gives me FC in doc stream
          filePos /= 2;
        }
        int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
                        LittleEndian.getInt(tableStream, pos + (x * 4));

        TextPiece piece = new TextPiece(filePos, totLength, unicode);
        _text.add(piece);
      }
    }
  }

  /**
   * Does all of the formatting parsing.
   *
   * @param tableStream Main table stream buffer.
   * @param charOffset beginning of the character bin table.
   * @param charPlcSize size of the char bin table.
   * @param parOffset offset of the paragraph bin table.
   * @param parPlcSize size of the paragraph bin table.
   */
  private void findFormatting(byte[] tableStream, int charOffset,
                              int charPlcSize, int parOffset, int parPlcSize)
    throws IOException
  {
    openDoc();
    createStyleSheet(tableStream);
    createListTables(tableStream);
    createFontTable(tableStream);

    //find character runs
    //get all the chpx info and store it

    int arraySize = (charPlcSize - 4) / 8;

    //first we must go through the bin table and find the fkps
    for(int x = 0; x < arraySize; x++)
    {
      //get page number (has nothing to do with document page)
      //containing the chpx for the paragraph
      int PN = LittleEndian.getInt(tableStream, charOffset + (4 * (arraySize + 1) + (4 * x)));

      byte[] fkp = new byte[512];
      System.arraycopy(_header, (PN * 512), fkp, 0, 512);

      //take each fkp and get the chpxs
      int crun = Utils.convertUnsignedByteToInt(fkp[511]);
      for(int y = 0; y < crun; y++)
      {
        //get the beginning fc of each paragraph text run
        int fcStart = LittleEndian.getInt(fkp, y * 4);
        int fcEnd = LittleEndian.getInt(fkp, (y + 1) * 4);
        //get the offset in fkp of the papx for this paragraph
        int chpxOffset = 2 * Utils.convertUnsignedByteToInt(fkp[((crun + 1) * 4) + y]);

        //optimization: if offset == 0 use "Normal" style
        if(chpxOffset == 0)
        {
          _characterTable.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
          continue;
        }

        int size = Utils.convertUnsignedByteToInt(fkp[chpxOffset]);

        byte[] chpx = new byte[size];
        System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
        //_papTable.put(new Integer(fcStart), papx);
        _characterTable.add(new ChpxNode(fcStart, fcEnd, chpx));
      }
    }

    //find paragraphs
    arraySize = (parPlcSize - 4) / 8;

    //first we must go through the bin table and find the fkps
    for(int x = 0; x < arraySize; x++)
    {
      int PN = LittleEndian.getInt(tableStream, parOffset + (4 * (arraySize + 1) + (4 * x)));

      byte[] fkp = new byte[512];
      System.arraycopy(_header, (PN * 512), fkp, 0, 512);

      //take each fkp and get the paps
      int crun = Utils.convertUnsignedByteToInt(fkp[511]);
      for(int y = 0; y < crun; y++)
      {
        //get the beginning fc of each paragraph text run
        int fcStart = LittleEndian.getInt(fkp, y * 4);
        int fcEnd = LittleEndian.getInt(fkp, (y + 1) * 4);
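
For reference, a minimal usage sketch of the extractor above. This is an assumption-laden example, not part of the original source: the class name ExtractText and the paths sample.doc and sample.txt are hypothetical placeholders, the org.apache.poi.hdf.extractor classes are assumed to be on the classpath, and the input must be a non-complex Word 97 file as noted in the class javadoc. It relies only on the constructor and writeAllText() shown in the listing.

import org.apache.poi.hdf.extractor.WordDocument;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.Writer;

public class ExtractText
{
  public static void main(String[] args) throws Exception
  {
    //the constructor parses the file: it reads the FIB, the piece table,
    //and the character/paragraph bin tables before returning
    WordDocument doc = new WordDocument("sample.doc");

    //writeAllText() walks the text pieces and writes the raw document text
    Writer out = new BufferedWriter(new FileWriter("sample.txt"));
    doc.writeAllText(out);
    out.flush();
    out.close();
  }
}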