WordDocument.java
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2003 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache POI" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache POI", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */
package org.apache.poi.hdf.extractor;

import org.apache.poi.hdf.extractor.util.*;
import org.apache.poi.hdf.extractor.data.*;
import java.util.*;
import java.io.*;
import javax.swing.*;
import java.awt.*;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSDocument;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.util.LittleEndian;

/**
 * This class contains the main functionality for the Word file "reader". Much
 * of the code in this class is based on the Word 97 document file format.
 * Only works for non-complex files.
 *
 * @author Ryan Ackley
 */
public class WordDocument
{
  /** byte buffer containing the main Document stream*/
  byte[] _header;
  /** contains all style information for this document; see Word 97 Doc spec*/
  StyleSheet _styleSheet;
  /** contains all list information for this document*/
  ListTables _listTables;
  /** contains global Document properties for this document*/
  DOP _docProps = new DOP();

  int _currentList = -1;
  int _tableSize;
  int _sectionCounter = 1;
  /** fonts available for this document*/
  FontTable _fonts;

  /** document's text blocks*/
  BTreeSet _text = new BTreeSet();
  /** document's character runs*/
  BTreeSet _characterTable = new BTreeSet();
  /** document's paragraphs*/
  BTreeSet _paragraphTable = new BTreeSet();
  /** document's sections*/
  BTreeSet _sectionTable = new BTreeSet();

  /** used for XSL-FO conversion*/
  StringBuffer _headerBuffer = new StringBuffer();
  /** used for XSL-FO conversion*/
  StringBuffer _bodyBuffer = new StringBuffer();
  /** used for XSL-FO table conversion*/
  StringBuffer _cellBuffer;
  /** used for XSL-FO table conversion*/
  ArrayList _cells;
  /** used for XSL-FO table conversion*/
  ArrayList _table;

  /** document's header and footer information*/
  byte[] _plcfHdd;

  /** starting position of text in main document stream*/
  int _fcMin;
  /** length of main document text stream*/
  int _ccpText;
  /** length of footnotes text*/
  int _ccpFtn;

  /** The name of the file to write to*/
  private static String _outName;

  /** OLE stuff*/
  private InputStream istream;
  /** OLE stuff*/
  private POIFSFileSystem filesystem;

  //used internally
  private static int HEADER_EVEN_INDEX = 0;
  private static int HEADER_ODD_INDEX = 1;
  private static int FOOTER_EVEN_INDEX = 2;
  private static int FOOTER_ODD_INDEX = 3;
  private static int HEADER_FIRST_INDEX = 4;
  private static int FOOTER_FIRST_INDEX = 5;

  /**
   * Takes two parameters: the Word file to read (args[0]) and the path to
   * write the resulting XSL-FO document to (args[1]).
   */
  public static void main(String args[])
  {
    /*try
    {
      WordDocument file = new WordDocument(args[0], "r");
      Writer out = new BufferedWriter(new FileWriter(args[1]));
      file.writeAllText(out);
      out.flush();
      out.close();
    }
    catch(Throwable t)
    {
      t.printStackTrace();
    }*/
    try
    {
      _outName = args[1];
      WordDocument file = new WordDocument(args[0]);
      file.closeDoc();
    }
    catch(Exception e)
    {
      e.printStackTrace();
    }
    System.exit(0);
  }

  /**
   * Spits out the document text.
   *
   * @param out The Writer to write the text to.
   * @throws IOException if there is a problem while reading from the file or
   *         writing out the text.
   */
  public void writeAllText(Writer out) throws IOException
  {
    int textStart = Utils.convertBytesToInt(_header, 0x18);
    int textEnd = Utils.convertBytesToInt(_header, 0x1c);
    ArrayList textPieces = findProperties(textStart, textEnd, _text.root);
    int size = textPieces.size();

    for(int x = 0; x < size; x++)
    {
      TextPiece nextPiece = (TextPiece)textPieces.get(x);
      int start = nextPiece.getStart();
      int end = nextPiece.getEnd();
      boolean unicode = nextPiece.usesUnicode();
      int add = 1;

      if(unicode)
      {
        add = 2;
        char ch;
        for(int y = start; y < end; y += add)
        {
          ch = (char)Utils.convertBytesToShort(_header, y);
          out.write(ch);
        }
      }
      else
      {
        String sText = new String(_header, start, end - start);
        out.write(sText);
      }
    }
  }

  /**
   * Constructs a Word document from fileName. Parses the document and places
   * all the important stuff into data structures.
   *
   * @param fileName The name of the file to read.
   * @throws IOException if there is a problem while parsing the document.
   */
  public WordDocument(String fileName) throws IOException
  {
    this(new FileInputStream(fileName));
  }

  public WordDocument(InputStream inputStream) throws IOException
  {
    //do OLE stuff
    istream = inputStream;
    filesystem = new POIFSFileSystem(istream);

    //get important stuff from the Header block and parse all the
    //data structures
    readFIB();

    //get the SEPs for the main document text
    ArrayList sections = findProperties(_fcMin, _fcMin + _ccpText, _sectionTable.root);

    //iterate through sections, paragraphs, and character runs doing what
    //you will with the data.
    int size = sections.size();
    for(int x = 0; x < size; x++)
    {
      SepxNode node = (SepxNode)sections.get(x);
      int start = node.getStart();
      int end = node.getEnd();
      SEP sep = (SEP)StyleSheet.uncompressProperty(node.getSepx(), new SEP(), _styleSheet);
      writeSection(Math.max(_fcMin, start), Math.min(_fcMin + _ccpText, end), sep,
                   _text, _paragraphTable, _characterTable, _styleSheet);
    }

    //finish
    istream.close();
  }

  /**
   * Extracts the main document stream from the POI file, then hands off to other
   * functions that parse other areas.
   *
   * @throws IOException
   */
  private void readFIB() throws IOException
  {
    //get the main document stream
    DocumentEntry headerProps =
      (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");

    //I call it the header but it's also the main document stream
    _header = new byte[headerProps.getSize()];
    filesystem.createDocumentInputStream("WordDocument").read(_header);

    //get the information we need from the header
    int info = LittleEndian.getShort(_header, 0xa);
    _fcMin = LittleEndian.getInt(_header, 0x18);
    _ccpText = LittleEndian.getInt(_header, 0x4c);
    _ccpFtn = LittleEndian.getInt(_header, 0x50);
    int charPLC = LittleEndian.getInt(_header, 0xfa);
    int charPlcSize = LittleEndian.getInt(_header, 0xfe);
    int parPLC = LittleEndian.getInt(_header, 0x102);
    int parPlcSize = LittleEndian.getInt(_header, 0x106);
    boolean useTable1 = (info & 0x200) != 0;

    //process the text and formatting properties
    processComplexFile(useTable1, charPLC, charPlcSize, parPLC, parPlcSize);
  }

  /**
   * Extracts the correct Table stream from the POI filesystem, then hands off to
   * other functions to process text and formatting info. The name is based on
   * the fact that in Word 8 (97) all text (not character or paragraph formatting)
   * is stored in complex format.
   *
   * @param useTable1 boolean that specifies if we should use table1 or table0
   * @param charTable offset in table stream of character property bin table
   * @param charPlcSize size of character property bin table
   * @param parTable offset in table stream of paragraph property bin table
   * @param parPlcSize size of paragraph property bin table
   * @throws IOException
   */
  private void processComplexFile(boolean useTable1, int charTable,
                                  int charPlcSize, int parTable, int parPlcSize)
    throws IOException
  {
    //get the location of the piece table
    int complexOffset = LittleEndian.getInt(_header, 0x1a2);

    String tablename = null;
    DocumentEntry tableEntry = null;
    if(useTable1)
    {
      tablename = "1Table";
    }
    else
    {
      tablename = "0Table";
    }
    tableEntry = (DocumentEntry)filesystem.getRoot().getEntry(tablename);

    //load the table stream into a buffer
    int size = tableEntry.getSize();
    byte[] tableStream = new byte[size];
    filesystem.createDocumentInputStream(tablename).read(tableStream);

    //init the DOP for this document
    initDocProperties(tableStream);
    //load the header/footer raw data for this document
    initPclfHdd(tableStream);
    //parse out the text locations
    findText(tableStream, complexOffset);
    //parse out text formatting
    findFormatting(tableStream, charTable, charPlcSize, parTable, parPlcSize);
  }

  /**
   * Goes through the piece table and parses out the info regarding the text
   * blocks. For Word 97 and greater all text is stored in the "complex" way
   * because of unicode.
   *
   * @param tableStream buffer containing the main table stream.
   * @param complexOffset beginning of the complex data.
   * @throws IOException
   */
  private void findText(byte[] tableStream, int complexOffset) throws IOException
  {
    //actual text
    int pos = complexOffset;

    //skip through the prms before we reach the piece table. These contain data
    //for actual fast-saved files.
    while(tableStream[pos] == 1)
    {
      pos++;
      int skip = LittleEndian.getShort(tableStream, pos);
      pos += 2 + skip;
    }
    if(tableStream[pos] != 2)
    {
      throw new IOException("corrupted Word file");
    }
    else
    {
      //parse out the text pieces
      int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
      pos += 4;
      int pieces = (pieceTableSize - 4) / 12;
      for (int x = 0; x < pieces; x++)
      {
        int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x * 8) + 2);
        boolean unicode = false;
        if ((filePos & 0x40000000) == 0)
        {
          unicode = true;
        }
        else
        {
          unicode = false;
          filePos &= ~(0x40000000); //gives me FC in doc stream
          filePos /= 2;
        }
        int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
                        LittleEndian.getInt(tableStream, pos + (x * 4));

        TextPiece piece = new TextPiece(filePos, totLength, unicode);
        _text.add(piece);
      }
    }
  }

  /**
   * Does all of the formatting parsing.
   *
   * @param tableStream Main table stream buffer.
   * @param charOffset beginning of the character bin table.
   * @param charPlcSize size of the char bin table.
   * @param parOffset offset of the paragraph bin table.
   * @param parPlcSize size of the paragraph bin table.
   */
  private void findFormatting(byte[] tableStream, int charOffset,
                              int charPlcSize, int parOffset, int parPlcSize)
    throws IOException
  {
    openDoc();
    createStyleSheet(tableStream);
    createListTables(tableStream);
    createFontTable(tableStream);

    //find character runs
    //get all the chpx info and store it

    int arraySize = (charPlcSize - 4) / 8;

    //first we must go through the bin table and find the fkps
    for(int x = 0; x < arraySize; x++)
    {
      //get page number (has nothing to do with document page)
      //containing the chpx for the paragraph
      int PN = LittleEndian.getInt(tableStream, charOffset + (4 * (arraySize + 1) + (4 * x)));

      byte[] fkp = new byte[512];
      System.arraycopy(_header, (PN * 512), fkp, 0, 512);

      //take each fkp and get the chpxs
      int crun = Utils.convertUnsignedByteToInt(fkp[511]);
      for(int y = 0; y < crun; y++)
      {
        //get the beginning fc of each paragraph text run
        int fcStart = LittleEndian.getInt(fkp, y * 4);
        int fcEnd = LittleEndian.getInt(fkp, (y + 1) * 4);
        //get the offset in fkp of the papx for this paragraph
        int chpxOffset = 2 * Utils.convertUnsignedByteToInt(fkp[((crun + 1) * 4) + y]);

        //optimization: if offset == 0 use "Normal" style
        if(chpxOffset == 0)
        {
          _characterTable.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
          continue;
        }

        int size = Utils.convertUnsignedByteToInt(fkp[chpxOffset]);

        byte[] chpx = new byte[size];
        System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
        //_papTable.put(new Integer(fcStart), papx);
        _characterTable.add(new ChpxNode(fcStart, fcEnd, chpx));
      }
    }

    //find paragraphs
    arraySize = (parPlcSize - 4) / 8;

    //first we must go through the bin table and find the fkps
    for(int x = 0; x < arraySize; x++)
    {
      int PN = LittleEndian.getInt(tableStream, parOffset + (4 * (arraySize + 1) + (4 * x)));

      byte[] fkp = new byte[512];
      System.arraycopy(_header, (PN * 512), fkp, 0, 512);

      //take each fkp and get the paps
      int crun = Utils.convertUnsignedByteToInt(fkp[511]);
      for(int y = 0; y < crun; y++)
      {
        //get the beginning fc of each paragraph text run
        int fcStart = LittleEndian.getInt(fkp, y * 4);
        int fcEnd = LittleEndian.getInt(fkp, (y + 1) * 4);
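
For reference, a minimal usage sketch of the extractor above. This is an assumption-laden example, not part of the original source: the class name ExtractText and the paths sample.doc and sample.txt are hypothetical placeholders, the org.apache.poi.hdf.extractor classes are assumed to be on the classpath, and the input must be a non-complex Word 97 file as noted in the class javadoc. It relies only on the constructor and writeAllText() shown in the listing.

import org.apache.poi.hdf.extractor.WordDocument;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.Writer;

public class ExtractText
{
  public static void main(String[] args) throws Exception
  {
    //the constructor parses the file: it reads the FIB, the piece table,
    //and the character/paragraph bin tables before returning
    WordDocument doc = new WordDocument("sample.doc");

    //writeAllText() walks the text pieces and writes the raw document text
    Writer out = new BufferedWriter(new FileWriter("sample.txt"));
    doc.writeAllText(out);
    out.flush();
    out.close();
  }
}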