📄 worddocument.java

📁 java 读写word excel ppt
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/* ====================================================================   Licensed to the Apache Software Foundation (ASF) under one or more   contributor license agreements.  See the NOTICE file distributed with   this work for additional information regarding copyright ownership.   The ASF licenses this file to You under the Apache License, Version 2.0   (the "License"); you may not use this file except in compliance with   the License.  You may obtain a copy of the License at       http://www.apache.org/licenses/LICENSE-2.0   Unless required by applicable law or agreed to in writing, software   distributed under the License is distributed on an "AS IS" BASIS,   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   See the License for the specific language governing permissions and   limitations under the License.==================================================================== */        package org.apache.poi.hdf.extractor;import org.apache.poi.hdf.extractor.util.*;import org.apache.poi.hdf.extractor.data.*;import java.util.*;import java.io.*;import org.apache.poi.poifs.filesystem.POIFSFileSystem;import org.apache.poi.poifs.filesystem.DocumentEntry;import org.apache.poi.util.LittleEndian;/** * This class contains the main functionality for the Word file "reader". Much * of the code in this class is based on the Word 97 document file format. Only * works for non-complex files * * @author Ryan Ackley */public class WordDocument{  /** byte buffer containing the main Document stream*/  byte[] _header;  /** contains all style information for this document see Word 97 Doc spec*/  StyleSheet _styleSheet;  /** contains All list information for this document*/  ListTables _listTables;  /** contains global Document properties for this document*/  DOP _docProps = new DOP();  int _currentList = -1;  int _tableSize;  int _sectionCounter = 1;  /** fonts available for this document*/  FontTable _fonts;  /** document's text blocks*/  BTreeSet _text = new BTreeSet();  /** document's character runs */  BTreeSet _characterTable = new BTreeSet();  /** document's paragraphs*/  BTreeSet _paragraphTable = new BTreeSet();  /** doucment's sections*/  BTreeSet _sectionTable = new BTreeSet();  /** used for XSL-FO conversion*/  StringBuffer _headerBuffer = new StringBuffer();  /** used for XSL-FO conversion*/  StringBuffer _bodyBuffer = new StringBuffer();  /** used for XSL-FO table conversion*/  StringBuffer _cellBuffer;  /** used for XSL-FO table conversion*/  ArrayList _cells;  /** used for XSL-FO table conversion*/  ArrayList _table;  /** document's header and footer information*/  byte[] _plcfHdd;  /** starting position of text in main document stream*/  int _fcMin;  /** length of main document text stream*/  int _ccpText;  /** length of footnotes text*/  int _ccpFtn;  /** The name of the file to write to */  private static String _outName;  /** OLE stuff*/  private InputStream istream;  /** OLE stuff*/  private POIFSFileSystem filesystem;  //used internally  private static int HEADER_EVEN_INDEX = 0;  private static int HEADER_ODD_INDEX = 1;  private static int FOOTER_EVEN_INDEX = 2;  private static int FOOTER_ODD_INDEX = 3;  private static int HEADER_FIRST_INDEX = 4;  private static int FOOTER_FIRST_INDEX = 5;  /**   *  right now this function takes one parameter: a Word file, and outputs an   *  XSL-FO document at c:\test.xml (this is hardcoded)   */  public static void main(String args[])  {      /*try      {        WordDocument file = new WordDocument(args[0], "r");        Writer out = new BufferedWriter(new FileWriter(args[1]));        file.writeAllText(out);        out.flush();        out.close();      }      catch(Throwable t)      {        t.printStackTrace();      }*/      try      {          _outName = args[1];          WordDocument file = new WordDocument(args[0]);          file.closeDoc();      }      catch(Exception e)      {          e.printStackTrace();      }      System.exit(0);  }  /**   * Spits out the document text   *   * @param out The Writer to write the text to.   * @throws IOException if there is a problem while reading from the file or   *         writing out the text.   */  public void writeAllText(Writer out) throws IOException  {    int textStart = Utils.convertBytesToInt(_header, 0x18);    int textEnd = Utils.convertBytesToInt(_header, 0x1c);    ArrayList textPieces = findProperties(textStart, textEnd, _text.root);    int size = textPieces.size();    for(int x = 0; x < size; x++)    {      TextPiece nextPiece = (TextPiece)textPieces.get(x);      int start = nextPiece.getStart();      int end = nextPiece.getEnd();      boolean unicode = nextPiece.usesUnicode();      int add = 1;      if(unicode)      {        add = 2;        char ch;        for(int y = start; y < end; y += add)        {	  ch = (char)Utils.convertBytesToShort(_header, y);	  out.write(ch);        }      }      else      {	String sText = new String(_header, start, end-start);	out.write(sText);      }    }  }  /**   * Constructs a Word document from fileName. Parses the document and places   * all the important stuff into data structures.   *   * @param fileName The name of the file to read.   * @throws IOException if there is a problem while parsing the document.   */  public WordDocument(String fileName) throws IOException  {  	this(new FileInputStream(fileName));  }    public WordDocument(InputStream inputStream) throws IOException  {        //do Ole stuff        istream = inputStream;        filesystem = new POIFSFileSystem(istream);        //get important stuff from the Header block and parse all the        //data structures        readFIB();        //get the SEPS for the main document text        ArrayList sections = findProperties(_fcMin, _fcMin + _ccpText, _sectionTable.root);        //iterate through sections, paragraphs, and character runs doing what        //you will with the data.        int size = sections.size();        for(int x = 0; x < size; x++)        {          SepxNode node = (SepxNode)sections.get(x);          int start = node.getStart();          int end = node.getEnd();          SEP sep = (SEP)StyleSheet.uncompressProperty(node.getSepx(), new SEP(), _styleSheet);          writeSection(Math.max(_fcMin, start), Math.min(_fcMin + _ccpText, end), sep, _text, _paragraphTable, _characterTable, _styleSheet);        }        //finish        istream.close();  }  /**   * Extracts the main document stream from the POI file then hands off to other   * functions that parse other areas.   *   * @throws IOException   */  private void readFIB() throws IOException  {      //get the main document stream      DocumentEntry headerProps =        (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");      //I call it the header but its also the main document stream      _header = new byte[headerProps.getSize()];      filesystem.createDocumentInputStream("WordDocument").read(_header);      //Get the information we need from the header      int info = LittleEndian.getShort(_header, 0xa);      _fcMin = LittleEndian.getInt(_header, 0x18);      _ccpText = LittleEndian.getInt(_header, 0x4c);      _ccpFtn = LittleEndian.getInt(_header, 0x50);      int charPLC = LittleEndian.getInt(_header, 0xfa);      int charPlcSize = LittleEndian.getInt(_header, 0xfe);      int parPLC = LittleEndian.getInt(_header, 0x102);      int parPlcSize = LittleEndian.getInt(_header, 0x106);      boolean useTable1 = (info & 0x200) != 0;      //process the text and formatting properties      processComplexFile(useTable1, charPLC, charPlcSize, parPLC, parPlcSize);  }  /**   * Extracts the correct Table stream from the POI filesystem then hands off to   * other functions to process text and formatting info. the name is based on   * the fact that in Word 8(97) all text (not character or paragraph formatting)   * is stored in complex format.   *   * @param useTable1 boolean that specifies if we should use table1 or table0   * @param charTable offset in table stream of character property bin table   * @param charPlcSize size of character property bin table   * @param parTable offset in table stream of paragraph property bin table.   * @param parPlcSize size of paragraph property bin table.   * @return boolean indocating success of   * @throws IOException   */  private void processComplexFile(boolean useTable1, int charTable,                                     int charPlcSize, int parTable, int parPlcSize) throws IOException  {      //get the location of the piece table      int complexOffset = LittleEndian.getInt(_header, 0x1a2);      String tablename=null;      DocumentEntry tableEntry = null;      if(useTable1)      {          tablename="1Table";      }      else      {          tablename="0Table";      }      tableEntry = (DocumentEntry)filesystem.getRoot().getEntry(tablename);      //load the table stream into a buffer      int size = tableEntry.getSize();      byte[] tableStream = new byte[size];      filesystem.createDocumentInputStream(tablename).read(tableStream);      //init the DOP for this document      initDocProperties(tableStream);      //load the header/footer raw data for this document      initPclfHdd(tableStream);      //parse out the text locations      findText(tableStream, complexOffset);      //parse out text formatting      findFormatting(tableStream, charTable, charPlcSize, parTable, parPlcSize);  }  /**   * Goes through the piece table and parses out the info regarding the text   * blocks. For Word 97 and greater all text is stored in the "complex" way   * because of unicode.   *   * @param tableStream buffer containing the main table stream.   * @param beginning of the complex data.   * @throws IOException   */  private void findText(byte[] tableStream, int complexOffset) throws IOException  {    //actual text    int pos = complexOffset;    //skips through the prms before we reach the piece table. These contain data    //for actual fast saved files    while(tableStream[pos] == 1)    {        pos++;        int skip = LittleEndian.getShort(tableStream, pos);        pos += 2 + skip;    }    if(tableStream[pos] != 2)    {        throw new IOException("corrupted Word file");    }    else    {        //parse out the text pieces        int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);        pos += 4;        int pieces = (pieceTableSize - 4) / 12;        for (int x = 0; x < pieces; x++)        {            int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x * 8) + 2);            boolean unicode = false;            if ((filePos & 0x40000000) == 0)            {                unicode = true;            }            else            {                unicode = false;                filePos &= ~(0x40000000);//gives me FC in doc stream                filePos /= 2;            }            int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -                            LittleEndian.getInt(tableStream, pos + (x * 4));            TextPiece piece = new TextPiece(filePos, totLength, unicode);            _text.add(piece);        }    }  }  /**   * Does all of the formatting parsing   *   * @param tableStream Main table stream buffer.   * @param charOffset beginning of the character bin table.   * @param chrPlcSize size of the char bin table.   * @param parOffset offset of the paragraph bin table.   * @param size of the paragraph bin table.   */  private void findFormatting(byte[] tableStream, int charOffset,                              int charPlcSize, int parOffset, int parPlcSize) throws IOException  {      openDoc();      createStyleSheet(tableStream);      createListTables(tableStream);      createFontTable(tableStream);      //find character runs      //Get all the chpx info and store it      int arraySize = (charPlcSize - 4)/8;      //first we must go through the bin table and find the fkps      for(int x = 0; x < arraySize; x++)      {          //get page number(has nothing to do with document page)          //containing the chpx for the paragraph          int PN = LittleEndian.getInt(tableStream, charOffset + (4 * (arraySize + 1) + (4 * x)));          byte[] fkp = new byte[512];          System.arraycopy(_header, (PN * 512), fkp, 0, 512);          //take each fkp and get the chpxs          int crun = Utils.convertUnsignedByteToInt(fkp[511]);          for(int y = 0; y < crun; y++)          {              //get the beginning fc of each paragraph text run              int fcStart = LittleEndian.getInt(fkp, y * 4);              int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);              //get the offset in fkp of the papx for this paragraph              int chpxOffset = 2 * Utils.convertUnsignedByteToInt(fkp[((crun + 1) * 4) + y]);              //optimization if offset == 0 use "Normal" style              if(chpxOffset == 0)              {                _characterTable.add(new ChpxNode(fcStart, fcEnd, new byte[0]));                continue;              }              int size = Utils.convertUnsignedByteToInt(fkp[chpxOffset]);              byte[] chpx = new byte[size];              System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);              //_papTable.put(new Integer(fcStart), papx);              _characterTable.add(new ChpxNode(fcStart, fcEnd, chpx));          }      }      //find paragraphs      arraySize = (parPlcSize - 4)/8;      //first we must go through the bin table and find the fkps      for(int x = 0; x < arraySize; x++)      {          int PN = LittleEndian.getInt(tableStream, parOffset + (4 * (arraySize + 1) + (4 * x)));          byte[] fkp = new byte[512];          System.arraycopy(_header, (PN * 512), fkp, 0, 512);          //take each fkp and get the paps          int crun = Utils.convertUnsignedByteToInt(fkp[511]);          for(int y = 0; y < crun; y++)          {              //get the beginning fc of each paragraph text run              int fcStart = LittleEndian.getInt(fkp, y * 4);              int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);              //get the offset in fkp of the papx for this paragraph              int papxOffset = 2 * Utils.convertUnsignedByteToInt(fkp[((crun + 1) * 4) + (y * 13)]);              int size = 2 * Utils.convertUnsignedByteToInt(fkp[papxOffset]);              if(size == 0)              {                  size = 2 * Utils.convertUnsignedByteToInt(fkp[++papxOffset]);              }              else              {                  size--;              }              byte[] papx = new byte[size];              System.arraycopy(fkp, ++papxOffset, papx, 0, size);              _paragraphTable.add(new PapxNode(fcStart, fcEnd, papx));          }      }      //find sections      int fcMin = Utils.convertBytesToInt(_header, 0x18);      int plcfsedFC = Utils.convertBytesToInt(_header, 0xca);      int plcfsedSize = Utils.convertBytesToInt(_header, 0xce);      byte[] plcfsed = new byte[plcfsedSize];      System.arraycopy(tableStream, plcfsedFC, plcfsed, 0, plcfsedSize);      arraySize = (plcfsedSize - 4)/16;      //openDoc();
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -