wordextractor.java
来自「EXCEL read and write」· Java 代码 · 共 242 行
JAVA
242 行
/** Licensed to the Apache Software Foundation (ASF) under one or more* contributor license agreements. See the NOTICE file distributed with* this work for additional information regarding copyright ownership.* The ASF licenses this file to You under the Apache License, Version 2.0* (the "License"); you may not use this file except in compliance with* the License. You may obtain a copy of the License at** http://www.apache.org/licenses/LICENSE-2.0** Unless required by applicable law or agreed to in writing, software* distributed under the License is distributed on an "AS IS" BASIS,* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.* See the License for the specific language governing permissions and* limitations under the License.*/package org.apache.poi.hwpf.extractor;import java.io.IOException;import java.io.InputStream;import java.io.FileInputStream;import java.io.UnsupportedEncodingException;import java.util.Iterator;import org.apache.poi.POIOLE2TextExtractor;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.model.TextPiece;import org.apache.poi.hwpf.usermodel.HeaderStories;import org.apache.poi.hwpf.usermodel.Paragraph;import org.apache.poi.hwpf.usermodel.Range;import org.apache.poi.poifs.filesystem.POIFSFileSystem;/** * Class to extract the text from a Word Document. * * You should use either getParagraphText() or getText() unless * you have a strong reason otherwise. * * @author Nick Burch (nick at torchbox dot com) */public class WordExtractor extends POIOLE2TextExtractor { private POIFSFileSystem fs; private HWPFDocument doc; /** * Create a new Word Extractor * @param is InputStream containing the word file */ public WordExtractor(InputStream is) throws IOException { this( HWPFDocument.verifyAndBuildPOIFS(is) ); } /** * Create a new Word Extractor * @param fs POIFSFileSystem containing the word file */ public WordExtractor(POIFSFileSystem fs) throws IOException { this(new HWPFDocument(fs)); this.fs = fs; } /** * Create a new Word Extractor * @param doc The HWPFDocument to extract from */ public WordExtractor(HWPFDocument doc) throws IOException { super(doc); this.doc = doc; } /** * Command line extractor, so people will stop moaning that * they can't just run this. */ public static void main(String[] args) throws IOException { if(args.length == 0) { System.err.println("Use:"); System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>"); System.exit(1); } // Process the first argument as a file FileInputStream fin = new FileInputStream(args[0]); WordExtractor extractor = new WordExtractor(fin); System.out.println(extractor.getText()); } /** * Get the text from the word file, as an array with one String * per paragraph */ public String[] getParagraphText() { String[] ret; // Extract using the model code try { Range r = doc.getRange(); ret = new String[r.numParagraphs()]; for(int i=0; i<ret.length; i++) { Paragraph p = r.getParagraph(i); ret[i] = p.text(); // Fix the line ending if(ret[i].endsWith("\r")) { ret[i] = ret[i] + "\n"; } } } catch(Exception e) { // Something's up with turning the text pieces into paragraphs // Fall back to ripping out the text pieces ret = new String[1]; ret[0] = getTextFromPieces(); } return ret; } /** * Add the header/footer text, if it's not empty */ private void appendHeaderFooter(String text, StringBuffer out) { if(text == null || text.length() == 0) return; text = text.replace('\r', '\n'); if(! text.endsWith("\n")) { out.append(text); out.append('\n'); return; } if(text.endsWith("\n\n")) { out.append(text.substring(0, text.length()-1)); return; } out.append(text); return; } /** * Grab the text from the headers */ public String getHeaderText() { HeaderStories hs = new HeaderStories(doc); StringBuffer ret = new StringBuffer(); if(hs.getFirstHeader() != null) { appendHeaderFooter(hs.getFirstHeader(), ret); } if(hs.getEvenHeader() != null) { appendHeaderFooter(hs.getEvenHeader(), ret); } if(hs.getOddHeader() != null) { appendHeaderFooter(hs.getOddHeader(), ret); } return ret.toString(); } /** * Grab the text from the footers */ public String getFooterText() { HeaderStories hs = new HeaderStories(doc); StringBuffer ret = new StringBuffer(); if(hs.getFirstFooter() != null) { appendHeaderFooter(hs.getFirstFooter(), ret); } if(hs.getEvenFooter() != null) { appendHeaderFooter(hs.getEvenFooter(), ret); } if(hs.getOddFooter() != null) { appendHeaderFooter(hs.getOddFooter(), ret); } return ret.toString(); } /** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too. */ public String getTextFromPieces() { StringBuffer textBuf = new StringBuffer(); Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); while (textPieces.hasNext()) { TextPiece piece = (TextPiece) textPieces.next(); String encoding = "Cp1252"; if (piece.isUnicode()) { encoding = "UTF-16LE"; } try { String text = new String(piece.getRawBytes(), encoding); textBuf.append(text); } catch(UnsupportedEncodingException e) { throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); } } String text = textBuf.toString(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); text = text.replaceAll("\r\r", "\r\n\r\n"); if(text.endsWith("\r")) { text += "\n"; } return text; } /** * Grab the text, based on the paragraphs. Shouldn't include any crud, * but slightly slower than getTextFromPieces(). */ public String getText() { StringBuffer ret = new StringBuffer(); ret.append(getHeaderText()); String[] text = getParagraphText(); for(int i=0; i<text.length; i++) { ret.append(text[i]); } ret.append(getFooterText()); return ret.toString(); } /** * Removes any fields (eg macros, page markers etc) * from the string. */ public static String stripFields(String text) { return Range.stripFields(text); }}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?