/** * Copyright (c) 2003-2005, www.pdfbox.org * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of pdfbox; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
 * http://www.pdfbox.org
 *
 */
package org.pdfbox.util;

import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSStream;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;

import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.pdmodel.common.PDStream;

import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.pdfbox.pdmodel.encryption.PDStandardEncryption;

import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;

import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;

import org.apache.log4j.Logger;

/**
 * This class will take a pdf document and strip out all of the text and ignore the
 * formatting and such.
 *
 * Extraction is driven through writeText/getText; the page range and bookmark bounds
 * held in the fields below decide which pages contribute text.
 *
 * @author Ben Litchfield (ben@benlitchfield.com)
 * @version $Revision: 1.62 $
 */
public class PDFTextStripper extends PDFStreamEngine
{
    // Logger used for debug tracing throughout this class.
    private static Logger log = Logger.getLogger(PDFTextStripper.class);

    // Number of the page most recently handed to processPage; writeText resets it to 0
    // and processPage increments it before use, so the first processed page is 1.
    private int currentPageNo = 0;
    // Inclusive bounds compared against currentPageNo in processPage.
    private int startPage = 1;
    private int endPage = Integer.MAX_VALUE;
    // Optional bookmark bounds.  The page numbers stay -1 (no bound) until
    // processPages resolves the corresponding bookmark.
    private PDOutlineItem startBookmark = null;
    private int startBookmarkPageNumber = -1;
    private PDOutlineItem endBookmark = null;
    private int endBookmarkPageNumber = -1;
    // The document handed to writeText.
    private PDDocument document;
    // Enables the duplicate/overlapping text check in showCharacter.
    private boolean suppressDuplicateOverlappingText = true;
    // When false, processPage uses a single article section regardless of thread beads.
    private boolean shouldSeparateByBeads = true;
    // Set through setSortByPosition; sorting is off by default.
    private boolean sortByPosition = false;

    // Thread beads of the page being processed, as returned by page.getThreadBeads().
    private List pageArticles = null;
    /**
     * The charactersByArticle is used to extract text by article divisions.  For example
     * a PDF that has two columns like a newspaper, we want to extract the first column and
     * then the second column.
In this example the PDF would have 2 beads (or articles), one for
     * each column.  The size of the charactersByArticle would be 5, because not all text on the
     * screen will fall into one of the articles.  The five divisions are shown below
     *
     * <ol>
     *   <li>Text before first article</li>
     *   <li>first article text</li>
     *   <li>text between first article and second article</li>
     *   <li>second article text</li>
     *   <li>text after second article</li>
     * </ol>
     *
     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
     */
    protected Vector charactersByArticle = new Vector();

    // Keyed by character string; each value is a List created on first sight of that
    // string in showCharacter.
    // NOTE(review): what the lists hold is not visible in this copy - confirm.
    private Map characterListMapping = new HashMap();

    // Written to the output when a new line of text is started.
    private String lineSeparator = System.getProperty("line.separator");
    // Written to the output once at the end of each text flush.
    private String pageSeparator = System.getProperty("line.separator");
    // Written to the output between two words on the same line.
    private String wordSeparator = " ";

    /**
     * The stream to write the output to.  Assigned by writeText before processing starts.
     */
    protected Writer output;

    /**
     * Instantiate a new PDFTextStripper object.  This object will load properties from
     * Resources/PDFTextStripper.properties and pass them to the PDFStreamEngine constructor.
     *
     * @throws IOException If there is an error loading the properties.
     */
    public PDFTextStripper() throws IOException
    {
        super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
    }

    /**
     * This will return the text of a document.  See writeText.
* NOTE: The document must not be encrypted when coming into this method. * * @param doc The document to get the text from. * * @return The text of the PDF document. * * @throws IOException if the doc state is invalid or it is encrypted. */ public String getText( PDDocument doc ) throws IOException { StringWriter outputStream = new StringWriter(); writeText( doc, outputStream ); return outputStream.toString(); } /** * @deprecated * @see PDFTextStripper#getText( PDDocument ) * @param doc The document to extract the text from. * @return The document text. * @throws IOException If there is an error extracting the text. */ public String getText( COSDocument doc ) throws IOException { return getText( new PDDocument( doc ) ); } /** * @deprecated * @see PDFTextStripper#writeText( PDDocument, Writer ) * @param doc The document to extract the text. * @param outputStream The stream to write the text to. * @throws IOException If there is an error extracting the text. */ public void writeText( COSDocument doc, Writer outputStream ) throws IOException { writeText( new PDDocument( doc ), outputStream ); } /** * This will take a PDDocument and write the text of that document to the print writer. * * @param doc The document to get the data from. * @param outputStream The location to put the text. * * @throws IOException If the doc is in an invalid state. 
*/ public void writeText( PDDocument doc, Writer outputStream ) throws IOException { PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary(); //only care about standard encryption and if it was decrypted with the //user password if( encDictionary instanceof PDStandardEncryption && !doc.wasDecryptedWithOwnerPassword() ) { PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary; if( !stdEncryption.canExtractContent() ) { throw new IOException( "You do not have permission to extract text" ); } } currentPageNo = 0; document = doc; output = outputStream; startDocument(document); if( document.isEncrypted() ) { // We are expecting non-encrypted documents here, but it is common // for users to pass in a document that is encrypted with an empty // password (such a document appears to not be encrypted by // someone viewing the document, thus the confusion). We will // attempt to decrypt with the empty password to handle this case. // log.debug("Document is encrypted, decrypting with empty password"); try { document.decrypt(""); } catch (CryptographyException e) { throw new IOException("Error decrypting document, details: " + e.getMessage()); } catch (InvalidPasswordException e) { throw new IOException("Error: document is encrypted"); } } processPages( document.getDocumentCatalog().getAllPages() ); endDocument(document); } /** * This will process all of the pages and the text that is in them. * * @param pages The pages object in the document. * * @throws IOException If there is an error parsing the text. 
*/ protected void processPages( List pages ) throws IOException { if( log.isDebugEnabled() ) { log.debug( "processPages( " + pages + " )" ); } if( startBookmark != null ) { startBookmarkPageNumber = getPageNumber( startBookmark, pages ); } if( endBookmark != null ) { endBookmarkPageNumber = getPageNumber( endBookmark, pages ); } if( startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject() ) { //this is a special case where both the start and end bookmark //are the same but point to nothing. In this case //we will not extract any text. startBookmarkPageNumber = 0; endBookmarkPageNumber = 0; } Iterator pageIter = pages.iterator(); while( pageIter.hasNext() ) { PDPage nextPage = (PDPage)pageIter.next(); PDStream contentStream = nextPage.getContents(); if( contentStream != null ) { COSStream contents = contentStream.getStream(); processPage( nextPage, contents ); } } if( log.isDebugEnabled() ) { log.debug( "processPages() end" ); } } private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException { int pageNumber = -1; PDPage page = bookmark.findDestinationPage( document ); if( page != null ) { pageNumber = allPages.indexOf( page )+1;//use one based indexing } return pageNumber; } /** * This method is available for subclasses of this class. It will be called before processing * of the document start. * * @param pdf The PDF document that is being processed. * @throws IOException If an IO error occurs. */ protected void startDocument(PDDocument pdf) throws IOException { // no default implementation, but available for subclasses } /** * This method is available for subclasses of this class. It will be called after processing * of the document finishes. * * @param pdf The PDF document that is being processed. * @throws IOException If an IO error occurs. 
*/
    protected void endDocument(PDDocument pdf ) throws IOException
    {
        // no default implementation, but available for subclasses
    }

    /**
     * This will process the contents of a page.  The page counter is advanced on every
     * call; the page itself is only handled when its number lies inside the configured
     * page range and inside any resolved bookmark bounds.
     *
     * @param page The page to process.
     * @param content The contents of the page.
     *
     * @throws IOException If there is an error processing the page.
     */
    protected void processPage( PDPage page, COSStream content ) throws IOException
    {
        long start = System.currentTimeMillis();
        if( log.isDebugEnabled() )
        {
            log.debug( "processPage( " + page + ", " + content + " )" );
        }
        // Pages are numbered from 1: the counter is incremented before it is compared.
        currentPageNo++;
        // A bookmark page number of -1 means that bound is not in effect.
        if( currentPageNo >= startPage && currentPageNo <= endPage &&
            (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
            (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
        {
            startPage( page );
            pageArticles = page.getThreadBeads();
            // One section per bead plus one section before, between and after the beads.
            int numberOfArticleSections = 1 + pageArticles.size() * 2;
            if( !shouldSeparateByBeads )
            {
                numberOfArticleSections = 1;
            }
            int originalSize = charactersByArticle.size();
            charactersByArticle.setSize( numberOfArticleSections );
            // NOTE(review): the statement below appears truncated in this copy of the
            // source; the rest of processPage and the start of the text flushing code
            // that follows are not visible here - confirm against the original file.
            for( int i=0; i"); }
        float endOfLastTextX = -1;
        float startOfNextWordX = -1;
        float lastWordSpacing = -1;
        TextPosition lastProcessedCharacter = null;
        // NOTE(review): this loop header and the start of the line-break condition
        // also appear truncated in this copy - confirm against the original file.
        for( int i=0; i (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
                {
                    if (log.isDebugEnabled())
                    {
                        log.debug("");
                    }
                    // Start a new output line and forget the metrics of the previous one.
                    output.write(lineSeparator);
                    endOfLastTextX = -1;
                    startOfNextWordX = -1;
                    currentY = -1;
                    lastBaselineFontSize = -1;
                }

                if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
                    lastProcessedCharacter != null &&
                    //only bother adding a space if the last character was not a space
                    lastProcessedCharacter.getCharacter() != null &&
                    !lastProcessedCharacter.getCharacter().endsWith( " " ) )
                {
                    if (log.isDebugEnabled())
                    {
                        log.debug("");
                    }
                    output.write( wordSeparator );
                }

                if (log.isDebugEnabled())
                {
                    log.debug("flushText" +
                        " x=" + position.getX() +
                        " y=" + position.getY() +
                        " xScale=" + position.getXScale() +
                        " yScale=" + position.getYScale() +
                        " width=" +
position.getWidth() +
                        " currentY=" + currentY +
                        " endOfLastTextX=" + endOfLastTextX +
                        " startOfNextWordX=" + startOfNextWordX +
                        " fontSize=" + position.getFontSize() +
                        " wordSpacing=" + wordSpacing +
                        " string=\"" + characterValue + "\"");
                }

                // -1 marks "no current line"; adopt the y of this character as the line's y.
                if (currentY == -1)
                {
                    currentY = position.getY();
                }

                // Only characters sitting exactly on the current y update the baseline font size.
                if (currentY == position.getY())
                {
                    lastBaselineFontSize = position.getFontSize();
                }

                // RDD - endX is what PDF considers to be the x coordinate of the
                // end position of the text.  We use it in computing our metrics below.
                //
                endOfLastTextX = position.getX() + position.getWidth();

                if (characterValue != null)
                {
                    output.write(characterValue);
                }
                else
                {
                    log.debug( "Position.getString() is null so not writing anything" );
                }
                lastProcessedCharacter = position;
            }
            endParagraph();
        }

        // RDD - newline at end of flush - required for end of page (so that the top
        // of the next page starts on its own line).
        //
        if( log.isDebugEnabled() )
        {
            log.debug("");
        }
        output.write(pageSeparator);

        output.flush();
    }

    /**
     * Write the string to the output stream.  The string written is the value of
     * text.getCharacter().
     *
     * @param text The text to write to the stream.
     * @throws IOException If there is an error when writing the text.
     */
    protected void writeCharacters( TextPosition text ) throws IOException
    {
        output.write( text.getCharacter() );
    }

    /**
     * This will determine if two floating point numbers are within a specified variance.
     * Both bounds are exclusive.
     *
     * @param first The first number to compare to.
     * @param second The second number to compare to.
     * @param variance The allowed variance.
     *
     * @return true if second is strictly greater than first - variance and strictly
     *         less than first + variance.
     */
    private boolean within( float first, float second, float variance )
    {
        return second > first - variance && second < first + variance;
    }

    /**
     * This will add a character to the list of characters to be printed to
     * the text file.
     *
     * @param text The description of the character to display.
*/
    protected void showCharacter( TextPosition text )
    {
        boolean showCharacter = true;
        if( suppressDuplicateOverlappingText )
        {
            // With suppression on, the character starts out hidden.
            showCharacter = false;
            String textCharacter = text.getCharacter();
            float textX = text.getX();
            float textY = text.getY();
            // Fetch the list kept for this exact character string, creating it on first use.
            List sameTextCharacters = (List)characterListMapping.get( textCharacter );
            if( sameTextCharacters == null )
            {
                sameTextCharacters = new ArrayList();
                characterListMapping.put( textCharacter, sameTextCharacters );
            }

            // RDD - Here we compute the value that represents the end of the rendered
            // text.  This value is used to determine whether subsequent text rendered
            // on the same line overwrites the current text.
            //
            // We subtract any positive padding to handle cases where extreme amounts
            // of padding are applied, then backed off (not sure why this is done, but there
            // are cases where the padding is on the order of 10x the character width, and
            // the TJ just backs up to compensate after each character).  Also, we subtract
            // an amount to allow for kerning (a percentage of the width of the last
            // character).
            //
            boolean suppressCharacter = false;
            // One third of the average width per character of this text.
            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
            // NOTE(review): the loop below and everything up to the trailing comment
            // text appear truncated in this copy of the source - confirm against the
            // original file before relying on this method.
            for( int i=0; i
     * The default is to not sort by position.
*
* A PDF writer is free to emit the characters of a page in any order.  For
     * performance reasons PDFBox leaves the text tokens unsorted by default before
     * processing them.
     *
     * @param newSortByPosition Tell PDFBox to sort the text positions.
     */
    public void setSortByPosition(boolean newSortByPosition)
    {
        this.sortByPosition = newSortByPosition;
    }
}