/**
* Copyright (c) 2003-2005, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.util;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSStream;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.pdmodel.common.PDStream;
import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.apache.log4j.Logger;
/**
* This class takes a PDF document and extracts its text, ignoring the
* formatting of the document.
*
* Extraction can be limited to a range of pages (startPage/endPage) and to a range
* between two outline items (startBookmark/endBookmark); both limits are evaluated
* per page while the pages are processed.
*
* @author Ben Litchfield (ben@benlitchfield.com)
* @version $Revision: 1.62 $
*/
public class PDFTextStripper extends PDFStreamEngine
{
/** Logger used for the debug output of this class. */
private static Logger log = Logger.getLogger(PDFTextStripper.class);
/** Running page counter; reset to 0 by writeText and incremented by processPage. */
private int currentPageNo = 0;
/** First page to extract; compared against currentPageNo as an inclusive lower bound. */
private int startPage = 1;
/** Last page to extract; compared against currentPageNo as an inclusive upper bound. */
private int endPage = Integer.MAX_VALUE;
/** Optional outline item that marks where extraction starts; null means no such limit. */
private PDOutlineItem startBookmark = null;
/** Page number resolved for startBookmark in processPages; -1 means the bound is not applied. */
private int startBookmarkPageNumber = -1;
/** Optional outline item that marks where extraction ends; null means no such limit. */
private PDOutlineItem endBookmark = null;
/** Page number resolved for endBookmark in processPages; -1 means the bound is not applied. */
private int endBookmarkPageNumber = -1;
/** The document currently being processed; assigned by writeText. */
private PDDocument document;
/** When true, showCharacter runs its check for duplicate overlapping characters. */
private boolean suppressDuplicateOverlappingText = true;
/** When false, processPage uses a single article section instead of one per bead division. */
private boolean shouldSeparateByBeads = true;
/** Flag stored by setSortByPosition; its use is not visible in this part of the file. */
private boolean sortByPosition = false;
/** The thread beads of the page being processed, as returned by PDPage.getThreadBeads(). */
private List pageArticles = null;
/**
* The charactersByArticle is used to extract text by article divisions. For example,
* for a PDF that has two columns like a newspaper, we want to extract the first column and
* then the second column. In this example the PDF would have 2 beads (or articles), one for
* each column. The size of the charactersByArticle would be 5, because not all text on the
* screen will fall into one of the articles. The five divisions are shown below
*
* Text before first article
* first article text
* text between first article and second article
* second article text
* text after second article
*
* Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
*/
protected Vector charactersByArticle = new Vector();
/**
* Lookup used by showCharacter, keyed by the character string of a text position.
* Each value is a List; what is stored in those lists is not visible in this part
* of the file.
*/
private Map characterListMapping = new HashMap();
/** Written to the output when a new line is started; defaults to the platform line separator. */
private String lineSeparator = System.getProperty("line.separator");
/** Written to the output at the end of a flush; defaults to the platform line separator. */
private String pageSeparator = System.getProperty("line.separator");
/** Written to the output between words; defaults to a single space. */
private String wordSeparator = " ";
/**
* The stream to write the output to. It is assigned by writeText.
*/
protected Writer output;
/**
* Instantiate a new PDFTextStripper object. This object will load properties from
* Resources/PDFTextStripper.properties.
* @throws IOException If there is an error loading the properties.
*/
public PDFTextStripper() throws IOException
{
super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
}
/**
 * Extracts the text of a document and returns it as a single string. The text is
 * produced by writeText( PDDocument, Writer ) into an in-memory buffer; see that
 * method for the handling of encrypted documents and extraction permissions.
 *
 * @param doc The document to get the text from.
 *
 * @return The text of the PDF document.
 *
 * @throws IOException If writeText fails, for example because the document cannot
 * be decrypted or its text may not be extracted.
 */
public String getText( PDDocument doc ) throws IOException
{
    final Writer buffer = new StringWriter();
    writeText( doc, buffer );
    return buffer.toString();
}
/**
 * Extracts the text of a low level document by wrapping it in a PDDocument.
 *
 * @deprecated Use getText( PDDocument ) instead.
 * @see PDFTextStripper#getText( PDDocument )
 * @param doc The document to extract the text from.
 * @return The document text.
 * @throws IOException If there is an error extracting the text.
 */
public String getText( COSDocument doc ) throws IOException
{
    PDDocument wrapped = new PDDocument( doc );
    return getText( wrapped );
}
/**
 * Writes the text of a low level document by wrapping it in a PDDocument.
 *
 * @deprecated Use writeText( PDDocument, Writer ) instead.
 * @see PDFTextStripper#writeText( PDDocument, Writer )
 * @param doc The document to extract the text.
 * @param outputStream The stream to write the text to.
 * @throws IOException If there is an error extracting the text.
 */
public void writeText( COSDocument doc, Writer outputStream ) throws IOException
{
    PDDocument wrapped = new PDDocument( doc );
    writeText( wrapped, outputStream );
}
/**
 * This will take a PDDocument and write the text of that document to the given writer.
 *
 * The method first checks the extraction permission: when the document uses standard
 * encryption and was not decrypted with the owner password, extraction is refused
 * unless the encryption dictionary allows content extraction. It then resets the page
 * counter, calls startDocument, decrypts the document with an empty password if it is
 * still encrypted, processes all pages and finally calls endDocument.
 *
 * @param doc The document to get the data from.
 * @param outputStream The location to put the text.
 *
 * @throws IOException If text extraction is not permitted, if the document cannot be
 * decrypted with the empty password, or if processing the pages fails. When decryption
 * fails, the original exception is available through getCause().
 */
public void writeText( PDDocument doc, Writer outputStream ) throws IOException
{
    PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary();
    //only care about standard encryption and if it was decrypted with the
    //user password
    if( encDictionary instanceof PDStandardEncryption &&
        !doc.wasDecryptedWithOwnerPassword() )
    {
        PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary;
        if( !stdEncryption.canExtractContent() )
        {
            throw new IOException( "You do not have permission to extract text" );
        }
    }
    currentPageNo = 0;
    document = doc;
    output = outputStream;
    startDocument(document);
    if( document.isEncrypted() )
    {
        // We are expecting non-encrypted documents here, but it is common
        // for users to pass in a document that is encrypted with an empty
        // password (such a document appears to not be encrypted by
        // someone viewing the document, thus the confusion). We will
        // attempt to decrypt with the empty password to handle this case.
        //
        log.debug("Document is encrypted, decrypting with empty password");
        try
        {
            document.decrypt("");
        }
        catch (CryptographyException e)
        {
            throw chainedIOException( "Error decrypting document, details: " + e.getMessage(), e );
        }
        catch (InvalidPasswordException e)
        {
            throw chainedIOException( "Error: document is encrypted", e );
        }
    }
    processPages( document.getDocumentCatalog().getAllPages() );
    endDocument(document);
}

/**
 * Builds an IOException that keeps the original failure as its cause, so the stack
 * trace of the underlying problem is not lost. initCause is used instead of a
 * cause-taking constructor to stay compatible with older Java versions.
 *
 * @param message The message of the new exception.
 * @param cause The exception that triggered the failure.
 * @return The new exception, ready to be thrown.
 */
private static IOException chainedIOException( String message, Throwable cause )
{
    IOException wrapped = new IOException( message );
    wrapped.initCause( cause );
    return wrapped;
}
/**
 * This will process all of the pages and the text that is in them.
 *
 * Before the pages are visited, the page numbers of the start and end bookmark are
 * resolved (one based). Each page that has a content stream is handed to processPage;
 * pages without a content stream produce no output but are still counted, so that
 * the page counter stays aligned with the position of the page in the list.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages( List pages ) throws IOException
{
    if( log.isDebugEnabled() )
    {
        log.debug( "processPages( " + pages + " )" );
    }
    if( startBookmark != null )
    {
        startBookmarkPageNumber = getPageNumber( startBookmark, pages );
    }
    if( endBookmark != null )
    {
        endBookmarkPageNumber = getPageNumber( endBookmark, pages );
    }
    if( startBookmarkPageNumber == -1 && startBookmark != null &&
        endBookmarkPageNumber == -1 && endBookmark != null &&
        startBookmark.getCOSObject() == endBookmark.getCOSObject() )
    {
        //this is a special case where both the start and end bookmark
        //are the same but point to nothing. In this case
        //we will not extract any text.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }
    Iterator pageIter = pages.iterator();
    while( pageIter.hasNext() )
    {
        PDPage nextPage = (PDPage)pageIter.next();
        PDStream contentStream = nextPage.getContents();
        if( contentStream != null )
        {
            COSStream contents = contentStream.getStream();
            processPage( nextPage, contents );
        }
        else
        {
            // processPage() is what advances currentPageNo, and it is not called for
            // a page without content. Count the page here, otherwise every following
            // page would be compared against startPage/endPage and the bookmark page
            // numbers (which are based on the position in the page list) with a
            // number that is too small.
            currentPageNo++;
        }
    }
    if( log.isDebugEnabled() )
    {
        log.debug( "processPages() end" );
    }
}
/**
 * Resolves the one based page number that an outline item points to.
 *
 * @param bookmark The outline item whose destination page is looked up.
 * @param allPages The list of pages the destination page is searched in.
 *
 * @return -1 if the bookmark has no destination page; otherwise the index of the
 * destination page in allPages plus one (which is 0 if the page is not in the list).
 *
 * @throws IOException If there is an error finding the destination page.
 */
private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException
{
    PDPage destination = bookmark.findDestinationPage( document );
    if( destination == null )
    {
        return -1;
    }
    //use one based indexing
    return allPages.indexOf( destination ) + 1;
}
/**
* Hook for subclasses of this class. writeText calls it once per document, after the
* document and the output writer have been stored and before any page is processed.
* The default implementation does nothing.
*
* @param pdf The PDF document that is being processed.
* @throws IOException If an IO error occurs.
*/
protected void startDocument(PDDocument pdf) throws IOException
{
// no default implementation, but available for subclasses
}
/**
* Hook for subclasses of this class. writeText calls it once per document, after all
* pages have been processed. The default implementation does nothing.
*
* @param pdf The PDF document that is being processed.
* @throws IOException If an IO error occurs.
*/
protected void endDocument(PDDocument pdf ) throws IOException
{
// no default implementation, but available for subclasses
}
/**
* This will process the contents of a page.
*
* The page counter is advanced on every call. The page is only handled when the
* counter lies inside the inclusive startPage..endPage range and inside the bookmark
* range; a bookmark page number of -1 means that bound is not applied. For a handled
* page the thread beads are read and charactersByArticle is resized to one section,
* or to one section plus two per bead when separation by beads is enabled.
*
* NOTE(review): the text of this method is damaged in this copy of the file. The
* first loop header (for( int i=0; i ...) is cut off and runs straight into
* statements that do not belong to it, so the rest of this method and the start of
* the code that follows are missing. A second loop header further down is cut off in
* the same way, and several debug messages are empty strings. The statements after
* the first damaged line use currentY, lastBaselineFontSize, position, characterValue,
* wordSpacing and verticalScaling, none of which is declared in the visible text;
* judging by the flushText debug message they presumably belong to the routine that
* flushes the collected text - confirm against version control.
*
* What the visible remainder does: it writes lineSeparator and resets the line
* tracking values when the (damaged) new line condition holds, writes wordSeparator
* when the expected start of the next word lies left of the current x position and
* the previously written character did not end with a space, writes the character
* value if it is not null, and at the very end writes pageSeparator and flushes the
* output.
*
* Restore the missing code before changing any logic in this block.
*
* @param page The page to process.
* @param content The contents of the page.
*
* @throws IOException If there is an error processing the page.
*/
protected void processPage( PDPage page, COSStream content ) throws IOException
{
// NOTE(review): start is not read anywhere in the visible text; it is presumably
// used for timing output in the part that is missing.
long start = System.currentTimeMillis();
if( log.isDebugEnabled() )
{
log.debug( "processPage( " + page + ", " + content + " )" );
}
// count the page first, so the range checks below see a one based page number
currentPageNo++;
if( currentPageNo >= startPage && currentPageNo <= endPage &&
(startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
(endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
{
startPage( page );
pageArticles = page.getThreadBeads();
// one section for every bead plus one for the text before, between and after the beads
int numberOfArticleSections = 1 + pageArticles.size() * 2;
if( !shouldSeparateByBeads )
{
numberOfArticleSections = 1;
}
int originalSize = charactersByArticle.size();
charactersByArticle.setSize( numberOfArticleSections );
// NOTE(review): the next line is truncated; everything from the loop condition up
// to the middle of a later statement is missing from this copy of the file.
for( int i=0; i");
}
float endOfLastTextX = -1;
float startOfNextWordX = -1;
float lastWordSpacing = -1;
TextPosition lastProcessedCharacter = null;
for( int i=0; i (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
{
if (log.isDebugEnabled())
{
log.debug("");
}
output.write(lineSeparator);
endOfLastTextX = -1;
startOfNextWordX = -1;
currentY = -1;
lastBaselineFontSize = -1;
}
if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
lastProcessedCharacter != null &&
//only bother adding a space if the last character was not a space
lastProcessedCharacter.getCharacter() != null &&
!lastProcessedCharacter.getCharacter().endsWith( " " ) )
{
if (log.isDebugEnabled())
{
log.debug("");
}
output.write( wordSeparator );
}
if (log.isDebugEnabled())
{
log.debug("flushText" +
" x=" + position.getX() +
" y=" + position.getY() +
" xScale=" + position.getXScale() +
" yScale=" + position.getYScale() +
" width=" + position.getWidth() +
" currentY=" + currentY +
" endOfLastTextX=" + endOfLastTextX +
" startOfNextWordX=" + startOfNextWordX +
" fontSize=" + position.getFontSize() +
" wordSpacing=" + wordSpacing +
" string=\"" + characterValue + "\"");
}
if (currentY == -1)
{
currentY = position.getY();
}
if (currentY == position.getY())
{
lastBaselineFontSize = position.getFontSize();
}
// RDD - endX is what PDF considers to be the x coordinate of the
// end position of the text. We use it in computing our metrics below.
//
endOfLastTextX = position.getX() + position.getWidth();
if (characterValue != null)
{
output.write(characterValue);
}
else
{
log.debug( "Position.getString() is null so not writing anything" );
}
lastProcessedCharacter = position;
}
endParagraph();
}
// RDD - newline at end of flush - required for end of page (so that the top
// of the next page starts on its own line.
//
if( log.isDebugEnabled() )
{
log.debug("");
}
output.write(pageSeparator);
output.flush();
}
/**
 * Writes the character string of a text position to the output stream.
 *
 * @param text The text position whose characters are written to the stream.
 * @throws IOException If there is an error when writing the text.
 */
protected void writeCharacters( TextPosition text ) throws IOException
{
    String characters = text.getCharacter();
    output.write( characters );
}
/**
 * Determines whether two floating point numbers are within a specified variance
 * of each other. Both bounds are exclusive.
 *
 * @param first The number that defines the centre of the allowed range.
 * @param second The number that is tested against the range.
 * @param variance The allowed variance.
 *
 * @return true if second is strictly greater than first - variance and strictly
 * less than first + variance.
 */
private boolean within( float first, float second, float variance )
{
    boolean aboveLowerBound = second > first - variance;
    boolean belowUpperBound = second < first + variance;
    return aboveLowerBound && belowUpperBound;
}
/**
* This will add a character to the list of characters to be printed to
* the text file.
*
* When suppression of duplicate overlapping text is enabled, the list registered for
* the character string is looked up (and created on first use) and a tolerance is
* derived from the width of the text.
*
* NOTE(review): the text of this method is damaged in this copy of the file. The
* loop header at the end of the visible part is cut off, and everything after it
* (the comparison against earlier characters and the code that stores the character)
* is missing. Restore it from version control before changing any logic here.
*
* NOTE(review): textCharacter.length() is called without a null check, while other
* code in this file checks the character value for null - confirm whether
* getCharacter() can return null here.
*
* @param text The description of the character to display.
*/
protected void showCharacter( TextPosition text )
{
boolean showCharacter = true;
if( suppressDuplicateOverlappingText )
{
// presumably switched back on by the truncated code below when no overlapping
// duplicate is found - the assignment is not visible here
showCharacter = false;
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
// fetch the list kept for this character string, creating it on first use
List sameTextCharacters = (List)characterListMapping.get( textCharacter );
if( sameTextCharacters == null )
{
sameTextCharacters = new ArrayList();
characterListMapping.put( textCharacter, sameTextCharacters );
}
// RDD - Here we compute the value that represents the end of the rendered
// text. This value is used to determine whether subsequent text rendered
// on the same line overwrites the current text.
//
// We subtract any positive padding to handle cases where extreme amounts
// of padding are applied, then backed off (not sure why this is done, but there
// are cases where the padding is on the order of 10x the character width, and
// the TJ just backs up to compensate after each character). Also, we subtract
// an amount to allow for kerning (a percentage of the width of the last
// character).
//
boolean suppressCharacter = false;
// one third of the average width per character of this text
float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
// NOTE(review): the loop below is truncated; its condition and body are missing.
for( int i=0; i
* The default is to not sort by position.
*
* A PDF writer could choose to write each character in a different order. By
* default PDFBox does not sort the text tokens before processing them due to
* performance reasons.
*
* @param newSortByPosition Tell PDFBox to sort the text positions.
*/
public void setSortByPosition(boolean newSortByPosition)
{
sortByPosition = newSortByPosition;
}
}