1 files changed, 1033 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFTextStripper.java b/src/main/java/org/pdfbox/util/PDFTextStripper.java
new file mode 100644
index 0000000..56e80cc
--- /dev/null
+++ b/src/main/java/org/pdfbox/util/PDFTextStripper.java
@@ -0,0 +1,1033 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.util;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Vector;
+
+import org.pdfbox.cos.COSDocument;
+import org.pdfbox.cos.COSStream;
+
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDPage;
+
+import org.pdfbox.pdmodel.common.PDRectangle;
+import org.pdfbox.pdmodel.common.PDStream;
+
+import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
+import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
+import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
+
+import org.pdfbox.exceptions.CryptographyException;
+import org.pdfbox.exceptions.InvalidPasswordException;
+
+import org.apache.log4j.Logger;
+
+
+/**
+ * This class will take a pdf document and strip out all of the text and ignore the
+ * formatting and such.
+ *
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.62 $
+ */
+public class PDFTextStripper extends PDFStreamEngine
+{
+    private static Logger log = Logger.getLogger(PDFTextStripper.class);
+    
+    private int currentPageNo = 0;
+    private int startPage = 1;
+    private int endPage = Integer.MAX_VALUE;
+    private PDOutlineItem startBookmark = null;
+    private int startBookmarkPageNumber = -1;
+    private PDOutlineItem endBookmark = null;
+    private int endBookmarkPageNumber = -1;
+    private PDDocument document;
+    private boolean suppressDuplicateOverlappingText = true;
+    private boolean shouldSeparateByBeads = true;
+    private boolean sortByPosition = false;
+    
+    private List pageArticles = null;
+    /**
+     * The charactersByArticle is used to extract text by article divisions.  For example
+     * a PDF that has two columns like a newspaper, we want to extract the first column and
+     * then the second column.  In this example the PDF would have 2 beads(or articles), one for
+     * each column.  The size of the charactersByArticle would be 5, because not all text on the 
+     * screen will fall into one of the articles.  The five divisions are shown below
+     * 
+     * Text before first article
+     * first article text
+     * text between first article and second article
+     * second article text
+     * text after second article
+     * 
+     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
+     */
+    protected Vector charactersByArticle = new Vector();
+    
+    private Map characterListMapping = new HashMap();
+    
+    private String lineSeparator = System.getProperty("line.separator");
+    private String pageSeparator = System.getProperty("line.separator");
+    private String wordSeparator = " ";
+    
+    /**
+     * The stream to write the output to.
+     */
+    protected Writer output;
+    
+    /**
+     * Instantiate a new PDFTextStripper object.  This object will load properties from
+     * Resources/PDFTextStripper.properties.
+     * @throws IOException If there is an error loading the properties.
+     */
+    public PDFTextStripper() throws IOException
+    {
+        super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
+    }
+
+    /**
+     * This will return the text of a document.  See writeText. <br />
+     * NOTE: The document must not be encrypted when coming into this method.
+     *
+     * @param doc The document to get the text from.
+     *
+     * @return The text of the PDF document.
+     *
+     * @throws IOException if the doc state is invalid or it is encrypted.
+     */
+    public String getText( PDDocument doc ) throws IOException
+    {
+        StringWriter outputStream = new StringWriter();
+        writeText( doc, outputStream );
+        return outputStream.toString();
+    }
+
+    /**
+     * @deprecated
+     * @see PDFTextStripper#getText( PDDocument )
+     * @param doc The document to extract the text from.
+     * @return The document text.
+     * @throws IOException If there is an error extracting the text.
+     */
+    public String getText( COSDocument doc ) throws IOException
+    {
+        return getText( new PDDocument( doc ) );
+    }
+
+    /**
+     * @deprecated
+     * @see PDFTextStripper#writeText( PDDocument, Writer )
+     * @param doc The document to extract the text.
+     * @param outputStream The stream to write the text to.
+     * @throws IOException If there is an error extracting the text.
+     */
+    public void writeText( COSDocument doc, Writer outputStream ) throws IOException
+    {
+        writeText( new PDDocument( doc ), outputStream );
+    }
+
+    /**
+     * This will take a PDDocument and write the text of that document to the print writer.
+     *
+     * @param doc The document to get the data from.
+     * @param outputStream The location to put the text.
+     *
+     * @throws IOException If the doc is in an invalid state.
+     */
+    public void writeText( PDDocument doc, Writer outputStream ) throws IOException
+    {
+        
+        PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary();
+
+        //only care about standard encryption and if it was decrypted with the
+        //user password
+        if( encDictionary instanceof PDStandardEncryption && 
+            !doc.wasDecryptedWithOwnerPassword() )
+        {
+            PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary;
+            if( !stdEncryption.canExtractContent() )
+            {
+                throw new IOException( "You do not have permission to extract text" );
+            }
+        }
+        currentPageNo = 0;
+        document = doc;
+        output = outputStream;
+        startDocument(document);
+
+        if( document.isEncrypted() )
+        {
+            // We are expecting non-encrypted documents here, but it is common
+            // for users to pass in a document that is encrypted with an empty
+            // password (such a document appears to not be encrypted by
+            // someone viewing the document, thus the confusion).  We will
+            // attempt to decrypt with the empty password to handle this case.
+            //
+            log.debug("Document is encrypted, decrypting with empty password");
+            try
+            {
+                document.decrypt("");
+            }
+            catch (CryptographyException e)
+            {
+                throw new IOException("Error decrypting document, details: " + e.getMessage());
+            }
+            catch (InvalidPasswordException e)
+            {
+                throw new IOException("Error: document is encrypted");
+            }
+        }
+
+        processPages( document.getDocumentCatalog().getAllPages() );
+        endDocument(document);
+    }
+
+    /**
+     * This will process all of the pages and the text that is in them.
+     *
+     * @param pages The pages object in the document.
+     *
+     * @throws IOException If there is an error parsing the text.
+     */
+    protected void processPages( List pages ) throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "processPages( " + pages + " )" );
+        }
+        
+        if( startBookmark != null )
+        {
+            startBookmarkPageNumber = getPageNumber( startBookmark, pages );
+        }
+        
+        if( endBookmark != null )
+        {
+            endBookmarkPageNumber = getPageNumber( endBookmark, pages );
+        }
+        
+        if( startBookmarkPageNumber == -1 && startBookmark != null &&
+            endBookmarkPageNumber == -1 && endBookmark != null &&
+            startBookmark.getCOSObject() == endBookmark.getCOSObject() )
+        {
+            //this is a special case where both the start and end bookmark
+            //are the same but point to nothing.  In this case
+            //we will not extract any text.
+            startBookmarkPageNumber = 0;
+            endBookmarkPageNumber = 0;
+        }
+        
+
+        Iterator pageIter = pages.iterator();
+        while( pageIter.hasNext() )
+        {
+            PDPage nextPage = (PDPage)pageIter.next();
+            PDStream contentStream = nextPage.getContents();
+            if( contentStream != null )
+            {
+                COSStream contents = contentStream.getStream();
+                processPage( nextPage, contents );
+            }
+        }
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "processPages() end" );
+        }
+    }
+    
+    private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException 
+    {
+        int pageNumber = -1;
+        PDPage page = bookmark.findDestinationPage( document );
+        if( page != null )
+        {
+            pageNumber = allPages.indexOf( page )+1;//use one based indexing
+        }
+        return pageNumber;
+    }
+    
+    /**
+     * This method is available for subclasses of this class.  It will be called before processing
+     * of the document start.
+     * 
+     * @param pdf The PDF document that is being processed.
+     * @throws IOException If an IO error occurs.
+     */
+    protected void startDocument(PDDocument pdf) throws IOException 
+    {
+        // no default implementation, but available for subclasses    
+    }
+    
+    /**
+     * This method is available for subclasses of this class.  It will be called after processing
+     * of the document finishes.
+     * 
+     * @param pdf The PDF document that is being processed.
+     * @throws IOException If an IO error occurs.
+     */
+    protected void endDocument(PDDocument pdf ) throws IOException 
+    {
+        // no default implementation, but available for subclasses
+    }
+
+    /**
+     * This will process the contents of a page.
+     *
+     * @param page The page to process.
+     * @param content The contents of the page.
+     *
+     * @throws IOException If there is an error processing the page.
+     */
+    protected void processPage( PDPage page, COSStream content ) throws IOException
+    {
+        long start = System.currentTimeMillis();
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "processPage( " + page + ", " + content + " )" );
+        }
+        currentPageNo++;
+        if( currentPageNo >= startPage && currentPageNo <= endPage &&
+            (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) && 
+            (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
+        {
+            startPage( page );
+            pageArticles = page.getThreadBeads();
+            int numberOfArticleSections = 1 + pageArticles.size() * 2;
+            if( !shouldSeparateByBeads )
+            {
+                numberOfArticleSections = 1;
+            }
+            int originalSize = charactersByArticle.size();
+            charactersByArticle.setSize( numberOfArticleSections );
+            for( int i=0; i<numberOfArticleSections; i++ )
+            {
+                if( numberOfArticleSections < originalSize )
+                {
+                    ((List)charactersByArticle.get( i )).clear();
+                }
+                else
+                {
+                    charactersByArticle.set( i, new ArrayList() );
+                }
+            }
+            
+            characterListMapping.clear();
+            long startProcess = System.currentTimeMillis();
+            processStream( page, page.findResources(), content );
+            long stopProcess = System.currentTimeMillis();
+            long startFlush = System.currentTimeMillis();
+            flushText();
+            long stopFlush = System.currentTimeMillis();
+            if( log.isDebugEnabled() )
+            {
+                log.debug( "processStream time=" + (stopProcess-startProcess) );
+                log.debug( "flushText time=" + (stopFlush-startFlush) );
+            }
+            endPage( page );
+        }
+        long stop = System.currentTimeMillis();
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "processPage() end time=" + (stop-start) );
+        }
+        
+    }
+    
+    /**
+     * Start a new paragraph.  Default implementation is to do nothing.  Subclasses
+     * may provide additional information.
+     * 
+     * @throws IOException If there is any error writing to the stream.
+     */
+    protected void startParagraph() throws IOException
+    {
+        //default is to do nothing.
+    }
+    
+    /**
+     * End a paragraph.  Default implementation is to do nothing.  Subclasses
+     * may provide additional information.
+     * 
+     * @throws IOException If there is any error writing to the stream.
+     */
+    protected void endParagraph() throws IOException
+    {
+        //default is to do nothing
+    }
+    
+    /**
+     * Start a new page.  Default implementation is to do nothing.  Subclasses
+     * may provide additional information.
+     * 
+     * @param page The page we are about to process.
+     * 
+     * @throws IOException If there is any error writing to the stream.
+     */
+    protected void startPage( PDPage page ) throws IOException
+    {
+        //default is to do nothing.
+    }
+    
+    /**
+     * End a page.  Default implementation is to do nothing.  Subclasses
+     * may provide additional information.
+     * 
+     * @param page The page we are about to process.
+     * 
+     * @throws IOException If there is any error writing to the stream.
+     */
+    protected void endPage( PDPage page ) throws IOException
+    {
+        //default is to do nothing
+    }
+
+    /**
+     * This will print the text to the output stream.
+     *
+     * @throws IOException If there is an error writing the text.
+     */
+    protected void flushText() throws IOException
+    {
+        if( log.isDebugEnabled() )
+        {
+            log.debug( "flushText() start" );
+        }
+        float currentY = -1;
+        float lastBaselineFontSize = -1;
+        if( log.isDebugEnabled() )
+        {
+            log.debug("<Starting text object list>");
+        }
+        float endOfLastTextX = -1;
+        float startOfNextWordX = -1;
+        float lastWordSpacing = -1;
+        TextPosition lastProcessedCharacter = null;
+        
+        for( int i=0; i<charactersByArticle.size(); i++)
+        {
+            startParagraph();
+            List textList = (List)charactersByArticle.get( i );
+            if( sortByPosition )
+            {
+                TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() );
+                Collections.sort( textList, comparator );
+            }
+            Iterator textIter = textList.iterator();
+            while( textIter.hasNext() )
+            {
+                TextPosition position = (TextPosition)textIter.next();
+                String characterValue = position.getCharacter();
+                
+                //wordSpacing = position.getWordSpacing();
+                float wordSpacing = 0;
+                
+                if( wordSpacing == 0 )
+                {
+                    //try to get width of a space character
+                    wordSpacing = position.getWidthOfSpace();
+                    //if still zero fall back to getting the width of the current
+                    //character
+                    if( wordSpacing == 0 )
+                    {
+                        wordSpacing = position.getWidth();
+                    }
+                }
+                
+                
+                // RDD - We add a conservative approximation for space determination.
+                // basically if there is a blank area between two characters that is
+                //equal to some percentage of the word spacing then that will be the
+                //start of the next word
+                if( lastWordSpacing <= 0 )
+                {
+                    startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
+                }
+                else
+                {
+                    startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
+                }
+                
+                lastWordSpacing = wordSpacing;
+    
+                // RDD - We will suppress text that is very close to the current line
+                // and which overwrites previously rendered text on this line.
+                // This is done specifically to handle a reasonably common situation
+                // where an application (MS Word, in the case of my examples) renders
+                // text four times at small (1 point) offsets in order to accomplish
+                // bold printing.  You would not want to do this step if you were
+                // going to render the TextPosition objects graphically.
+                //
+                /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
+                    (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
+                {
+                    if (log.isDebugEnabled())
+                    {
+                        log.debug("Suppressing text overwrite" +
+                                  " x: " + position.getX() +
+                                  " endOfLastTextX: " + endOfLastTextX +
+                                  " string: " + position.getCharacter());
+                    }
+                    continue;
+                }*/
+    
+                // RDD - Here we determine whether this text object is on the current
+                // line.  We use the lastBaselineFontSize to handle the superscript
+                // case, and the size of the current font to handle the subscript case.
+                // Text must overlap with the last rendered baseline text by at least
+                // a small amount in order to be considered as being on the same line.
+                //
+                int verticalScaling = 1;
+                if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
+                {
+                    verticalScaling = -1;
+                }
+                if (currentY != -1 &&
+                    ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
+                     (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
+                {
+                    if (log.isDebugEnabled())
+                    {
+                        log.debug("<newline currentY=" + currentY + ", y=" + position.getY() + 
+                                  " fs=" + position.getFontSize()+ " lb fs=" + lastBaselineFontSize + ">");
+                    }
+                    output.write(lineSeparator);
+                    endOfLastTextX = -1;
+                    startOfNextWordX = -1;
+                    currentY = -1;
+                    lastBaselineFontSize = -1;
+                }
+    
+                if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
+                   lastProcessedCharacter != null &&
+                   //only bother adding a space if the last character was not a space
+                   lastProcessedCharacter.getCharacter() != null &&
+                   !lastProcessedCharacter.getCharacter().endsWith( " " ) )
+                {
+                    if (log.isDebugEnabled())
+                    {
+                        log.debug("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">");
+                    }
+                    output.write( wordSeparator );
+                }
+    
+    
+                if (log.isDebugEnabled())
+                {
+                    log.debug("flushText" +
+                              " x=" + position.getX() +
+                              " y=" + position.getY() +
+                              " xScale=" + position.getXScale() +
+                              " yScale=" + position.getYScale() +
+                              " width=" + position.getWidth() +
+                              " currentY=" + currentY +
+                              " endOfLastTextX=" + endOfLastTextX +
+                              " startOfNextWordX=" + startOfNextWordX +
+                              " fontSize=" + position.getFontSize() +
+                              " wordSpacing=" + wordSpacing +
+                              " string=\"" + characterValue + "\"");
+                }
+    
+                if (currentY == -1)
+                {
+                    currentY = position.getY();
+                }
+    
+                if (currentY == position.getY())
+                {
+                    lastBaselineFontSize = position.getFontSize();
+                }
+    
+                // RDD - endX is what PDF considers to be the x coordinate of the
+                // end position of the text.  We use it in computing our metrics below.
+                //
+                endOfLastTextX = position.getX() + position.getWidth();
+    
+    
+                if (characterValue != null)
+                {
+                    output.write(characterValue);
+                }
+                else
+                {
+                    log.debug( "Position.getString() is null so not writing anything" );
+                }
+                lastProcessedCharacter = position;
+            }
+            endParagraph();
+        }
+        
+
+        // RDD - newline at end of flush - required for end of page (so that the top
+        // of the next page starts on its own line.
+        //
+        if( log.isDebugEnabled() )
+        {
+            log.debug("<newline endOfFlush=\"true\">");
+        }
+        output.write(pageSeparator);
+
+        output.flush();
+    }
+    
+    /**
+     * Write the string to the output stream.
+     *  
+     * @param text The text to write to the stream.
+     * @throws IOException If there is an error when writing the text.
+     */
+    protected void writeCharacters( TextPosition text ) throws IOException
+    {
+        output.write( text.getCharacter() );
+    }
+
+    /**
+     * This will determine of two floating point numbers are within a specified variance.
+     *
+     * @param first The first number to compare to.
+     * @param second The second number to compare to.
+     * @param variance The allowed variance.
+     */
+    private boolean within( float first, float second, float variance )
+    {
+        return second > first - variance && second < first + variance;
+    }
+
+    /**
+     * This will show add a character to the list of characters to be printed to
+     * the text file.
+     *
+     * @param text The description of the character to display.
+     */
+    protected void showCharacter( TextPosition text )
+    {
+        boolean showCharacter = true;
+        if( suppressDuplicateOverlappingText )
+        {
+            showCharacter = false;
+            String textCharacter = text.getCharacter();
+            float textX = text.getX();
+            float textY = text.getY();
+            List sameTextCharacters = (List)characterListMapping.get( textCharacter );
+            if( sameTextCharacters == null )
+            {
+                sameTextCharacters = new ArrayList();
+                characterListMapping.put( textCharacter, sameTextCharacters );
+            }
+    
+            // RDD - Here we compute the value that represents the end of the rendered
+            // text.  This value is used to determine whether subsequent text rendered
+            // on the same line overwrites the current text.
+            //
+            // We subtract any positive padding to handle cases where extreme amounts
+            // of padding are applied, then backed off (not sure why this is done, but there
+            // are cases where the padding is on the order of 10x the character width, and
+            // the TJ just backs up to compensate after each character).  Also, we subtract
+            // an amount to allow for kerning (a percentage of the width of the last
+            // character).
+            //
+            boolean suppressCharacter = false;
+            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
+            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
+            {
+                TextPosition character = (TextPosition)sameTextCharacters.get( i );
+                String charCharacter = character.getCharacter();
+                float charX = character.getX();
+                float charY = character.getY();
+                //only want to suppress
+                
+                if( charCharacter != null &&
+                    //charCharacter.equals( textCharacter ) &&
+                    within( charX, textX, tolerance ) &&
+                    within( charY, 
+                            textY, 
+                            tolerance ) )
+                {
+                    if( log.isDebugEnabled() )
+                    {
+                        log.debug("suppressText" +
+                                  " x=" + charX +
+                                  " y=" + charY +
+                                  " width=" + character.getWidth() +
+                                  " fontSize=" + character.getFontSize() +
+                                  " string=\"" + charCharacter + "\"");
+                    }
+                    suppressCharacter = true;
+                }
+            }
+            if( !suppressCharacter )
+            {
+                sameTextCharacters.add( text );
+                showCharacter = true;
+            }
+        }
+        
+        if( showCharacter )
+        {
+            //if we are showing the character then we need to determine which
+            //article it belongs to.
+            int foundArticleDivisionIndex = -1;
+            int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
+            int notFoundButFirstLeftArticleDivisionIndex = -1;
+            int notFoundButFirstAboveArticleDivisionIndex = -1;
+            float x = text.getX();
+            float y = text.getY();
+            if( shouldSeparateByBeads )
+            {
+                for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
+                {
+                    PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
+                    if( bead != null )
+                    {
+                        PDRectangle rect = bead.getRectangle();
+                        if( rect.contains( x, y ) )
+                        {
+                            foundArticleDivisionIndex = i*2+1;
+                        }
+                        else if( (x < rect.getLowerLeftX() ||
+                                  y < rect.getUpperRightY()) &&
+                            notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
+                        {
+                            notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
+                        }
+                        else if( x < rect.getLowerLeftX() &&
+                                notFoundButFirstLeftArticleDivisionIndex == -1)
+                        {
+                            notFoundButFirstLeftArticleDivisionIndex = i*2;
+                        }
+                        else if( y < rect.getUpperRightY() &&
+                                notFoundButFirstAboveArticleDivisionIndex == -1)
+                        {
+                            notFoundButFirstAboveArticleDivisionIndex = i*2;
+                        }                        
+                    }
+                    else
+                    {
+                        foundArticleDivisionIndex = 0;
+                    }
+                }
+            }
+            else
+            {
+                foundArticleDivisionIndex = 0;
+            }
+            int articleDivisionIndex = -1;
+            if( foundArticleDivisionIndex != -1 )
+            {
+                articleDivisionIndex = foundArticleDivisionIndex;
+            }
+            else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
+            {
+                articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
+            }
+            else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
+            {
+                articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
+            }
+            else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
+            {
+                articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
+            }
+            else
+            {
+                articleDivisionIndex = charactersByArticle.size()-1;
+            }
+            List textList = (List) charactersByArticle.get( articleDivisionIndex );
+            textList.add( text );
+        }
+    }
+
+    /**
+     * This is the page that the text extraction will start on.  The pages start
+     * at page 1.  For example in a 5 page PDF document, if the start page is 1
+     * then all pages will be extracted.  If the start page is 4 then pages 4 and 5
+     * will be extracted.  The default value is 1.
+     *
+     * @return Value of property startPage.
+     */
+    public int getStartPage()
+    {
+        return startPage;
+    }
+
+    /**
+     * This will set the first page to be extracted by this class.
+     *
+     * @param startPageValue New value of property startPage.
+     */
+    public void setStartPage(int startPageValue)
+    {
+        startPage = startPageValue;
+    }
+
+    /**
+     * This will get the last page that will be extracted.  This is inclusive,
+     * for example if a 5 page PDF an endPage value of 5 would extract the
+     * entire document, an end page of 2 would extract pages 1 and 2.  This defaults
+     * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
+     *
+     * @return Value of property endPage.
+     */
+    public int getEndPage()
+    {
+        return endPage;
+    }
+
+    /**
+     * This will set the last page to be extracted by this class.
+     *
+     * @param endPageValue New value of property endPage.
+     */
+    public void setEndPage(int endPageValue)
+    {
+        endPage = endPageValue;
+    }
+
+    /**
+     * Set the desired line separator for output text.  The line.separator
+     * system property is used if the line separator preference is not set
+     * explicitly using this method.
+     *
+     * @param separator The desired line separator string.
+     */
+    public void setLineSeparator(String separator)
+    {
+        lineSeparator = separator;
+    }
+
+    /**
+     * This will get the line separator.
+     *
+     * @return The desired line separator string.
+     */
+    public String getLineSeparator()
+    {
+        return lineSeparator;
+    }
+
+    /**
+     * Set the desired page separator for output text.  The line.separator
+     * system property is used if the page separator preference is not set
+     * explicitly using this method.
+     *
+     * @param separator The desired page separator string.
+     */
+    public void setPageSeparator(String separator)
+    {
+        pageSeparator = separator;
+    }
+
+    /**
+     * This will get the word separator.
+     *
+     * @return The desired word separator string.
+     */
+    public String getWordSeparator()
+    {
+        return wordSeparator;
+    }
+
+    /**
+     * Set the desired word separator for output text.  The PDFBox text extraction
+     * algorithm will output a space character if there is enough space between
+     * two words.  By default a space character is used.  If you need and accurate
+     * count of characters that are found in a PDF document then you might want to
+     * set the word separator to the empty string.
+     *
+     * @param separator The desired page separator string.
+     */
+    public void setWordSeparator(String separator)
+    {
+        wordSeparator = separator;
+    }
+
+    /**
+     * This will get the page separator.
+     *
+     * @return The page separator string.
+     */
+    public String getPageSeparator()
+    {
+        return pageSeparator;
+    }
+    /**
+     * @return Returns the suppressDuplicateOverlappingText.
+     */
+    public boolean shouldSuppressDuplicateOverlappingText()
+    {
+        return suppressDuplicateOverlappingText;
+    }
+    
+    /**
+     * Get the current page number that is being processed.
+     * 
+     * @return A 1 based number representing the current page.
+     */
+    protected int getCurrentPageNo() 
+    {
+        return currentPageNo;
+    }
+
+    /**
+     * The output stream that is being written to.
+     * 
+     * @return The stream that output is being written to.
+     */
+    protected Writer getOutput() 
+    {
+        return output;
+    }
+    
+    /**
+     * Character strings are grouped by articles.  It is quite common that there
+     * will only be a single article.  This returns a List that contains List objects,
+     * the inner lists will contain TextPosition objects.
+     * 
+     * @return A double List of TextPositions for all text strings on the page.
+     */
+    protected List getCharactersByArticle()
+    {
+        return charactersByArticle;
+    }
+    
+    /**
+     * By default the text stripper will attempt to remove text that overlapps each other.
+     * Word paints the same character several times in order to make it look bold.  By setting
+     * this to false all text will be extracted, which means that certain sections will be 
+     * duplicated, but better performance will be noticed.
+     * 
+     * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
+     */
+    public void setSuppressDuplicateOverlappingText(
+            boolean suppressDuplicateOverlappingTextValue)
+    {
+        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
+    }
+    
+    /**
+     * This will tell if the text stripper should separate by beads.
+     * 
+     * @return If the text will be grouped by beads.
+     */
+    public boolean shouldSeparateByBeads()
+    {
+        return shouldSeparateByBeads;
+    }
+    
+    /**
+     * Set if the text stripper should group the text output by a list of beads.  The default value is true!
+     * 
+     * @param aShouldSeparateByBeads The new grouping of beads.
+     */
+    public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
+    {
+        this.shouldSeparateByBeads = aShouldSeparateByBeads;
+    }
+    
+    /**
+     * Get the bookmark where text extraction should end, inclusive.  Default is null.
+     * 
+     * @return The ending bookmark.
+     */
+    public PDOutlineItem getEndBookmark()
+    {
+        return endBookmark;
+    }
+    
+    /**
+     * Set the bookmark where the text extraction should stop.
+     * 
+     * @param aEndBookmark The ending bookmark.
+     */
+    public void setEndBookmark(PDOutlineItem aEndBookmark)
+    {
+        endBookmark = aEndBookmark;
+    }
+    
+    /**
+     * Get the bookmark where text extraction should start, inclusive.  Default is null.
+     * 
+     * @return The starting bookmark.
+     */
+    public PDOutlineItem getStartBookmark()
+    {
+        return startBookmark;
+    }
+    
+    /**
+     * Set the bookmark where text extraction should start, inclusive.
+     * 
+     * @param aStartBookmark The starting bookmark.
+     */
+    public void setStartBookmark(PDOutlineItem aStartBookmark)
+    {
+        startBookmark = aStartBookmark;
+    }
+
+    /**
+     * This will tell if the text stripper should sort the text tokens
+     * before writing to the stream.
+     * 
+     * @return true If the text tokens will be sorted before being written.
+     */
+    public boolean shouldSortByPosition() 
+    {
+        return sortByPosition;
+    }
+
+    /**
+     * The order of the text tokens in a PDF file may not be in the same
+     * as they appear visually on the screen.  For example, a PDF writer may
+     * write out all text by font, so all bold or larger text, then make a second
+     * pass and write out the normal text.<br/>
+     * The default is to <b>not</b> sort by position.<br/>
+     * <br/>
+     * A PDF writer could choose to write each character in a different order.  By
+     * default PDFBox does <b>not</b> sort the text tokens before processing them due to
+     * performance reasons.
+     *     
+     * @param newSortByPosition Tell PDFBox to sort the text positions.
+     */
+    public void setSortByPosition(boolean newSortByPosition) 
+    {
+        sortByPosition = newSortByPosition;
+    }
+}
+\ No newline at end of file