1 files changed, 0 insertions, 1062 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFTextStripper.java b/src/main/java/org/pdfbox/util/PDFTextStripper.java
deleted file mode 100644
index 62efb64..0000000
--- a/src/main/java/org/pdfbox/util/PDFTextStripper.java
+++ /dev/null
@@ -1,1062 +0,0 @@
-/**
- * Copyright (c) 2003-2005, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- *    contributors may be used to endorse or promote products derived from this
- *    software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.util;
-
-import java.io.IOException;
-import java.io.StringWriter;
-import java.io.Writer;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Vector;
-
-import org.pdfbox.cos.COSDocument;
-import org.pdfbox.cos.COSStream;
-
-import org.pdfbox.pdmodel.PDDocument;
-import org.pdfbox.pdmodel.PDPage;
-
-import org.pdfbox.pdmodel.common.PDRectangle;
-import org.pdfbox.pdmodel.common.PDStream;
-
-import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
-import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
-import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-
-import org.pdfbox.exceptions.CryptographyException;
-import org.pdfbox.exceptions.InvalidPasswordException;
-
-import org.apache.log4j.Logger;
-
-
-/**
- * This class will take a pdf document and strip out all of the text and ignore the
- * formatting and such.
- *
- * @author Ben Litchfield (ben@benlitchfield.com)
- * @version $Revision: 1.62 $
- */
-public class PDFTextStripper extends PDFStreamEngine
-{
-    // Log4j logger used for the debug tracing throughout this class.
-    private static Logger log = Logger.getLogger(PDFTextStripper.class);
-    // Rotation that processPage() applies to the first page when
-    // getFirstPageRotationFromThis is true.
-    private int first_page_rotation = 0;
-    // When true, processPage() overrides the first page's rotation with first_page_rotation.
-    private boolean getFirstPageRotationFromThis = false;
-    // One-based number of the page most recently handed to processPage();
-    // reset to 0 at the start of writeText().
-    private int currentPageNo = 0;
-    // First page (one-based, inclusive) whose text is extracted.
-    private int startPage = 1;
-    // Last page (one-based, inclusive) whose text is extracted.
-    private int endPage = Integer.MAX_VALUE;
-    // Optional bookmark that marks where extraction starts; null means no start bookmark.
-    private PDOutlineItem startBookmark = null;
-    // One-based page number resolved from startBookmark in processPages();
-    // -1 means the start of extraction is not restricted by a bookmark.
-    private int startBookmarkPageNumber = -1;
-    // Optional bookmark that marks where extraction ends; null means no end bookmark.
-    private PDOutlineItem endBookmark = null;
-    // One-based page number resolved from endBookmark in processPages();
-    // -1 means the end of extraction is not restricted by a bookmark.
-    private int endBookmarkPageNumber = -1;
-    // The document currently being processed; assigned in writeText().
-    private PDDocument document;
-    // When true, showCharacter() drops text that lands (within a tolerance) on top of
-    // text with the same character string already accepted on the page.
-    private boolean suppressDuplicateOverlappingText = true;
-    // When true, text is grouped by the page's thread beads (articles) before being written.
-    private boolean shouldSeparateByBeads = true;
-    // When true, flushText() sorts each article's text with a TextPositionComparator
-    // before writing it.
-    private boolean sortByPosition = false;
-    
-    // Thread beads of the page currently being processed; assigned in processPage().
-    private List pageArticles = null;
-    /**
-     * The charactersByArticle is used to extract text by article divisions.  For example
-     * a PDF that has two columns like a newspaper, we want to extract the first column and
-     * then the second column.  In this example the PDF would have 2 beads(or articles), one for
-     * each column.  The size of the charactersByArticle would be 5, because not all text on the 
-     * screen will fall into one of the articles.  The five divisions are shown below
-     * 
-     * Text before first article
-     * first article text
-     * text between first article and second article
-     * second article text
-     * text after second article
-     * 
-     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
-     */
-    protected Vector charactersByArticle = new Vector();
-    
-    // Maps a character string to the list of TextPosition objects with that string that
-    // showCharacter() has accepted so far; cleared for every page in processPage().
-    private Map characterListMapping = new HashMap();
-    
-    // Written by flushText() when the text moves to a new line; defaults to the
-    // platform line separator.
-    private String lineSeparator = System.getProperty("line.separator");
-    // Written by flushText() after all text of a page; defaults to the platform
-    // line separator.
-    private String pageSeparator = System.getProperty("line.separator");
-    // Written by flushText() where a gap between characters is judged to be a word break.
-    private String wordSeparator = " ";
-    
-    /**
-     * The stream to write the output to.
-     */
-    protected Writer output;
-    
-    /**
-     * Instantiate a new PDFTextStripper object.  This object will load properties from
-     * Resources/PDFTextStripper.properties.
-     * @throws IOException If there is an error loading the properties.
-     */
-    public PDFTextStripper() throws IOException
-    {
-        // The loaded properties are handed straight to the PDFStreamEngine constructor.
-        super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
-    }
-
-    /**
-     * This will return the text of a document.  See writeText. <br />
-     * NOTE: The document must not be encrypted when coming into this method.
-     *
-     * @param doc The document to get the text from.
-     *
-     * @return The text of the PDF document.
-     *
-     * @throws IOException if the doc state is invalid or it is encrypted.
-     */
-    public String getText( PDDocument doc ) throws IOException
-    {
-        // Collect everything writeText() emits in an in-memory buffer and
-        // hand the accumulated text back to the caller.
-        StringWriter buffer = new StringWriter();
-        writeText( doc, buffer );
-        String extractedText = buffer.toString();
-        return extractedText;
-    }
-
-    /**
-     * @deprecated
-     * @see PDFTextStripper#getText( PDDocument )
-     * @param doc The document to extract the text from.
-     * @return The document text.
-     * @throws IOException If there is an error extracting the text.
-     */
-    public String getText( COSDocument doc ) throws IOException
-    {
-        // Wrap the low-level document and delegate to the PDDocument overload.
-        PDDocument wrapped = new PDDocument( doc );
-        return getText( wrapped );
-    }
-
-    /**
-     * @deprecated
-     * @see PDFTextStripper#writeText( PDDocument, Writer )
-     * @param doc The document to extract the text.
-     * @param outputStream The stream to write the text to.
-     * @throws IOException If there is an error extracting the text.
-     */
-    public void writeText( COSDocument doc, Writer outputStream ) throws IOException
-    {
-        // Wrap the low-level document and delegate to the PDDocument overload.
-        PDDocument wrapped = new PDDocument( doc );
-        writeText( wrapped, outputStream );
-    }
-
-    /**
-     * This will take a PDDocument and write the text of that document to the given writer.
-     *
-     * @param doc The document to get the data from.
-     * @param outputStream The location to put the text.
-     *
-     * @throws IOException If the doc is in an invalid state.
-     */
-    public void writeText( PDDocument doc, Writer outputStream ) throws IOException
-    {
-        PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary();
-
-        // The extract-content permission is only checked for standard encryption,
-        // and only when the document was not decrypted with the owner password.
-        if( encDictionary instanceof PDStandardEncryption && 
-            !doc.wasDecryptedWithOwnerPassword() )
-        {
-            PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary;
-            if( !stdEncryption.canExtractContent() )
-            {
-                throw new IOException( "You do not have permission to extract text" );
-            }
-        }
-
-        // Reset per-run state so the same stripper can be used for several documents.
-        currentPageNo = 0;
-        document = doc;
-        output = outputStream;
-        startDocument(document);
-
-        if( document.isEncrypted() )
-        {
-            // We are expecting non-encrypted documents here, but it is common
-            // for users to pass in a document that is encrypted with an empty
-            // password (such a document appears to not be encrypted by
-            // someone viewing the document, thus the confusion).  We will
-            // attempt to decrypt with the empty password to handle this case.
-            //
-            log.debug("Document is encrypted, decrypting with empty password");
-            try
-            {
-                document.decrypt("");
-            }
-            catch (CryptographyException e)
-            {
-                // Chain the original exception so its stack trace is not lost.
-                IOException wrapped = 
-                    new IOException("Error decrypting document, details: " + e.getMessage());
-                wrapped.initCause( e );
-                throw wrapped;
-            }
-            catch (InvalidPasswordException e)
-            {
-                // Chain the original exception so its stack trace is not lost.
-                IOException wrapped = new IOException("Error: document is encrypted");
-                wrapped.initCause( e );
-                throw wrapped;
-            }
-        }
-
-        processPages( document.getDocumentCatalog().getAllPages() );
-        endDocument(document);
-    }
-
-    /**
-     * This will process all of the pages and the text that is in them.
-     *
-     * @param pages The pages object in the document.
-     *
-     * @throws IOException If there is an error parsing the text.
-     */
-    protected void processPages( List pages ) throws IOException
-    {
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "processPages( " + pages + " )" );
-        }
-        
-        // Resolve the optional bookmarks to one-based page numbers within 'pages'.
-        if( startBookmark != null )
-        {
-            startBookmarkPageNumber = getPageNumber( startBookmark, pages );
-        }
-        
-        if( endBookmark != null )
-        {
-            endBookmarkPageNumber = getPageNumber( endBookmark, pages );
-        }
-        
-        if( startBookmarkPageNumber == -1 && startBookmark != null &&
-            endBookmarkPageNumber == -1 && endBookmark != null &&
-            startBookmark.getCOSObject() == endBookmark.getCOSObject() )
-        {
-            //this is a special case where both the start and end bookmark
-            //are the same but point to nothing.  In this case
-            //we will not extract any text.
-            startBookmarkPageNumber = 0;
-            endBookmarkPageNumber = 0;
-        }
-        
-
-        Iterator pageIter = pages.iterator();
-        while( pageIter.hasNext() )
-        {
-            PDPage nextPage = (PDPage)pageIter.next();
-            PDStream contentStream = nextPage.getContents();
-            if( contentStream != null )
-            {
-                COSStream contents = contentStream.getStream();
-                processPage( nextPage, contents );
-            }
-            else
-            {
-                // processPage() is what advances currentPageNo.  A page without a
-                // content stream is never passed to it, but it still has to be
-                // counted; otherwise every later page number is off by one relative
-                // to startPage/endPage and to the bookmark page numbers, which
-                // getPageNumber() computes from the position in the full page list.
-                currentPageNo++;
-            }
-        }
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "processPages() end" );
-        }
-    }
-    
-    private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException 
-    {
-        // Resolve the bookmark to its destination page; -1 signals "no destination".
-        PDPage destination = bookmark.findDestinationPage( document );
-        if( destination == null )
-        {
-            return -1;
-        }
-        // List positions are zero based, page numbers are one based.
-        int zeroBasedIndex = allPages.indexOf( destination );
-        return zeroBasedIndex + 1;
-    }
-    
-    /**
-     * This method is available for subclasses of this class.  It will be called before processing
-     * of the document starts.
-     * 
-     * @param pdf The PDF document that is being processed.
-     * @throws IOException If an IO error occurs.
-     */
-    protected void startDocument( PDDocument pdf ) throws IOException
-    {
-        // Intentionally empty: extension hook that writeText() invokes before
-        // any page is processed.
-    }
-    
-    /**
-     * This method is available for subclasses of this class.  It will be called after processing
-     * of the document finishes.
-     * 
-     * @param pdf The PDF document that is being processed.
-     * @throws IOException If an IO error occurs.
-     */
-    protected void endDocument( PDDocument pdf ) throws IOException
-    {
-        // Intentionally empty: extension hook that writeText() invokes after
-        // all pages have been processed.
-    }
-
-    /**
-     * This will process the contents of a page.
-     *
-     * @param page The page to process.
-     * @param content The contents of the page.
-     *
-     * @throws IOException If there is an error processing the page.
-     */
-    protected void processPage( PDPage page, COSStream content ) throws IOException
-    {
-        long start = System.currentTimeMillis();
-
-        // Optionally force the rotation of the very first page.
-        if( currentPageNo == 0 && getFirstPageRotationFromThis )
-        {
-            page.setRotation( first_page_rotation );
-        }
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "processPage( " + page + ", " + content + " )" );
-        }
-
-        currentPageNo++;
-
-        // A page is extracted only if it lies inside the configured page range
-        // and inside the range delimited by the (optional) bookmarks.
-        boolean insidePageRange = 
-            currentPageNo >= startPage && currentPageNo <= endPage;
-        boolean atOrAfterStartBookmark = 
-            startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber;
-        boolean atOrBeforeEndBookmark = 
-            endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber;
-
-        if( insidePageRange && atOrAfterStartBookmark && atOrBeforeEndBookmark )
-        {
-            startPage( page );
-            pageArticles = page.getThreadBeads();
-
-            // One section per bead plus one section before, between and after the beads.
-            int sectionsForBeads = 1 + pageArticles.size() * 2;
-            int numberOfArticleSections = shouldSeparateByBeads ? sectionsForBeads : 1;
-
-            // Resize the per-article lists; when shrinking, the surviving lists are
-            // emptied in place, otherwise every slot receives a fresh list.
-            int originalSize = charactersByArticle.size();
-            charactersByArticle.setSize( numberOfArticleSections );
-            boolean reuseExistingLists = numberOfArticleSections < originalSize;
-            for( int section = 0; section < numberOfArticleSections; section++ )
-            {
-                if( reuseExistingLists )
-                {
-                    List existing = (List)charactersByArticle.get( section );
-                    existing.clear();
-                }
-                else
-                {
-                    charactersByArticle.set( section, new ArrayList() );
-                }
-            }
-
-            characterListMapping.clear();
-
-            long startProcess = System.currentTimeMillis();
-            processStream( page, page.findResources(), content );
-            long stopProcess = System.currentTimeMillis();
-
-            long startFlush = System.currentTimeMillis();
-            flushText();
-            long stopFlush = System.currentTimeMillis();
-
-            if( log.isDebugEnabled() )
-            {
-                log.debug( "processStream time=" + (stopProcess-startProcess) );
-                log.debug( "flushText time=" + (stopFlush-startFlush) );
-            }
-            endPage( page );
-        }
-
-        long stop = System.currentTimeMillis();
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "processPage() end time=" + (stop-start) );
-        }
-    }
-    
-    /**
-     * Start a new paragraph.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
-     * 
-     * @throws IOException If there is any error writing to the stream.
-     */
-    protected void startParagraph() throws IOException
-    {
-        // Intentionally empty: flushText() calls this hook before writing the
-        // text of each article section.
-    }
-    
-    /**
-     * End a paragraph.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
-     * 
-     * @throws IOException If there is any error writing to the stream.
-     */
-    protected void endParagraph() throws IOException
-    {
-        // Intentionally empty: flushText() calls this hook after writing the
-        // text of each article section.
-    }
-    
-    /**
-     * Start a new page.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
-     * 
-     * @param page The page we are about to process.
-     * 
-     * @throws IOException If there is any error writing to the stream.
-     */
-    protected void startPage( PDPage page ) throws IOException
-    {
-        // Intentionally empty: processPage() calls this hook before the page's
-        // content stream is processed.
-    }
-    
-    /**
-     * End a page.  Default implementation is to do nothing.  Subclasses
-     * may provide additional information.
-     * 
-     * @param page The page that has just been processed.
-     * 
-     * @throws IOException If there is any error writing to the stream.
-     */
-    protected void endPage( PDPage page ) throws IOException
-    {
-        // Intentionally empty: processPage() calls this hook after the page's
-        // text has been flushed to the output.
-    }
-
-    /**
-     * This will print the text to the output stream.
-     *
-     * @throws IOException If there is an error writing the text.
-     */
-    protected void flushText() throws IOException
-    {
-        // Writes every TextPosition collected in charactersByArticle to 'output',
-        // one article section after another, inserting lineSeparator when the
-        // vertical position changes and wordSeparator when a horizontal gap is
-        // wide enough.  pageSeparator is written once at the very end.
-        if( log.isDebugEnabled() )
-        {
-            log.debug( "flushText() start" );
-        }
-        // Y position of the line currently being written; -1 means "no line yet".
-        float currentY = -1;
-        // Font size of the last text that sat exactly on currentY; -1 means unknown.
-        float lastBaselineFontSize = -1;
-        if( log.isDebugEnabled() )
-        {
-            log.debug("<Starting text object list>");
-        }
-        // X coordinate where the previously written text ended; -1 means unknown.
-        float endOfLastTextX = -1;
-        // X coordinate beyond which the next text is considered a new word.
-        float startOfNextWordX = -1;
-        float lastWordSpacing = -1;
-        TextPosition lastProcessedCharacter = null;
-        
-        // Note: the tracking variables above are not reset between article sections.
-        for( int i=0; i<charactersByArticle.size(); i++)
-        {
-            startParagraph();
-            List textList = (List)charactersByArticle.get( i );
-            if( sortByPosition )
-            {
-                TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() );
-                Collections.sort( textList, comparator );
-            }
-            Iterator textIter = textList.iterator();
-            while( textIter.hasNext() )
-            {
-                TextPosition position = (TextPosition)textIter.next();
-                String characterValue = position.getCharacter();
-                
-                //wordSpacing = position.getWordSpacing();
-                // With the assignment above commented out, wordSpacing is always 0
-                // here, so the following block always runs.
-                float wordSpacing = 0;
-                
-                if( wordSpacing == 0 )
-                {
-                    //try to get width of a space character
-                    wordSpacing = position.getWidthOfSpace();
-                    //if still zero fall back to getting the width of the current
-                    //character
-                    if( wordSpacing == 0 )
-                    {
-                        wordSpacing = position.getWidth();
-                    }
-                }
-                
-                
-                // RDD - We add a conservative approximation for space determination.
-                // basically if there is a blank area between two characters that is
-                //equal to some percentage of the word spacing then that will be the
-                //start of the next word
-                // The threshold is half of the word spacing; once a previous spacing is
-                // known, the average of the previous and current spacing is used.
-                if( lastWordSpacing <= 0 )
-                {
-                    startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
-                }
-                else
-                {
-                    startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
-                }
-                
-                lastWordSpacing = wordSpacing;
-    
-                // RDD - We will suppress text that is very close to the current line
-                // and which overwrites previously rendered text on this line.
-                // This is done specifically to handle a reasonably common situation
-                // where an application (MS Word, in the case of my examples) renders
-                // text four times at small (1 point) offsets in order to accomplish
-                // bold printing.  You would not want to do this step if you were
-                // going to render the TextPosition objects graphically.
-                //
-                /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
-                    (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
-                {
-                    if (log.isDebugEnabled())
-                    {
-                        log.debug("Suppressing text overwrite" +
-                                  " x: " + position.getX() +
-                                  " endOfLastTextX: " + endOfLastTextX +
-                                  " string: " + position.getCharacter());
-                    }
-                    continue;
-                }*/
-    
-                // RDD - Here we determine whether this text object is on the current
-                // line.  We use the lastBaselineFontSize to handle the superscript
-                // case, and the size of the current font to handle the subscript case.
-                // Text must overlap with the last rendered baseline text by at least
-                // a small amount in order to be considered as being on the same line.
-                //
-                // verticalScaling flips the sign of both tolerances when either font
-                // size is negative (or the baseline font size is still unknown, -1).
-                int verticalScaling = 1;
-                if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
-                {
-                    verticalScaling = -1;
-                }
-                if (currentY != -1 &&
-                    ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
-                     (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
-                {
-                    if (log.isDebugEnabled())
-                    {
-                        log.debug("<newline currentY=" + currentY + ", y=" + position.getY() + 
-                                  " fs=" + position.getFontSize()+ " lb fs=" + lastBaselineFontSize + ">");
-                    }
-                    output.write(lineSeparator);
-                    // Start the new line with all horizontal/vertical tracking reset.
-                    endOfLastTextX = -1;
-                    startOfNextWordX = -1;
-                    currentY = -1;
-                    lastBaselineFontSize = -1;
-                }
-    
-                if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
-                   lastProcessedCharacter != null &&
-                   //only bother adding a space if the last character was not a space
-                   lastProcessedCharacter.getCharacter() != null &&
-                   !lastProcessedCharacter.getCharacter().endsWith( " " ) )
-                {
-                    if (log.isDebugEnabled())
-                    {
-                        log.debug("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">");
-                    }
-                    output.write( wordSeparator );
-                }
-    
-    
-                if (log.isDebugEnabled())
-                {
-                    log.debug("flushText" +
-                              " x=" + position.getX() +
-                              " y=" + position.getY() +
-                              " xScale=" + position.getXScale() +
-                              " yScale=" + position.getYScale() +
-                              " width=" + position.getWidth() +
-                              " currentY=" + currentY +
-                              " endOfLastTextX=" + endOfLastTextX +
-                              " startOfNextWordX=" + startOfNextWordX +
-                              " fontSize=" + position.getFontSize() +
-                              " wordSpacing=" + wordSpacing +
-                              " string=\"" + characterValue + "\"");
-                }
-    
-                // The first text of a line defines the line's Y position.
-                if (currentY == -1)
-                {
-                    currentY = position.getY();
-                }
-    
-                // Only text sitting exactly on the line's Y position updates the
-                // baseline font size.
-                if (currentY == position.getY())
-                {
-                    lastBaselineFontSize = position.getFontSize();
-                }
-    
-                // RDD - endX is what PDF considers to be the x coordinate of the
-                // end position of the text.  We use it in computing our metrics below.
-                //
-                endOfLastTextX = position.getX() + position.getWidth();
-    
-    
-                if (characterValue != null)
-                {
-                    output.write(characterValue);
-                }
-                else
-                {
-                    log.debug( "Position.getString() is null so not writing anything" );
-                }
-                lastProcessedCharacter = position;
-            }
-            endParagraph();
-        }
-        
-
-        // RDD - newline at end of flush - required for end of page (so that the top
-        // of the next page starts on its own line).
-        //
-        if( log.isDebugEnabled() )
-        {
-            log.debug("<newline endOfFlush=\"true\">");
-        }
-        output.write(pageSeparator);
-
-        output.flush();
-    }
-    
-    /**
-     * Write the string to the output stream.
-     *  
-     * @param text The text to write to the stream.
-     * @throws IOException If there is an error when writing the text.
-     */
-    protected void writeCharacters( TextPosition text ) throws IOException
-    {
-        // A TextPosition may carry no character string (flushText() guards against
-        // the same case); Writer.write(String) would throw a NullPointerException
-        // for null, so nothing is written then.
-        String characters = text.getCharacter();
-        if( characters != null )
-        {
-            output.write( characters );
-        }
-    }
-
-    /**
-     * This will determine if two floating point numbers are within a specified variance.
-     *
-     * @param first The first number to compare to.
-     * @param second The second number to compare to.
-     * @param variance The allowed variance.
-     */
-    private boolean within( float first, float second, float variance )
-    {
-        // 'second' must lie strictly inside the open interval
-        // (first - variance, first + variance).
-        float lowerBound = first - variance;
-        float upperBound = first + variance;
-        return lowerBound < second && second < upperBound;
-    }
-
-    /**
-     * This will add a character to the list of characters to be printed to
-     * the text file.
-     *
-     * @param text The description of the character to display.
-     */
-    protected void showCharacter( TextPosition text )
-    {
-        // Step 1: decide whether the text is a duplicate that must be suppressed.
-        // Step 2: if it is kept, file it under the article section it belongs to.
-        boolean showCharacter = true;
-        String textCharacter = text.getCharacter();
-        // Duplicate suppression needs the character string (its length feeds the
-        // tolerance below).  Text without a character string is kept as is instead
-        // of failing with a NullPointerException.
-        if( suppressDuplicateOverlappingText && textCharacter != null )
-        {
-            showCharacter = false;
-            float textX = text.getX();
-            float textY = text.getY();
-            // All previously accepted text with the same character string.
-            List sameTextCharacters = (List)characterListMapping.get( textCharacter );
-            if( sameTextCharacters == null )
-            {
-                sameTextCharacters = new ArrayList();
-                characterListMapping.put( textCharacter, sameTextCharacters );
-            }
-    
-            // Two pieces of text count as overlapping when both their x and y
-            // positions differ by less than a third of the average width of one
-            // character of this text.
-            boolean suppressCharacter = false;
-            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
-            // Stop scanning as soon as one overlapping duplicate has been found.
-            for( int i=0; i<sameTextCharacters.size() && !suppressCharacter; i++ )
-            {
-                TextPosition character = (TextPosition)sameTextCharacters.get( i );
-                String charCharacter = character.getCharacter();
-                float charX = character.getX();
-                float charY = character.getY();
-                
-                if( charCharacter != null &&
-                    within( charX, textX, tolerance ) &&
-                    within( charY, 
-                            textY, 
-                            tolerance ) )
-                {
-                    if( log.isDebugEnabled() )
-                    {
-                        log.debug("suppressText" +
-                                  " x=" + charX +
-                                  " y=" + charY +
-                                  " width=" + character.getWidth() +
-                                  " fontSize=" + character.getFontSize() +
-                                  " string=\"" + charCharacter + "\"");
-                    }
-                    suppressCharacter = true;
-                }
-            }
-            if( !suppressCharacter )
-            {
-                // Remember the text so later duplicates can be detected against it.
-                sameTextCharacters.add( text );
-                showCharacter = true;
-            }
-        }
-        
-        if( showCharacter )
-        {
-            //if we are showing the character then we need to determine which
-            //article it belongs to.
-            // Bead i owns section i*2+1; section i*2 holds the text in front of bead i.
-            int foundArticleDivisionIndex = -1;
-            int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
-            int notFoundButFirstLeftArticleDivisionIndex = -1;
-            int notFoundButFirstAboveArticleDivisionIndex = -1;
-            float x = text.getX();
-            float y = text.getY();
-            if( shouldSeparateByBeads )
-            {
-                for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
-                {
-                    PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
-                    if( bead != null )
-                    {
-                        PDRectangle rect = bead.getRectangle();
-                        if( rect.contains( x, y ) )
-                        {
-                            foundArticleDivisionIndex = i*2+1;
-                        }
-                        // NOTE(review): because of the '||' this branch already matches
-                        // text that is left of OR above the bead, and its index always
-                        // wins in the selection below, so the two following branches
-                        // never influence the result - confirm whether '&&' was intended.
-                        else if( (x < rect.getLowerLeftX() ||
-                                  y < rect.getUpperRightY()) &&
-                            notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
-                        {
-                            notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
-                        }
-                        else if( x < rect.getLowerLeftX() &&
-                                notFoundButFirstLeftArticleDivisionIndex == -1)
-                        {
-                            notFoundButFirstLeftArticleDivisionIndex = i*2;
-                        }
-                        else if( y < rect.getUpperRightY() &&
-                                notFoundButFirstAboveArticleDivisionIndex == -1)
-                        {
-                            notFoundButFirstAboveArticleDivisionIndex = i*2;
-                        }                        
-                    }
-                    else
-                    {
-                        // A missing bead sends the text to the first section.
-                        foundArticleDivisionIndex = 0;
-                    }
-                }
-            }
-            else
-            {
-                // Without bead separation there is only a single section.
-                foundArticleDivisionIndex = 0;
-            }
-            // Pick the best match in order of preference; text that matches nothing
-            // goes into the last section.
-            int articleDivisionIndex = -1;
-            if( foundArticleDivisionIndex != -1 )
-            {
-                articleDivisionIndex = foundArticleDivisionIndex;
-            }
-            else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
-            {
-                articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
-            }
-            else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
-            {
-                articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
-            }
-            else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
-            {
-                articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
-            }
-            else
-            {
-                articleDivisionIndex = charactersByArticle.size()-1;
-            }
-            List textList = (List) charactersByArticle.get( articleDivisionIndex );
-            textList.add( text );
-        }
-    }
-
-    /**
-     * Returns the first page that text extraction will process.  Page numbers
-     * are 1 based, so in a 5 page PDF document a start page of 1 extracts every
-     * page while a start page of 4 extracts only pages 4 and 5.  The default
-     * value is 1.
-     *
-     * @return The 1 based number of the first page to extract.
-     */
-    public int getStartPage()
-    {
-        return this.startPage;
-    }
-
-    /**
-     * Sets the first page that text extraction will process.
-     *
-     * @param startPageValue The number of the first page to extract.
-     */
-    public void setStartPage(int startPageValue)
-    {
-        this.startPage = startPageValue;
-    }
-
-    /**
-     * Returns the last page that text extraction will process.  The value is
-     * inclusive: for a 5 page PDF an end page of 5 extracts the entire document
-     * and an end page of 2 extracts pages 1 and 2.  The default value is
-     * Integer.MAX_VALUE so that every page of the PDF is extracted.
-     *
-     * @return The inclusive number of the last page to extract.
-     */
-    public int getEndPage()
-    {
-        return this.endPage;
-    }
-
-    /**
-     * Sets the last page, inclusive, that text extraction will process.
-     *
-     * @param endPageValue The number of the last page to extract.
-     */
-    public void setEndPage(int endPageValue)
-    {
-        this.endPage = endPageValue;
-    }
-
-    /**
-     * Sets the line separator used in the output text.  When no separator has
-     * been set explicitly through this method, the line.separator system
-     * property is used.
-     *
-     * @param separator The desired line separator string.
-     */
-    public void setLineSeparator(String separator)
-    {
-        this.lineSeparator = separator;
-    }
-
-    /**
-     * Returns the line separator used in the output text.
-     *
-     * @return The line separator string.
-     */
-    public String getLineSeparator()
-    {
-        return this.lineSeparator;
-    }
-
-    /**
-     * Sets the page separator used in the output text.  When no separator has
-     * been set explicitly through this method, the line.separator system
-     * property is used.
-     *
-     * @param separator The desired page separator string.
-     */
-    public void setPageSeparator(String separator)
-    {
-        this.pageSeparator = separator;
-    }
-
-    /**
-     * Returns the word separator used in the output text.
-     *
-     * @return The word separator string.
-     */
-    public String getWordSeparator()
-    {
-        return this.wordSeparator;
-    }
-
-    /**
-     * Sets the word separator used in the output text.  The PDFBox text
-     * extraction algorithm will output a space character if there is enough
-     * space between two words; a space character is the default.  If you need
-     * an accurate count of the characters found in a PDF document then you
-     * might want to set the word separator to the empty string.
-     *
-     * @param separator The desired word separator string.
-     */
-    public void setWordSeparator(String separator)
-    {
-        this.wordSeparator = separator;
-    }
-
-    /**
-     * Returns the page separator used in the output text.
-     *
-     * @return The page separator string.
-     */
-    public String getPageSeparator()
-    {
-        return this.pageSeparator;
-    }
-    /**
-     * @return true if the stripper suppresses duplicate overlapping text.
-     */
-    public boolean shouldSuppressDuplicateOverlappingText()
-    {
-        return this.suppressDuplicateOverlappingText;
-    }
-    
-    /**
-     * Returns the number of the page that is currently being processed.
-     *
-     * @return A 1 based number representing the current page.
-     */
-    protected int getCurrentPageNo()
-    {
-        return this.currentPageNo;
-    }
-
-    /**
-     * Returns the writer that the extracted output is being written to.
-     *
-     * @return The stream that output is being written to.
-     */
-    protected Writer getOutput()
-    {
-        return this.output;
-    }
-    
-    /**
-     * Returns the character strings grouped by article.  It is quite common for
-     * there to be only a single article.  The returned List contains List
-     * objects, and each inner list contains TextPosition objects.
-     *
-     * @return A double List of TextPositions for all text strings on the page.
-     */
-    protected List getCharactersByArticle()
-    {
-        return this.charactersByArticle;
-    }
-    
-    /**
-     * By default the text stripper will attempt to remove text that overlaps other text.
-     * Word paints the same character several times in order to make it look bold.  Setting
-     * this to false extracts all text, which means that certain sections will be
-     * duplicated, but better performance will be noticed.
-     *
-     * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
-     */
-    public void setSuppressDuplicateOverlappingText(
-            boolean suppressDuplicateOverlappingTextValue)
-    {
-        suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
-    }
-    
-    /**
-     * Tells whether the text stripper separates the text by beads.
-     *
-     * @return true if the text will be grouped by beads.
-     */
-    public boolean shouldSeparateByBeads()
-    {
-        return this.shouldSeparateByBeads;
-    }
-    
-    /**
-     * Sets whether the text stripper groups the text output by a list of beads.  The default value is true.
-     *
-     * @param aShouldSeparateByBeads The new grouping of beads.
-     */
-    public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
-    {
-        shouldSeparateByBeads = aShouldSeparateByBeads;
-    }
-    
-    /**
-     * Returns the bookmark, inclusive, at which text extraction ends.  The default is null.
-     *
-     * @return The ending bookmark.
-     */
-    public PDOutlineItem getEndBookmark()
-    {
-        return this.endBookmark;
-    }
-    
-    /**
-     * Sets the bookmark at which text extraction should stop.
-     *
-     * @param aEndBookmark The ending bookmark.
-     */
-    public void setEndBookmark(PDOutlineItem aEndBookmark)
-    {
-        this.endBookmark = aEndBookmark;
-    }
-    
-    /**
-     * Returns the bookmark, inclusive, at which text extraction starts.  The default is null.
-     *
-     * @return The starting bookmark.
-     */
-    public PDOutlineItem getStartBookmark()
-    {
-        return this.startBookmark;
-    }
-    
-    /**
-     * Sets the bookmark, inclusive, at which text extraction should start.
-     *
-     * @param aStartBookmark The starting bookmark.
-     */
-    public void setStartBookmark(PDOutlineItem aStartBookmark)
-    {
-        this.startBookmark = aStartBookmark;
-    }
-
-    /**
-     * Tells whether the text stripper sorts the text tokens by position
-     * before writing them to the stream.
-     *
-     * @return true if the text tokens will be sorted before being written.
-     */
-    public boolean shouldSortByPosition()
-    {
-        return this.sortByPosition;
-    }
-
-    /**
-     * The order of the text tokens in a PDF file may not be the same as the
-     * order in which they appear visually on the screen.  For example, a PDF
-     * writer may write out all text by font, so all bold or larger text, then
-     * make a second pass and write out the normal text.<br/>
-     * The default is to <b>not</b> sort by position.<br/>
-     * <br/>
-     * A PDF writer could choose to write each character in a different order.
-     * By default PDFBox does <b>not</b> sort the text tokens before processing
-     * them due to performance reasons.
-     *
-     * @param newSortByPosition Tell PDFBox to sort the text positions.
-     */
-    public void setSortByPosition(boolean newSortByPosition)
-    {
-        this.sortByPosition = newSortByPosition;
-    }
-    /**
-     * Sets the rotation of the first page.
-     *
-     * @param rotation The rotation of the first page, one of 0, 90, 180 or 270.
-     */
-    public void setFirstPageRotation(int rotation)
-    {
-        first_page_rotation = rotation;
-    }
-    /**
-     * Sets whether the rotation of the first handled page should be considered.
-     * @param newFirstPageRotationFromThis The new value of the flag.
-     */
-    public void setGetFirstPageRotationFromThis(boolean newFirstPageRotationFromThis)
-    {
-        this.getFirstPageRotationFromThis = newFirstPageRotationFromThis;
-    }
-}
-\ No newline at end of file