about summary refs log tree commit diff
path: root/src/main/java/org/pdfbox/util/PDFTextStripper.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/util/PDFTextStripper.java')
-rw-r--r-- src/main/java/org/pdfbox/util/PDFTextStripper.java 1062
1 files changed, 0 insertions, 1062 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFTextStripper.java b/src/main/java/org/pdfbox/util/PDFTextStripper.java
deleted file mode 100644
index 62efb64..0000000
--- a/src/main/java/org/pdfbox/util/PDFTextStripper.java
+++ /dev/null
@@ -1,1062 +0,0 @@
-/**
- * Copyright (c) 2003-2005, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.util;
-
-import java.io.IOException;
-import java.io.StringWriter;
-import java.io.Writer;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Vector;
-
-import org.pdfbox.cos.COSDocument;
-import org.pdfbox.cos.COSStream;
-
-import org.pdfbox.pdmodel.PDDocument;
-import org.pdfbox.pdmodel.PDPage;
-
-import org.pdfbox.pdmodel.common.PDRectangle;
-import org.pdfbox.pdmodel.common.PDStream;
-
-import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
-import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
-import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-
-import org.pdfbox.exceptions.CryptographyException;
-import org.pdfbox.exceptions.InvalidPasswordException;
-
-import org.apache.log4j.Logger;
-
-
-/**
- * This class will take a pdf document and strip out all of the text and ignore the
- * formatting and such.
- *
- * @author Ben Litchfield (ben@benlitchfield.com)
- * @version $Revision: 1.62 $
- */
-public class PDFTextStripper extends PDFStreamEngine
-{
- private static Logger log = Logger.getLogger(PDFTextStripper.class);
- private int first_page_rotation = 0;
- private boolean getFirstPageRotationFromThis = false;
- private int currentPageNo = 0;
- private int startPage = 1;
- private int endPage = Integer.MAX_VALUE;
- private PDOutlineItem startBookmark = null;
- private int startBookmarkPageNumber = -1;
- private PDOutlineItem endBookmark = null;
- private int endBookmarkPageNumber = -1;
- private PDDocument document;
- private boolean suppressDuplicateOverlappingText = true;
- private boolean shouldSeparateByBeads = true;
- private boolean sortByPosition = false;
-
- private List pageArticles = null;
- /**
- * The charactersByArticle is used to extract text by article divisions. For example
- * a PDF that has two columns like a newspaper, we want to extract the first column and
- * then the second column. In this example the PDF would have 2 beads(or articles), one for
- * each column. The size of the charactersByArticle would be 5, because not all text on the
- * screen will fall into one of the articles. The five divisions are shown below
- *
- * Text before first article
- * first article text
- * text between first article and second article
- * second article text
- * text after second article
- *
- * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
- */
- protected Vector charactersByArticle = new Vector();
-
- private Map characterListMapping = new HashMap();
-
- private String lineSeparator = System.getProperty("line.separator");
- private String pageSeparator = System.getProperty("line.separator");
- private String wordSeparator = " ";
-
- /**
- * The stream to write the output to.
- */
- protected Writer output;
-
- /**
- * Instantiate a new PDFTextStripper object. This object will load properties from
- * Resources/PDFTextStripper.properties.
- * @throws IOException If there is an error loading the properties.
- */
- public PDFTextStripper() throws IOException
- {
- super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
- }
-
- /**
- * This will return the text of a document. See writeText. <br />
- * NOTE: The document must not be encrypted when coming into this method.
- *
- * @param doc The document to get the text from.
- *
- * @return The text of the PDF document.
- *
- * @throws IOException if the doc state is invalid or it is encrypted.
- */
- public String getText( PDDocument doc ) throws IOException
- {
- StringWriter outputStream = new StringWriter();
- writeText( doc, outputStream );
- return outputStream.toString();
- }
-
- /**
- * @deprecated
- * @see PDFTextStripper#getText( PDDocument )
- * @param doc The document to extract the text from.
- * @return The document text.
- * @throws IOException If there is an error extracting the text.
- */
- public String getText( COSDocument doc ) throws IOException
- {
- return getText( new PDDocument( doc ) );
- }
-
- /**
- * @deprecated
- * @see PDFTextStripper#writeText( PDDocument, Writer )
- * @param doc The document to extract the text.
- * @param outputStream The stream to write the text to.
- * @throws IOException If there is an error extracting the text.
- */
- public void writeText( COSDocument doc, Writer outputStream ) throws IOException
- {
- writeText( new PDDocument( doc ), outputStream );
- }
-
- /**
- * This will take a PDDocument and write the text of that document to the print writer.
- *
- * @param doc The document to get the data from.
- * @param outputStream The location to put the text.
- *
- * @throws IOException If the doc is in an invalid state.
- */
- public void writeText( PDDocument doc, Writer outputStream ) throws IOException
- {
-
- PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary();
-
- //only care about standard encryption and if it was decrypted with the
- //user password
- if( encDictionary instanceof PDStandardEncryption &&
- !doc.wasDecryptedWithOwnerPassword() )
- {
- PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary;
- if( !stdEncryption.canExtractContent() )
- {
- throw new IOException( "You do not have permission to extract text" );
- }
- }
- currentPageNo = 0;
- document = doc;
- output = outputStream;
- startDocument(document);
-
- if( document.isEncrypted() )
- {
- // We are expecting non-encrypted documents here, but it is common
- // for users to pass in a document that is encrypted with an empty
- // password (such a document appears to not be encrypted by
- // someone viewing the document, thus the confusion). We will
- // attempt to decrypt with the empty password to handle this case.
- //
- log.debug("Document is encrypted, decrypting with empty password");
- try
- {
- document.decrypt("");
- }
- catch (CryptographyException e)
- {
- throw new IOException("Error decrypting document, details: " + e.getMessage());
- }
- catch (InvalidPasswordException e)
- {
- throw new IOException("Error: document is encrypted");
- }
- }
-
- processPages( document.getDocumentCatalog().getAllPages() );
- endDocument(document);
- }
-
- /**
- * This will process all of the pages and the text that is in them.
- *
- * @param pages The pages object in the document.
- *
- * @throws IOException If there is an error parsing the text.
- */
- protected void processPages( List pages ) throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug( "processPages( " + pages + " )" );
- }
-
- if( startBookmark != null )
- {
- startBookmarkPageNumber = getPageNumber( startBookmark, pages );
- }
-
- if( endBookmark != null )
- {
- endBookmarkPageNumber = getPageNumber( endBookmark, pages );
- }
-
- if( startBookmarkPageNumber == -1 && startBookmark != null &&
- endBookmarkPageNumber == -1 && endBookmark != null &&
- startBookmark.getCOSObject() == endBookmark.getCOSObject() )
- {
- //this is a special case where both the start and end bookmark
- //are the same but point to nothing. In this case
- //we will not extract any text.
- startBookmarkPageNumber = 0;
- endBookmarkPageNumber = 0;
- }
-
-
- Iterator pageIter = pages.iterator();
- while( pageIter.hasNext() )
- {
- PDPage nextPage = (PDPage)pageIter.next();
- PDStream contentStream = nextPage.getContents();
- if( contentStream != null )
- {
- COSStream contents = contentStream.getStream();
- processPage( nextPage, contents );
- }
- }
- if( log.isDebugEnabled() )
- {
- log.debug( "processPages() end" );
- }
- }
-
- private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException
- {
- int pageNumber = -1;
- PDPage page = bookmark.findDestinationPage( document );
- if( page != null )
- {
- pageNumber = allPages.indexOf( page )+1;//use one based indexing
- }
- return pageNumber;
- }
-
- /**
- * This method is available for subclasses of this class. It will be called before processing
- * of the document start.
- *
- * @param pdf The PDF document that is being processed.
- * @throws IOException If an IO error occurs.
- */
- protected void startDocument(PDDocument pdf) throws IOException
- {
- // no default implementation, but available for subclasses
- }
-
- /**
- * This method is available for subclasses of this class. It will be called after processing
- * of the document finishes.
- *
- * @param pdf The PDF document that is being processed.
- * @throws IOException If an IO error occurs.
- */
- protected void endDocument(PDDocument pdf ) throws IOException
- {
- // no default implementation, but available for subclasses
- }
-
- /**
- * This will process the contents of a page.
- *
- * @param page The page to process.
- * @param content The contents of the page.
- *
- * @throws IOException If there is an error processing the page.
- */
- protected void processPage( PDPage page, COSStream content ) throws IOException
- {
- long start = System.currentTimeMillis();
- // System.out.println( "mruhmer processPage page.findRotation()"+page.findRotation());
- // System.out.println( "mruhmer processPage currentPageNo="+currentPageNo);
- // System.out.println( "mruhmer processPage page.getRotation()"+page.getRotation());
-
- if((currentPageNo==0) )
- {
- if (getFirstPageRotationFromThis)
- {
- page.setRotation(first_page_rotation);
- }
- }
- if( log.isDebugEnabled() )
- {
- log.debug( "processPage( " + page + ", " + content + " )" );
- }
- currentPageNo++;
- if( currentPageNo >= startPage && currentPageNo <= endPage &&
- (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
- (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
- {
- startPage( page );
- pageArticles = page.getThreadBeads();
- int numberOfArticleSections = 1 + pageArticles.size() * 2;
- if( !shouldSeparateByBeads )
- {
- numberOfArticleSections = 1;
- }
- int originalSize = charactersByArticle.size();
- charactersByArticle.setSize( numberOfArticleSections );
- for( int i=0; i<numberOfArticleSections; i++ )
- {
- if( numberOfArticleSections < originalSize )
- {
- ((List)charactersByArticle.get( i )).clear();
- }
- else
- {
- charactersByArticle.set( i, new ArrayList() );
- }
- }
-
- characterListMapping.clear();
- long startProcess = System.currentTimeMillis();
- processStream( page, page.findResources(), content );
- long stopProcess = System.currentTimeMillis();
- long startFlush = System.currentTimeMillis();
- flushText();
- long stopFlush = System.currentTimeMillis();
- if( log.isDebugEnabled() )
- {
- log.debug( "processStream time=" + (stopProcess-startProcess) );
- log.debug( "flushText time=" + (stopFlush-startFlush) );
- }
- endPage( page );
- }
- long stop = System.currentTimeMillis();
- if( log.isDebugEnabled() )
- {
- log.debug( "processPage() end time=" + (stop-start) );
- }
-
- }
-
- /**
- * Start a new paragraph. Default implementation is to do nothing. Subclasses
- * may provide additional information.
- *
- * @throws IOException If there is any error writing to the stream.
- */
- protected void startParagraph() throws IOException
- {
- //default is to do nothing.
- }
-
- /**
- * End a paragraph. Default implementation is to do nothing. Subclasses
- * may provide additional information.
- *
- * @throws IOException If there is any error writing to the stream.
- */
- protected void endParagraph() throws IOException
- {
- //default is to do nothing
- }
-
- /**
- * Start a new page. Default implementation is to do nothing. Subclasses
- * may provide additional information.
- *
- * @param page The page we are about to process.
- *
- * @throws IOException If there is any error writing to the stream.
- */
- protected void startPage( PDPage page ) throws IOException
- {
- //default is to do nothing.
- }
-
- /**
- * End a page. Default implementation is to do nothing. Subclasses
- * may provide additional information.
- *
- * @param page The page we are about to process.
- *
- * @throws IOException If there is any error writing to the stream.
- */
- protected void endPage( PDPage page ) throws IOException
- {
- //default is to do nothing
- }
-
- /**
- * This will print the text to the output stream.
- *
- * @throws IOException If there is an error writing the text.
- */
- protected void flushText() throws IOException
- {
- if( log.isDebugEnabled() )
- {
- log.debug( "flushText() start" );
- }
- float currentY = -1;
- float lastBaselineFontSize = -1;
- if( log.isDebugEnabled() )
- {
- log.debug("<Starting text object list>");
- }
- float endOfLastTextX = -1;
- float startOfNextWordX = -1;
- float lastWordSpacing = -1;
- TextPosition lastProcessedCharacter = null;
-
- for( int i=0; i<charactersByArticle.size(); i++)
- {
- startParagraph();
- List textList = (List)charactersByArticle.get( i );
- if( sortByPosition )
- {
- TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() );
- Collections.sort( textList, comparator );
- }
- Iterator textIter = textList.iterator();
- while( textIter.hasNext() )
- {
- TextPosition position = (TextPosition)textIter.next();
- String characterValue = position.getCharacter();
-
- //wordSpacing = position.getWordSpacing();
- float wordSpacing = 0;
-
- if( wordSpacing == 0 )
- {
- //try to get width of a space character
- wordSpacing = position.getWidthOfSpace();
- //if still zero fall back to getting the width of the current
- //character
- if( wordSpacing == 0 )
- {
- wordSpacing = position.getWidth();
- }
- }
-
-
- // RDD - We add a conservative approximation for space determination.
- // basically if there is a blank area between two characters that is
- //equal to some percentage of the word spacing then that will be the
- //start of the next word
- if( lastWordSpacing <= 0 )
- {
- startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
- }
- else
- {
- startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
- }
-
- lastWordSpacing = wordSpacing;
-
- // RDD - We will suppress text that is very close to the current line
- // and which overwrites previously rendered text on this line.
- // This is done specifically to handle a reasonably common situation
- // where an application (MS Word, in the case of my examples) renders
- // text four times at small (1 point) offsets in order to accomplish
- // bold printing. You would not want to do this step if you were
- // going to render the TextPosition objects graphically.
- //
- /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
- (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
- {
- if (log.isDebugEnabled())
- {
- log.debug("Suppressing text overwrite" +
- " x: " + position.getX() +
- " endOfLastTextX: " + endOfLastTextX +
- " string: " + position.getCharacter());
- }
- continue;
- }*/
-
- // RDD - Here we determine whether this text object is on the current
- // line. We use the lastBaselineFontSize to handle the superscript
- // case, and the size of the current font to handle the subscript case.
- // Text must overlap with the last rendered baseline text by at least
- // a small amount in order to be considered as being on the same line.
- //
- int verticalScaling = 1;
- if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
- {
- verticalScaling = -1;
- }
- if (currentY != -1 &&
- ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
- (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
- {
- if (log.isDebugEnabled())
- {
- log.debug("<newline currentY=" + currentY + ", y=" + position.getY() +
- " fs=" + position.getFontSize()+ " lb fs=" + lastBaselineFontSize + ">");
- }
- output.write(lineSeparator);
- endOfLastTextX = -1;
- startOfNextWordX = -1;
- currentY = -1;
- lastBaselineFontSize = -1;
- }
-
- if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
- lastProcessedCharacter != null &&
- //only bother adding a space if the last character was not a space
- lastProcessedCharacter.getCharacter() != null &&
- !lastProcessedCharacter.getCharacter().endsWith( " " ) )
- {
- if (log.isDebugEnabled())
- {
- log.debug("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">");
- }
- output.write( wordSeparator );
- }
-
-
- if (log.isDebugEnabled())
- {
- log.debug("flushText" +
- " x=" + position.getX() +
- " y=" + position.getY() +
- " xScale=" + position.getXScale() +
- " yScale=" + position.getYScale() +
- " width=" + position.getWidth() +
- " currentY=" + currentY +
- " endOfLastTextX=" + endOfLastTextX +
- " startOfNextWordX=" + startOfNextWordX +
- " fontSize=" + position.getFontSize() +
- " wordSpacing=" + wordSpacing +
- " string=\"" + characterValue + "\"");
- }
-
- if (currentY == -1)
- {
- currentY = position.getY();
- }
-
- if (currentY == position.getY())
- {
- lastBaselineFontSize = position.getFontSize();
- }
-
- // RDD - endX is what PDF considers to be the x coordinate of the
- // end position of the text. We use it in computing our metrics below.
- //
- endOfLastTextX = position.getX() + position.getWidth();
-
-
- if (characterValue != null)
- {
- output.write(characterValue);
- }
- else
- {
- log.debug( "Position.getString() is null so not writing anything" );
- }
- lastProcessedCharacter = position;
- }
- endParagraph();
- }
-
-
- // RDD - newline at end of flush - required for end of page (so that the top
- // of the next page starts on its own line.
- //
- if( log.isDebugEnabled() )
- {
- log.debug("<newline endOfFlush=\"true\">");
- }
- output.write(pageSeparator);
-
- output.flush();
- }
-
- /**
- * Write the string to the output stream.
- *
- * @param text The text to write to the stream.
- * @throws IOException If there is an error when writing the text.
- */
- protected void writeCharacters( TextPosition text ) throws IOException
- {
- output.write( text.getCharacter() );
- }
-
- /**
- * This will determine of two floating point numbers are within a specified variance.
- *
- * @param first The first number to compare to.
- * @param second The second number to compare to.
- * @param variance The allowed variance.
- */
- private boolean within( float first, float second, float variance )
- {
- return second > first - variance && second < first + variance;
- }
-
- /**
- * This will show add a character to the list of characters to be printed to
- * the text file.
- *
- * @param text The description of the character to display.
- */
- protected void showCharacter( TextPosition text )
- {
- boolean showCharacter = true;
- if( suppressDuplicateOverlappingText )
- {
- showCharacter = false;
- String textCharacter = text.getCharacter();
- float textX = text.getX();
- float textY = text.getY();
- List sameTextCharacters = (List)characterListMapping.get( textCharacter );
- if( sameTextCharacters == null )
- {
- sameTextCharacters = new ArrayList();
- characterListMapping.put( textCharacter, sameTextCharacters );
- }
-
- // RDD - Here we compute the value that represents the end of the rendered
- // text. This value is used to determine whether subsequent text rendered
- // on the same line overwrites the current text.
- //
- // We subtract any positive padding to handle cases where extreme amounts
- // of padding are applied, then backed off (not sure why this is done, but there
- // are cases where the padding is on the order of 10x the character width, and
- // the TJ just backs up to compensate after each character). Also, we subtract
- // an amount to allow for kerning (a percentage of the width of the last
- // character).
- //
- boolean suppressCharacter = false;
- float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
- for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
- {
- TextPosition character = (TextPosition)sameTextCharacters.get( i );
- String charCharacter = character.getCharacter();
- float charX = character.getX();
- float charY = character.getY();
- //only want to suppress
-
- if( charCharacter != null &&
- //charCharacter.equals( textCharacter ) &&
- within( charX, textX, tolerance ) &&
- within( charY,
- textY,
- tolerance ) )
- {
- if( log.isDebugEnabled() )
- {
- log.debug("suppressText" +
- " x=" + charX +
- " y=" + charY +
- " width=" + character.getWidth() +
- " fontSize=" + character.getFontSize() +
- " string=\"" + charCharacter + "\"");
- }
- suppressCharacter = true;
- }
- }
- if( !suppressCharacter )
- {
- sameTextCharacters.add( text );
- showCharacter = true;
- }
- }
-
- if( showCharacter )
- {
- //if we are showing the character then we need to determine which
- //article it belongs to.
- int foundArticleDivisionIndex = -1;
- int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
- int notFoundButFirstLeftArticleDivisionIndex = -1;
- int notFoundButFirstAboveArticleDivisionIndex = -1;
- float x = text.getX();
- float y = text.getY();
- if( shouldSeparateByBeads )
- {
- for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
- {
- PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
- if( bead != null )
- {
- PDRectangle rect = bead.getRectangle();
- if( rect.contains( x, y ) )
- {
- foundArticleDivisionIndex = i*2+1;
- }
- else if( (x < rect.getLowerLeftX() ||
- y < rect.getUpperRightY()) &&
- notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
- {
- notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
- }
- else if( x < rect.getLowerLeftX() &&
- notFoundButFirstLeftArticleDivisionIndex == -1)
- {
- notFoundButFirstLeftArticleDivisionIndex = i*2;
- }
- else if( y < rect.getUpperRightY() &&
- notFoundButFirstAboveArticleDivisionIndex == -1)
- {
- notFoundButFirstAboveArticleDivisionIndex = i*2;
- }
- }
- else
- {
- foundArticleDivisionIndex = 0;
- }
- }
- }
- else
- {
- foundArticleDivisionIndex = 0;
- }
- int articleDivisionIndex = -1;
- if( foundArticleDivisionIndex != -1 )
- {
- articleDivisionIndex = foundArticleDivisionIndex;
- }
- else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
- {
- articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
- }
- else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
- {
- articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
- }
- else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
- {
- articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
- }
- else
- {
- articleDivisionIndex = charactersByArticle.size()-1;
- }
- List textList = (List) charactersByArticle.get( articleDivisionIndex );
- textList.add( text );
- }
- }
-
- /**
- * This is the page that the text extraction will start on. The pages start
- * at page 1. For example in a 5 page PDF document, if the start page is 1
- * then all pages will be extracted. If the start page is 4 then pages 4 and 5
- * will be extracted. The default value is 1.
- *
- * @return Value of property startPage.
- */
- public int getStartPage()
- {
- return startPage;
- }
-
- /**
- * This will set the first page to be extracted by this class.
- *
- * @param startPageValue New value of property startPage.
- */
- public void setStartPage(int startPageValue)
- {
- startPage = startPageValue;
- }
-
- /**
- * This will get the last page that will be extracted. This is inclusive,
- * for example if a 5 page PDF an endPage value of 5 would extract the
- * entire document, an end page of 2 would extract pages 1 and 2. This defaults
- * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
- *
- * @return Value of property endPage.
- */
- public int getEndPage()
- {
- return endPage;
- }
-
- /**
- * This will set the last page to be extracted by this class.
- *
- * @param endPageValue New value of property endPage.
- */
- public void setEndPage(int endPageValue)
- {
- endPage = endPageValue;
- }
-
- /**
- * Set the desired line separator for output text. The line.separator
- * system property is used if the line separator preference is not set
- * explicitly using this method.
- *
- * @param separator The desired line separator string.
- */
- public void setLineSeparator(String separator)
- {
- lineSeparator = separator;
- }
-
- /**
- * This will get the line separator.
- *
- * @return The desired line separator string.
- */
- public String getLineSeparator()
- {
- return lineSeparator;
- }
-
- /**
- * Set the desired page separator for output text. The line.separator
- * system property is used if the page separator preference is not set
- * explicitly using this method.
- *
- * @param separator The desired page separator string.
- */
- public void setPageSeparator(String separator)
- {
- pageSeparator = separator;
- }
-
- /**
- * This will get the word separator.
- *
- * @return The desired word separator string.
- */
- public String getWordSeparator()
- {
- return wordSeparator;
- }
-
- /**
- * Set the desired word separator for output text. The PDFBox text extraction
- * algorithm will output a space character if there is enough space between
- * two words. By default a space character is used. If you need and accurate
- * count of characters that are found in a PDF document then you might want to
- * set the word separator to the empty string.
- *
- * @param separator The desired page separator string.
- */
- public void setWordSeparator(String separator)
- {
- wordSeparator = separator;
- }
-
- /**
- * This will get the page separator.
- *
- * @return The page separator string.
- */
- public String getPageSeparator()
- {
- return pageSeparator;
- }
- /**
- * @return Returns the suppressDuplicateOverlappingText.
- */
- public boolean shouldSuppressDuplicateOverlappingText()
- {
- return suppressDuplicateOverlappingText;
- }
-
- /**
- * Get the current page number that is being processed.
- *
- * @return A 1 based number representing the current page.
- */
- protected int getCurrentPageNo()
- {
- return currentPageNo;
- }
-
- /**
- * The output stream that is being written to.
- *
- * @return The stream that output is being written to.
- */
- protected Writer getOutput()
- {
- return output;
- }
-
- /**
- * Character strings are grouped by articles. It is quite common that there
- * will only be a single article. This returns a List that contains List objects,
- * the inner lists will contain TextPosition objects.
- *
- * @return A double List of TextPositions for all text strings on the page.
- */
- protected List getCharactersByArticle()
- {
- return charactersByArticle;
- }
-
- /**
- * By default the text stripper will attempt to remove text that overlapps each other.
- * Word paints the same character several times in order to make it look bold. By setting
- * this to false all text will be extracted, which means that certain sections will be
- * duplicated, but better performance will be noticed.
- *
- * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
- */
- public void setSuppressDuplicateOverlappingText(
- boolean suppressDuplicateOverlappingTextValue)
- {
- this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
- }
-
- /**
- * This will tell if the text stripper should separate by beads.
- *
- * @return If the text will be grouped by beads.
- */
- public boolean shouldSeparateByBeads()
- {
- return shouldSeparateByBeads;
- }
-
- /**
- * Set if the text stripper should group the text output by a list of beads. The default value is true!
- *
- * @param aShouldSeparateByBeads The new grouping of beads.
- */
- public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
- {
- this.shouldSeparateByBeads = aShouldSeparateByBeads;
- }
-
- /**
- * Get the bookmark where text extraction should end, inclusive. Default is null.
- *
- * @return The ending bookmark.
- */
- public PDOutlineItem getEndBookmark()
- {
- return endBookmark;
- }
-
- /**
- * Set the bookmark where the text extraction should stop.
- *
- * @param aEndBookmark The ending bookmark.
- */
- public void setEndBookmark(PDOutlineItem aEndBookmark)
- {
- endBookmark = aEndBookmark;
- }
-
- /**
- * Get the bookmark where text extraction should start, inclusive. Default is null.
- *
- * @return The starting bookmark.
- */
- public PDOutlineItem getStartBookmark()
- {
- return startBookmark;
- }
-
- /**
- * Set the bookmark where text extraction should start, inclusive.
- *
- * @param aStartBookmark The starting bookmark.
- */
- public void setStartBookmark(PDOutlineItem aStartBookmark)
- {
- startBookmark = aStartBookmark;
- }
-
- /**
- * This will tell if the text stripper should sort the text tokens
- * before writing to the stream.
- *
- * @return true If the text tokens will be sorted before being written.
- */
- public boolean shouldSortByPosition()
- {
- return sortByPosition;
- }
-
- /**
- * The order of the text tokens in a PDF file may not be in the same
- * as they appear visually on the screen. For example, a PDF writer may
- * write out all text by font, so all bold or larger text, then make a second
- * pass and write out the normal text.<br/>
- * The default is to <b>not</b> sort by position.<br/>
- * <br/>
- * A PDF writer could choose to write each character in a different order. By
- * default PDFBox does <b>not</b> sort the text tokens before processing them due to
- * performance reasons.
- *
- * @param newSortByPosition Tell PDFBox to sort the text positions.
- */
- public void setSortByPosition(boolean newSortByPosition)
- {
- sortByPosition = newSortByPosition;
- }
- /**
- * the rotation of the first page 0|90|180|270
- * @param Rotation
- *
- */
- public void setFirstPageRotation(int Rotation)
- {
- first_page_rotation = Rotation;
- }
- /**
- * tells if rotation of first handled page should be concerned
- * @param newFirstPageRotationFromThis boolean
- */
- public void setGetFirstPageRotationFromThis(boolean newFirstPageRotationFromThis)
- {
- getFirstPageRotationFromThis = newFirstPageRotationFromThis;
- }
-} \ No newline at end of file