/** * Copyright (c) 2003-2005, www.pdfbox.org * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the name of pdfbox; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://www.pdfbox.org * */ package org.pdfbox.util; import java.io.IOException; import java.io.StringWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Vector; import org.pdfbox.cos.COSDocument; import org.pdfbox.cos.COSStream; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDPage; import org.pdfbox.pdmodel.common.PDRectangle; import org.pdfbox.pdmodel.common.PDStream; import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary; import org.pdfbox.pdmodel.encryption.PDStandardEncryption; import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import org.pdfbox.exceptions.CryptographyException; import org.pdfbox.exceptions.InvalidPasswordException; import org.apache.log4j.Logger; /** * This class will take a pdf document and strip out all of the text and ignore the * formatting and such. * * @author Ben Litchfield (ben@benlitchfield.com) * @version $Revision: 1.62 $ */ public class PDFTextStripper extends PDFStreamEngine { private static Logger log = Logger.getLogger(PDFTextStripper.class); private int first_page_rotation = 0; private boolean getFirstPageRotationFromThis = false; private int currentPageNo = 0; private int startPage = 1; private int endPage = Integer.MAX_VALUE; private PDOutlineItem startBookmark = null; private int startBookmarkPageNumber = -1; private PDOutlineItem endBookmark = null; private int endBookmarkPageNumber = -1; private PDDocument document; private boolean suppressDuplicateOverlappingText = true; private boolean shouldSeparateByBeads = true; private boolean sortByPosition = false; private List pageArticles = null; /** * The charactersByArticle is used to extract text by article divisions. For example * a PDF that has two columns like a newspaper, we want to extract the first column and * then the second column. In this example the PDF would have 2 beads(or articles), one for * each column. The size of the charactersByArticle would be 5, because not all text on the * screen will fall into one of the articles. The five divisions are shown below * * Text before first article * first article text * text between first article and second article * second article text * text after second article * * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. */ protected Vector charactersByArticle = new Vector(); private Map characterListMapping = new HashMap(); private String lineSeparator = System.getProperty("line.separator"); private String pageSeparator = System.getProperty("line.separator"); private String wordSeparator = " "; /** * The stream to write the output to. */ protected Writer output; /** * Instantiate a new PDFTextStripper object. This object will load properties from * Resources/PDFTextStripper.properties. * @throws IOException If there is an error loading the properties. */ public PDFTextStripper() throws IOException { super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) ); } /** * This will return the text of a document. See writeText. <br /> * NOTE: The document must not be encrypted when coming into this method. * * @param doc The document to get the text from. * * @return The text of the PDF document. * * @throws IOException if the doc state is invalid or it is encrypted. */ public String getText( PDDocument doc ) throws IOException { StringWriter outputStream = new StringWriter(); writeText( doc, outputStream ); return outputStream.toString(); } /** * @deprecated * @see PDFTextStripper#getText( PDDocument ) * @param doc The document to extract the text from. * @return The document text. * @throws IOException If there is an error extracting the text. */ public String getText( COSDocument doc ) throws IOException { return getText( new PDDocument( doc ) ); } /** * @deprecated * @see PDFTextStripper#writeText( PDDocument, Writer ) * @param doc The document to extract the text. * @param outputStream The stream to write the text to. * @throws IOException If there is an error extracting the text. */ public void writeText( COSDocument doc, Writer outputStream ) throws IOException { writeText( new PDDocument( doc ), outputStream ); } /** * This will take a PDDocument and write the text of that document to the print writer. * * @param doc The document to get the data from. * @param outputStream The location to put the text. * * @throws IOException If the doc is in an invalid state. */ public void writeText( PDDocument doc, Writer outputStream ) throws IOException { PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary(); //only care about standard encryption and if it was decrypted with the //user password if( encDictionary instanceof PDStandardEncryption && !doc.wasDecryptedWithOwnerPassword() ) { PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary; if( !stdEncryption.canExtractContent() ) { throw new IOException( "You do not have permission to extract text" ); } } currentPageNo = 0; document = doc; output = outputStream; startDocument(document); if( document.isEncrypted() ) { // We are expecting non-encrypted documents here, but it is common // for users to pass in a document that is encrypted with an empty // password (such a document appears to not be encrypted by // someone viewing the document, thus the confusion). We will // attempt to decrypt with the empty password to handle this case. // log.debug("Document is encrypted, decrypting with empty password"); try { document.decrypt(""); } catch (CryptographyException e) { throw new IOException("Error decrypting document, details: " + e.getMessage()); } catch (InvalidPasswordException e) { throw new IOException("Error: document is encrypted"); } } processPages( document.getDocumentCatalog().getAllPages() ); endDocument(document); } /** * This will process all of the pages and the text that is in them. * * @param pages The pages object in the document. * * @throws IOException If there is an error parsing the text. */ protected void processPages( List pages ) throws IOException { if( log.isDebugEnabled() ) { log.debug( "processPages( " + pages + " )" ); } if( startBookmark != null ) { startBookmarkPageNumber = getPageNumber( startBookmark, pages ); } if( endBookmark != null ) { endBookmarkPageNumber = getPageNumber( endBookmark, pages ); } if( startBookmarkPageNumber == -1 && startBookmark != null && endBookmarkPageNumber == -1 && endBookmark != null && startBookmark.getCOSObject() == endBookmark.getCOSObject() ) { //this is a special case where both the start and end bookmark //are the same but point to nothing. In this case //we will not extract any text. startBookmarkPageNumber = 0; endBookmarkPageNumber = 0; } Iterator pageIter = pages.iterator(); while( pageIter.hasNext() ) { PDPage nextPage = (PDPage)pageIter.next(); PDStream contentStream = nextPage.getContents(); if( contentStream != null ) { COSStream contents = contentStream.getStream(); processPage( nextPage, contents ); } } if( log.isDebugEnabled() ) { log.debug( "processPages() end" ); } } private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException { int pageNumber = -1; PDPage page = bookmark.findDestinationPage( document ); if( page != null ) { pageNumber = allPages.indexOf( page )+1;//use one based indexing } return pageNumber; } /** * This method is available for subclasses of this class. It will be called before processing * of the document start. * * @param pdf The PDF document that is being processed. * @throws IOException If an IO error occurs. */ protected void startDocument(PDDocument pdf) throws IOException { // no default implementation, but available for subclasses } /** * This method is available for subclasses of this class. It will be called after processing * of the document finishes. * * @param pdf The PDF document that is being processed. * @throws IOException If an IO error occurs. */ protected void endDocument(PDDocument pdf ) throws IOException { // no default implementation, but available for subclasses } /** * This will process the contents of a page. * * @param page The page to process. * @param content The contents of the page. * * @throws IOException If there is an error processing the page. */ protected void processPage( PDPage page, COSStream content ) throws IOException { long start = System.currentTimeMillis(); // System.out.println( "mruhmer processPage page.findRotation()"+page.findRotation()); // System.out.println( "mruhmer processPage currentPageNo="+currentPageNo); // System.out.println( "mruhmer processPage page.getRotation()"+page.getRotation()); if((currentPageNo==0) ) { if (getFirstPageRotationFromThis) { page.setRotation(first_page_rotation); } } if( log.isDebugEnabled() ) { log.debug( "processPage( " + page + ", " + content + " )" ); } currentPageNo++; if( currentPageNo >= startPage && currentPageNo <= endPage && (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) && (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber )) { startPage( page ); pageArticles = page.getThreadBeads(); int numberOfArticleSections = 1 + pageArticles.size() * 2; if( !shouldSeparateByBeads ) { numberOfArticleSections = 1; } int originalSize = charactersByArticle.size(); charactersByArticle.setSize( numberOfArticleSections ); for( int i=0; i<numberOfArticleSections; i++ ) { if( numberOfArticleSections < originalSize ) { ((List)charactersByArticle.get( i )).clear(); } else { charactersByArticle.set( i, new ArrayList() ); } } characterListMapping.clear(); long startProcess = System.currentTimeMillis(); processStream( page, page.findResources(), content ); long stopProcess = System.currentTimeMillis(); long startFlush = System.currentTimeMillis(); flushText(); long stopFlush = System.currentTimeMillis(); if( log.isDebugEnabled() ) { log.debug( "processStream time=" + (stopProcess-startProcess) ); log.debug( "flushText time=" + (stopFlush-startFlush) ); } endPage( page ); } long stop = System.currentTimeMillis(); if( log.isDebugEnabled() ) { log.debug( "processPage() end time=" + (stop-start) ); } } /** * Start a new paragraph. Default implementation is to do nothing. Subclasses * may provide additional information. * * @throws IOException If there is any error writing to the stream. */ protected void startParagraph() throws IOException { //default is to do nothing. } /** * End a paragraph. Default implementation is to do nothing. Subclasses * may provide additional information. * * @throws IOException If there is any error writing to the stream. */ protected void endParagraph() throws IOException { //default is to do nothing } /** * Start a new page. Default implementation is to do nothing. Subclasses * may provide additional information. * * @param page The page we are about to process. * * @throws IOException If there is any error writing to the stream. */ protected void startPage( PDPage page ) throws IOException { //default is to do nothing. } /** * End a page. Default implementation is to do nothing. Subclasses * may provide additional information. * * @param page The page we are about to process. * * @throws IOException If there is any error writing to the stream. */ protected void endPage( PDPage page ) throws IOException { //default is to do nothing } /** * This will print the text to the output stream. * * @throws IOException If there is an error writing the text. */ protected void flushText() throws IOException { if( log.isDebugEnabled() ) { log.debug( "flushText() start" ); } float currentY = -1; float lastBaselineFontSize = -1; if( log.isDebugEnabled() ) { log.debug("<Starting text object list>"); } float endOfLastTextX = -1; float startOfNextWordX = -1; float lastWordSpacing = -1; TextPosition lastProcessedCharacter = null; for( int i=0; i<charactersByArticle.size(); i++) { startParagraph(); List textList = (List)charactersByArticle.get( i ); if( sortByPosition ) { TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() ); Collections.sort( textList, comparator ); } Iterator textIter = textList.iterator(); while( textIter.hasNext() ) { TextPosition position = (TextPosition)textIter.next(); String characterValue = position.getCharacter(); //wordSpacing = position.getWordSpacing(); float wordSpacing = 0; if( wordSpacing == 0 ) { //try to get width of a space character wordSpacing = position.getWidthOfSpace(); //if still zero fall back to getting the width of the current //character if( wordSpacing == 0 ) { wordSpacing = position.getWidth(); } } // RDD - We add a conservative approximation for space determination. // basically if there is a blank area between two characters that is //equal to some percentage of the word spacing then that will be the //start of the next word if( lastWordSpacing <= 0 ) { startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f); } else { startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f); } lastWordSpacing = wordSpacing; // RDD - We will suppress text that is very close to the current line // and which overwrites previously rendered text on this line. // This is done specifically to handle a reasonably common situation // where an application (MS Word, in the case of my examples) renders // text four times at small (1 point) offsets in order to accomplish // bold printing. You would not want to do this step if you were // going to render the TextPosition objects graphically. // /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) && (currentY != -1 && Math.abs(position.getY() - currentY) < 1)) { if (log.isDebugEnabled()) { log.debug("Suppressing text overwrite" + " x: " + position.getX() + " endOfLastTextX: " + endOfLastTextX + " string: " + position.getCharacter()); } continue; }*/ // RDD - Here we determine whether this text object is on the current // line. We use the lastBaselineFontSize to handle the superscript // case, and the size of the current font to handle the subscript case. // Text must overlap with the last rendered baseline text by at least // a small amount in order to be considered as being on the same line. // int verticalScaling = 1; if( lastBaselineFontSize < 0 || position.getFontSize() < 0 ) { verticalScaling = -1; } if (currentY != -1 && ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) || (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling))))) { if (log.isDebugEnabled()) { log.debug("<newline currentY=" + currentY + ", y=" + position.getY() + " fs=" + position.getFontSize()+ " lb fs=" + lastBaselineFontSize + ">"); } output.write(lineSeparator); endOfLastTextX = -1; startOfNextWordX = -1; currentY = -1; lastBaselineFontSize = -1; } if (startOfNextWordX != -1 && startOfNextWordX < position.getX() && lastProcessedCharacter != null && //only bother adding a space if the last character was not a space lastProcessedCharacter.getCharacter() != null && !lastProcessedCharacter.getCharacter().endsWith( " " ) ) { if (log.isDebugEnabled()) { log.debug("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">"); } output.write( wordSeparator ); } if (log.isDebugEnabled()) { log.debug("flushText" + " x=" + position.getX() + " y=" + position.getY() + " xScale=" + position.getXScale() + " yScale=" + position.getYScale() + " width=" + position.getWidth() + " currentY=" + currentY + " endOfLastTextX=" + endOfLastTextX + " startOfNextWordX=" + startOfNextWordX + " fontSize=" + position.getFontSize() + " wordSpacing=" + wordSpacing + " string=\"" + characterValue + "\""); } if (currentY == -1) { currentY = position.getY(); } if (currentY == position.getY()) { lastBaselineFontSize = position.getFontSize(); } // RDD - endX is what PDF considers to be the x coordinate of the // end position of the text. We use it in computing our metrics below. // endOfLastTextX = position.getX() + position.getWidth(); if (characterValue != null) { output.write(characterValue); } else { log.debug( "Position.getString() is null so not writing anything" ); } lastProcessedCharacter = position; } endParagraph(); } // RDD - newline at end of flush - required for end of page (so that the top // of the next page starts on its own line. // if( log.isDebugEnabled() ) { log.debug("<newline endOfFlush=\"true\">"); } output.write(pageSeparator); output.flush(); } /** * Write the string to the output stream. * * @param text The text to write to the stream. * @throws IOException If there is an error when writing the text. */ protected void writeCharacters( TextPosition text ) throws IOException { output.write( text.getCharacter() ); } /** * This will determine of two floating point numbers are within a specified variance. * * @param first The first number to compare to. * @param second The second number to compare to. * @param variance The allowed variance. */ private boolean within( float first, float second, float variance ) { return second > first - variance && second < first + variance; } /** * This will show add a character to the list of characters to be printed to * the text file. * * @param text The description of the character to display. */ protected void showCharacter( TextPosition text ) { boolean showCharacter = true; if( suppressDuplicateOverlappingText ) { showCharacter = false; String textCharacter = text.getCharacter(); float textX = text.getX(); float textY = text.getY(); List sameTextCharacters = (List)characterListMapping.get( textCharacter ); if( sameTextCharacters == null ) { sameTextCharacters = new ArrayList(); characterListMapping.put( textCharacter, sameTextCharacters ); } // RDD - Here we compute the value that represents the end of the rendered // text. This value is used to determine whether subsequent text rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme amounts // of padding are applied, then backed off (not sure why this is done, but there // are cases where the padding is on the order of 10x the character width, and // the TJ just backs up to compensate after each character). Also, we subtract // an amount to allow for kerning (a percentage of the width of the last // character). // boolean suppressCharacter = false; float tolerance = (text.getWidth()/textCharacter.length())/3.0f; for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ ) { TextPosition character = (TextPosition)sameTextCharacters.get( i ); String charCharacter = character.getCharacter(); float charX = character.getX(); float charY = character.getY(); //only want to suppress if( charCharacter != null && //charCharacter.equals( textCharacter ) && within( charX, textX, tolerance ) && within( charY, textY, tolerance ) ) { if( log.isDebugEnabled() ) { log.debug("suppressText" + " x=" + charX + " y=" + charY + " width=" + character.getWidth() + " fontSize=" + character.getFontSize() + " string=\"" + charCharacter + "\""); } suppressCharacter = true; } } if( !suppressCharacter ) { sameTextCharacters.add( text ); showCharacter = true; } } if( showCharacter ) { //if we are showing the character then we need to determine which //article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; float x = text.getX(); float y = text.getY(); if( shouldSeparateByBeads ) { for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ ) { PDThreadBead bead = (PDThreadBead)pageArticles.get( i ); if( bead != null ) { PDRectangle rect = bead.getRectangle(); if( rect.contains( x, y ) ) { foundArticleDivisionIndex = i*2+1; } else if( (x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2; } else if( x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i*2; } else if( y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i*2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if( foundArticleDivisionIndex != -1 ) { articleDivisionIndex = foundArticleDivisionIndex; } else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 ) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if( notFoundButFirstLeftArticleDivisionIndex != -1 ) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if( notFoundButFirstAboveArticleDivisionIndex != -1 ) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size()-1; } List textList = (List) charactersByArticle.get( articleDivisionIndex ); textList.add( text ); } } /** * This is the page that the text extraction will start on. The pages start * at page 1. For example in a 5 page PDF document, if the start page is 1 * then all pages will be extracted. If the start page is 4 then pages 4 and 5 * will be extracted. The default value is 1. * * @return Value of property startPage. */ public int getStartPage() { return startPage; } /** * This will set the first page to be extracted by this class. * * @param startPageValue New value of property startPage. */ public void setStartPage(int startPageValue) { startPage = startPageValue; } /** * This will get the last page that will be extracted. This is inclusive, * for example if a 5 page PDF an endPage value of 5 would extract the * entire document, an end page of 2 would extract pages 1 and 2. This defaults * to Integer.MAX_VALUE such that all pages of the pdf will be extracted. * * @return Value of property endPage. */ public int getEndPage() { return endPage; } /** * This will set the last page to be extracted by this class. * * @param endPageValue New value of property endPage. */ public void setEndPage(int endPageValue) { endPage = endPageValue; } /** * Set the desired line separator for output text. The line.separator * system property is used if the line separator preference is not set * explicitly using this method. * * @param separator The desired line separator string. */ public void setLineSeparator(String separator) { lineSeparator = separator; } /** * This will get the line separator. * * @return The desired line separator string. */ public String getLineSeparator() { return lineSeparator; } /** * Set the desired page separator for output text. The line.separator * system property is used if the page separator preference is not set * explicitly using this method. * * @param separator The desired page separator string. */ public void setPageSeparator(String separator) { pageSeparator = separator; } /** * This will get the word separator. * * @return The desired word separator string. */ public String getWordSeparator() { return wordSeparator; } /** * Set the desired word separator for output text. The PDFBox text extraction * algorithm will output a space character if there is enough space between * two words. By default a space character is used. If you need and accurate * count of characters that are found in a PDF document then you might want to * set the word separator to the empty string. * * @param separator The desired page separator string. */ public void setWordSeparator(String separator) { wordSeparator = separator; } /** * This will get the page separator. * * @return The page separator string. */ public String getPageSeparator() { return pageSeparator; } /** * @return Returns the suppressDuplicateOverlappingText. */ public boolean shouldSuppressDuplicateOverlappingText() { return suppressDuplicateOverlappingText; } /** * Get the current page number that is being processed. * * @return A 1 based number representing the current page. */ protected int getCurrentPageNo() { return currentPageNo; } /** * The output stream that is being written to. * * @return The stream that output is being written to. */ protected Writer getOutput() { return output; } /** * Character strings are grouped by articles. It is quite common that there * will only be a single article. This returns a List that contains List objects, * the inner lists will contain TextPosition objects. * * @return A double List of TextPositions for all text strings on the page. */ protected List getCharactersByArticle() { return charactersByArticle; } /** * By default the text stripper will attempt to remove text that overlapps each other. * Word paints the same character several times in order to make it look bold. By setting * this to false all text will be extracted, which means that certain sections will be * duplicated, but better performance will be noticed. * * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set. */ public void setSuppressDuplicateOverlappingText( boolean suppressDuplicateOverlappingTextValue) { this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; } /** * This will tell if the text stripper should separate by beads. * * @return If the text will be grouped by beads. */ public boolean shouldSeparateByBeads() { return shouldSeparateByBeads; } /** * Set if the text stripper should group the text output by a list of beads. The default value is true! * * @param aShouldSeparateByBeads The new grouping of beads. */ public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) { this.shouldSeparateByBeads = aShouldSeparateByBeads; } /** * Get the bookmark where text extraction should end, inclusive. Default is null. * * @return The ending bookmark. */ public PDOutlineItem getEndBookmark() { return endBookmark; } /** * Set the bookmark where the text extraction should stop. * * @param aEndBookmark The ending bookmark. */ public void setEndBookmark(PDOutlineItem aEndBookmark) { endBookmark = aEndBookmark; } /** * Get the bookmark where text extraction should start, inclusive. Default is null. * * @return The starting bookmark. */ public PDOutlineItem getStartBookmark() { return startBookmark; } /** * Set the bookmark where text extraction should start, inclusive. * * @param aStartBookmark The starting bookmark. */ public void setStartBookmark(PDOutlineItem aStartBookmark) { startBookmark = aStartBookmark; } /** * This will tell if the text stripper should sort the text tokens * before writing to the stream. * * @return true If the text tokens will be sorted before being written. */ public boolean shouldSortByPosition() { return sortByPosition; } /** * The order of the text tokens in a PDF file may not be in the same * as they appear visually on the screen. For example, a PDF writer may * write out all text by font, so all bold or larger text, then make a second * pass and write out the normal text.<br/> * The default is to <b>not</b> sort by position.<br/> * <br/> * A PDF writer could choose to write each character in a different order. By * default PDFBox does <b>not</b> sort the text tokens before processing them due to * performance reasons. * * @param newSortByPosition Tell PDFBox to sort the text positions. */ public void setSortByPosition(boolean newSortByPosition) { sortByPosition = newSortByPosition; } /** * the rotation of the first page 0|90|180|270 * @param Rotation * */ public void setFirstPageRotation(int Rotation) { first_page_rotation = Rotation; } /** * tells if rotation of first handled page should be concerned * @param newFirstPageRotationFromThis boolean */ public void setGetFirstPageRotationFromThis(boolean newFirstPageRotationFromThis) { getFirstPageRotationFromThis = newFirstPageRotationFromThis; } }