From c68ad0ec056b37c82debebcecfcde1866d61b4d9 Mon Sep 17 00:00:00 2001 From: tknall Date: Tue, 25 Nov 2008 12:03:13 +0000 Subject: Removing pdfbox from source. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@301 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- src/main/java/org/pdfbox/util/PDFTextStripper.java | 1062 -------------------- 1 file changed, 1062 deletions(-) delete mode 100644 src/main/java/org/pdfbox/util/PDFTextStripper.java (limited to 'src/main/java/org/pdfbox/util/PDFTextStripper.java') diff --git a/src/main/java/org/pdfbox/util/PDFTextStripper.java b/src/main/java/org/pdfbox/util/PDFTextStripper.java deleted file mode 100644 index 62efb64..0000000 --- a/src/main/java/org/pdfbox/util/PDFTextStripper.java +++ /dev/null @@ -1,1062 +0,0 @@ -/** - * Copyright (c) 2003-2005, www.pdfbox.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. Neither the name of pdfbox; nor the names of its - * contributors may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * http://www.pdfbox.org - * - */ -package org.pdfbox.util; - -import java.io.IOException; -import java.io.StringWriter; -import java.io.Writer; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Vector; - -import org.pdfbox.cos.COSDocument; -import org.pdfbox.cos.COSStream; - -import org.pdfbox.pdmodel.PDDocument; -import org.pdfbox.pdmodel.PDPage; - -import org.pdfbox.pdmodel.common.PDRectangle; -import org.pdfbox.pdmodel.common.PDStream; - -import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary; -import org.pdfbox.pdmodel.encryption.PDStandardEncryption; -import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; -import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; - -import org.pdfbox.exceptions.CryptographyException; -import org.pdfbox.exceptions.InvalidPasswordException; - -import org.apache.log4j.Logger; - - -/** - * This class will take a pdf document and strip out all of the text and ignore the - * formatting and such. - * - * @author Ben Litchfield (ben@benlitchfield.com) - * @version $Revision: 1.62 $ - */ -public class PDFTextStripper extends PDFStreamEngine -{ - private static Logger log = Logger.getLogger(PDFTextStripper.class); - private int first_page_rotation = 0; - private boolean getFirstPageRotationFromThis = false; - private int currentPageNo = 0; - private int startPage = 1; - private int endPage = Integer.MAX_VALUE; - private PDOutlineItem startBookmark = null; - private int startBookmarkPageNumber = -1; - private PDOutlineItem endBookmark = null; - private int endBookmarkPageNumber = -1; - private PDDocument document; - private boolean suppressDuplicateOverlappingText = true; - private boolean shouldSeparateByBeads = true; - private boolean sortByPosition = false; - - private List pageArticles = null; - /** - * The charactersByArticle is used to extract text by article divisions. For example - * a PDF that has two columns like a newspaper, we want to extract the first column and - * then the second column. In this example the PDF would have 2 beads(or articles), one for - * each column. The size of the charactersByArticle would be 5, because not all text on the - * screen will fall into one of the articles. The five divisions are shown below - * - * Text before first article - * first article text - * text between first article and second article - * second article text - * text after second article - * - * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. - */ - protected Vector charactersByArticle = new Vector(); - - private Map characterListMapping = new HashMap(); - - private String lineSeparator = System.getProperty("line.separator"); - private String pageSeparator = System.getProperty("line.separator"); - private String wordSeparator = " "; - - /** - * The stream to write the output to. - */ - protected Writer output; - - /** - * Instantiate a new PDFTextStripper object. This object will load properties from - * Resources/PDFTextStripper.properties. - * @throws IOException If there is an error loading the properties. - */ - public PDFTextStripper() throws IOException - { - super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) ); - } - - /** - * This will return the text of a document. See writeText.
- * NOTE: The document must not be encrypted when coming into this method. - * - * @param doc The document to get the text from. - * - * @return The text of the PDF document. - * - * @throws IOException if the doc state is invalid or it is encrypted. - */ - public String getText( PDDocument doc ) throws IOException - { - StringWriter outputStream = new StringWriter(); - writeText( doc, outputStream ); - return outputStream.toString(); - } - - /** - * @deprecated - * @see PDFTextStripper#getText( PDDocument ) - * @param doc The document to extract the text from. - * @return The document text. - * @throws IOException If there is an error extracting the text. - */ - public String getText( COSDocument doc ) throws IOException - { - return getText( new PDDocument( doc ) ); - } - - /** - * @deprecated - * @see PDFTextStripper#writeText( PDDocument, Writer ) - * @param doc The document to extract the text. - * @param outputStream The stream to write the text to. - * @throws IOException If there is an error extracting the text. - */ - public void writeText( COSDocument doc, Writer outputStream ) throws IOException - { - writeText( new PDDocument( doc ), outputStream ); - } - - /** - * This will take a PDDocument and write the text of that document to the print writer. - * - * @param doc The document to get the data from. - * @param outputStream The location to put the text. - * - * @throws IOException If the doc is in an invalid state. - */ - public void writeText( PDDocument doc, Writer outputStream ) throws IOException - { - - PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary(); - - //only care about standard encryption and if it was decrypted with the - //user password - if( encDictionary instanceof PDStandardEncryption && - !doc.wasDecryptedWithOwnerPassword() ) - { - PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary; - if( !stdEncryption.canExtractContent() ) - { - throw new IOException( "You do not have permission to extract text" ); - } - } - currentPageNo = 0; - document = doc; - output = outputStream; - startDocument(document); - - if( document.isEncrypted() ) - { - // We are expecting non-encrypted documents here, but it is common - // for users to pass in a document that is encrypted with an empty - // password (such a document appears to not be encrypted by - // someone viewing the document, thus the confusion). We will - // attempt to decrypt with the empty password to handle this case. - // - log.debug("Document is encrypted, decrypting with empty password"); - try - { - document.decrypt(""); - } - catch (CryptographyException e) - { - throw new IOException("Error decrypting document, details: " + e.getMessage()); - } - catch (InvalidPasswordException e) - { - throw new IOException("Error: document is encrypted"); - } - } - - processPages( document.getDocumentCatalog().getAllPages() ); - endDocument(document); - } - - /** - * This will process all of the pages and the text that is in them. - * - * @param pages The pages object in the document. - * - * @throws IOException If there is an error parsing the text. - */ - protected void processPages( List pages ) throws IOException - { - if( log.isDebugEnabled() ) - { - log.debug( "processPages( " + pages + " )" ); - } - - if( startBookmark != null ) - { - startBookmarkPageNumber = getPageNumber( startBookmark, pages ); - } - - if( endBookmark != null ) - { - endBookmarkPageNumber = getPageNumber( endBookmark, pages ); - } - - if( startBookmarkPageNumber == -1 && startBookmark != null && - endBookmarkPageNumber == -1 && endBookmark != null && - startBookmark.getCOSObject() == endBookmark.getCOSObject() ) - { - //this is a special case where both the start and end bookmark - //are the same but point to nothing. In this case - //we will not extract any text. - startBookmarkPageNumber = 0; - endBookmarkPageNumber = 0; - } - - - Iterator pageIter = pages.iterator(); - while( pageIter.hasNext() ) - { - PDPage nextPage = (PDPage)pageIter.next(); - PDStream contentStream = nextPage.getContents(); - if( contentStream != null ) - { - COSStream contents = contentStream.getStream(); - processPage( nextPage, contents ); - } - } - if( log.isDebugEnabled() ) - { - log.debug( "processPages() end" ); - } - } - - private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException - { - int pageNumber = -1; - PDPage page = bookmark.findDestinationPage( document ); - if( page != null ) - { - pageNumber = allPages.indexOf( page )+1;//use one based indexing - } - return pageNumber; - } - - /** - * This method is available for subclasses of this class. It will be called before processing - * of the document start. - * - * @param pdf The PDF document that is being processed. - * @throws IOException If an IO error occurs. - */ - protected void startDocument(PDDocument pdf) throws IOException - { - // no default implementation, but available for subclasses - } - - /** - * This method is available for subclasses of this class. It will be called after processing - * of the document finishes. - * - * @param pdf The PDF document that is being processed. - * @throws IOException If an IO error occurs. - */ - protected void endDocument(PDDocument pdf ) throws IOException - { - // no default implementation, but available for subclasses - } - - /** - * This will process the contents of a page. - * - * @param page The page to process. - * @param content The contents of the page. - * - * @throws IOException If there is an error processing the page. - */ - protected void processPage( PDPage page, COSStream content ) throws IOException - { - long start = System.currentTimeMillis(); - // System.out.println( "mruhmer processPage page.findRotation()"+page.findRotation()); - // System.out.println( "mruhmer processPage currentPageNo="+currentPageNo); - // System.out.println( "mruhmer processPage page.getRotation()"+page.getRotation()); - - if((currentPageNo==0) ) - { - if (getFirstPageRotationFromThis) - { - page.setRotation(first_page_rotation); - } - } - if( log.isDebugEnabled() ) - { - log.debug( "processPage( " + page + ", " + content + " )" ); - } - currentPageNo++; - if( currentPageNo >= startPage && currentPageNo <= endPage && - (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) && - (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber )) - { - startPage( page ); - pageArticles = page.getThreadBeads(); - int numberOfArticleSections = 1 + pageArticles.size() * 2; - if( !shouldSeparateByBeads ) - { - numberOfArticleSections = 1; - } - int originalSize = charactersByArticle.size(); - charactersByArticle.setSize( numberOfArticleSections ); - for( int i=0; i"); - } - float endOfLastTextX = -1; - float startOfNextWordX = -1; - float lastWordSpacing = -1; - TextPosition lastProcessedCharacter = null; - - for( int i=0; i (currentY + (position.getFontSize() * 0.9f * verticalScaling))))) - { - if (log.isDebugEnabled()) - { - log.debug(""); - } - output.write(lineSeparator); - endOfLastTextX = -1; - startOfNextWordX = -1; - currentY = -1; - lastBaselineFontSize = -1; - } - - if (startOfNextWordX != -1 && startOfNextWordX < position.getX() && - lastProcessedCharacter != null && - //only bother adding a space if the last character was not a space - lastProcessedCharacter.getCharacter() != null && - !lastProcessedCharacter.getCharacter().endsWith( " " ) ) - { - if (log.isDebugEnabled()) - { - log.debug(""); - } - output.write( wordSeparator ); - } - - - if (log.isDebugEnabled()) - { - log.debug("flushText" + - " x=" + position.getX() + - " y=" + position.getY() + - " xScale=" + position.getXScale() + - " yScale=" + position.getYScale() + - " width=" + position.getWidth() + - " currentY=" + currentY + - " endOfLastTextX=" + endOfLastTextX + - " startOfNextWordX=" + startOfNextWordX + - " fontSize=" + position.getFontSize() + - " wordSpacing=" + wordSpacing + - " string=\"" + characterValue + "\""); - } - - if (currentY == -1) - { - currentY = position.getY(); - } - - if (currentY == position.getY()) - { - lastBaselineFontSize = position.getFontSize(); - } - - // RDD - endX is what PDF considers to be the x coordinate of the - // end position of the text. We use it in computing our metrics below. - // - endOfLastTextX = position.getX() + position.getWidth(); - - - if (characterValue != null) - { - output.write(characterValue); - } - else - { - log.debug( "Position.getString() is null so not writing anything" ); - } - lastProcessedCharacter = position; - } - endParagraph(); - } - - - // RDD - newline at end of flush - required for end of page (so that the top - // of the next page starts on its own line. - // - if( log.isDebugEnabled() ) - { - log.debug(""); - } - output.write(pageSeparator); - - output.flush(); - } - - /** - * Write the string to the output stream. - * - * @param text The text to write to the stream. - * @throws IOException If there is an error when writing the text. - */ - protected void writeCharacters( TextPosition text ) throws IOException - { - output.write( text.getCharacter() ); - } - - /** - * This will determine of two floating point numbers are within a specified variance. - * - * @param first The first number to compare to. - * @param second The second number to compare to. - * @param variance The allowed variance. - */ - private boolean within( float first, float second, float variance ) - { - return second > first - variance && second < first + variance; - } - - /** - * This will show add a character to the list of characters to be printed to - * the text file. - * - * @param text The description of the character to display. - */ - protected void showCharacter( TextPosition text ) - { - boolean showCharacter = true; - if( suppressDuplicateOverlappingText ) - { - showCharacter = false; - String textCharacter = text.getCharacter(); - float textX = text.getX(); - float textY = text.getY(); - List sameTextCharacters = (List)characterListMapping.get( textCharacter ); - if( sameTextCharacters == null ) - { - sameTextCharacters = new ArrayList(); - characterListMapping.put( textCharacter, sameTextCharacters ); - } - - // RDD - Here we compute the value that represents the end of the rendered - // text. This value is used to determine whether subsequent text rendered - // on the same line overwrites the current text. - // - // We subtract any positive padding to handle cases where extreme amounts - // of padding are applied, then backed off (not sure why this is done, but there - // are cases where the padding is on the order of 10x the character width, and - // the TJ just backs up to compensate after each character). Also, we subtract - // an amount to allow for kerning (a percentage of the width of the last - // character). - // - boolean suppressCharacter = false; - float tolerance = (text.getWidth()/textCharacter.length())/3.0f; - for( int i=0; i - * The default is to not sort by position.
- *
- * A PDF writer could choose to write each character in a different order. By - * default PDFBox does not sort the text tokens before processing them due to - * performance reasons. - * - * @param newSortByPosition Tell PDFBox to sort the text positions. - */ - public void setSortByPosition(boolean newSortByPosition) - { - sortByPosition = newSortByPosition; - } - /** - * the rotation of the first page 0|90|180|270 - * @param Rotation - * - */ - public void setFirstPageRotation(int Rotation) - { - first_page_rotation = Rotation; - } - /** - * tells if rotation of first handled page should be concerned - * @param newFirstPageRotationFromThis boolean - */ - public void setGetFirstPageRotationFromThis(boolean newFirstPageRotationFromThis) - { - getFirstPageRotationFromThis = newFirstPageRotationFromThis; - } -} \ No newline at end of file -- cgit v1.2.3