From db52e4d66d60184d53a27ba4d6772461daacc03d Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 22 Mar 2013 08:57:51 +0000 Subject: Maintenance update (bugfixes, new features, cleanup...) Refer to /dok/RELEASE_NOTES-3.3.txt for further information. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@931 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/TextualSignature.java | 56 ++++++++++------------ 1 file changed, 26 insertions(+), 30 deletions(-) (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java index 35a0768..3ce690b 100644 --- a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java @@ -53,7 +53,7 @@ import com.lowagie.text.pdf.PdfWriter; /** * Contains helper function for textual signatures. - * + * * @author wprinz */ public class TextualSignature @@ -62,21 +62,22 @@ public class TextualSignature * The logger definition. */ private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class); - + /** * Extracts the document text from a given pdf. - * + * * @param pdf_stream * The pdf_input stream. * @return Returns the extracted document text. - * @throws PDFDocumentException + * @throws PDFDocumentException * @throws TextExtractionException * Forwarded exception. 
*/ - public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException + public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException { PerformanceCounters.textExtractions.increment(); - + PDDocument doc = null; + Document document = null; try { int first_page_rotation = 0; @@ -90,12 +91,8 @@ public class TextualSignature // byte[] bytes = normalizePDF(pdf_stream); - //iText - - byte [] pdf_data = pdfDataSource.getAsByteArray(); - PdfReader reader = new PdfReader(pdf_data); - PDFASUtils.checkReaderPermissions(reader); - //pdf_stream.close(); + + PdfReader reader = PDFASUtils.createPdfReaderCheckingPermissions(pdfDataSource); // PERF: PDF normalization needs byte array - this is costy ByteArrayOutputStream baos = new ByteArrayOutputStream(4096); @@ -108,7 +105,7 @@ public class TextualSignature // this method (although it works when a Table is appended)... very // fragile. - Document document = new Document(); + document = new Document(); PdfWriter writer = PdfWriter.getInstance(document, baos); document.open(); @@ -126,7 +123,7 @@ public class TextualSignature first_page_rotation = new_size_withrot.getRotation(); //logger_.info("iText first_page_rotation="+new_size_withrot.getRotation()); } - //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot); + //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot); //document.setPageSize(new_size); document.setPageSize(new_size_withrot); document.newPage(); @@ -163,15 +160,15 @@ public class TextualSignature //logger_.info("temporary_dir="+temporary_dir.getAbsolutePath()); parser.setTempDirectory(temporary_dir); parser.parse(); - - PDDocument doc = parser.getPDDocument(); + + doc = parser.getPDDocument(); //System.out.println("pdfBox.getNumberOfPages()"+doc.getNumberOfPages()); - + PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(false); 
stripper.setGetFirstPageRotationFromThis(true); stripper.setFirstPageRotation(first_page_rotation); - + // stripper.setStartPage(4); // stripper.setEndPage(4); logger_.debug("TextualSignator extractTextTextual: Begin stripping text"); @@ -182,8 +179,7 @@ public class TextualSignature throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e); } logger_.debug("TextualSignator extractTextTextual: Stripping text ended"); - - doc.close(); + //logger_.debug("TextualSignator extractTextTextual="+text); return text; @@ -199,39 +195,39 @@ public class TextualSignature catch (DocumentException e) { throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); + } finally { + PDFASUtils.closeQuietly(doc); + PDFASUtils.closeQuietly(document); } } - + /** * Normalizes a given binary PDF to a version PDFbox can handle correctly. - * + * *

* PDFbox has serious problems with documents that use incremental updates or * XObject forms. Therefore use this to remove incremental updates and create a * streamlined document. *

- * + * *

* Note that this has nothing to do with text normalization. It just unifies * the PDF documents that are fed into PDFbox for text extraction and page * length determination. *

- * + * * @param input_pdf * The input pdf to be normalized. * @return Returns the normalized pdf. * @throws IOException * @throws DocumentException - * @throws PDFDocumentException + * @throws PDFDocumentException */ public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException { //iText - byte [] pdf_data = pdfDataSource.getAsByteArray(); - PdfReader reader = new PdfReader(pdf_data); - PDFASUtils.checkReaderPermissions(reader); - //input_pdf.close(); - + PdfReader reader = PDFASUtils.createPdfReaderCheckingPermissions(pdfDataSource); + // PERF: PDF Normalization needs byte array ByteArrayOutputStream baos = new ByteArrayOutputStream(); // For some reason the Reader -> ImportPage -> Writer mechanism produces -- cgit v1.2.3