aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java')
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java56
1 files changed, 26 insertions, 30 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
index 35a0768..3ce690b 100644
--- a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/TextualSignature.java
@@ -53,7 +53,7 @@ import com.lowagie.text.pdf.PdfWriter;
/**
* Contains helper functions for textual signatures.
- *
+ *
* @author wprinz
*/
public class TextualSignature
@@ -62,21 +62,22 @@ public class TextualSignature
* The logger definition.
*/
private static final Logger logger_ = ConfigLogger.getLogger(TextualSignature.class);
-
+
/**
* Extracts the document text from a given pdf.
- *
+ *
* @param pdf_stream
* The pdf_input stream.
* @return Returns the extracted document text.
- * @throws PDFDocumentException
+ * @throws PDFDocumentException
* @throws TextExtractionException
* Forwarded exception.
*/
- public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException
+ public static String extractTextTextual(PdfDataSource pdfDataSource, String encoding) throws PDFDocumentException
{
PerformanceCounters.textExtractions.increment();
-
+ PDDocument doc = null;
+ Document document = null;
try
{
int first_page_rotation = 0;
@@ -90,12 +91,8 @@ public class TextualSignature
// byte[] bytes = normalizePDF(pdf_stream);
- //iText
-
- byte [] pdf_data = pdfDataSource.getAsByteArray();
- PdfReader reader = new PdfReader(pdf_data);
- PDFASUtils.checkReaderPermissions(reader);
- //pdf_stream.close();
+
+ PdfReader reader = PDFASUtils.createPdfReaderCheckingPermissions(pdfDataSource);
// PERF: PDF normalization needs byte array - this is costly
ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
@@ -108,7 +105,7 @@ public class TextualSignature
// this method (although it works when a Table is appended)... very
// fragile.
- Document document = new Document();
+ document = new Document();
PdfWriter writer = PdfWriter.getInstance(document, baos);
document.open();
@@ -126,7 +123,7 @@ public class TextualSignature
first_page_rotation = new_size_withrot.getRotation();
//logger_.info("iText first_page_rotation="+new_size_withrot.getRotation());
}
- //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot);
+ //logger_.info("iText set PageSize of page:"+page_num+" to: "+new_size_withrot);
//document.setPageSize(new_size);
document.setPageSize(new_size_withrot);
document.newPage();
@@ -163,15 +160,15 @@ public class TextualSignature
//logger_.info("temporary_dir="+temporary_dir.getAbsolutePath());
parser.setTempDirectory(temporary_dir);
parser.parse();
-
- PDDocument doc = parser.getPDDocument();
+
+ doc = parser.getPDDocument();
//System.out.println("pdfBox.getNumberOfPages()"+doc.getNumberOfPages());
-
+
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(false);
stripper.setGetFirstPageRotationFromThis(true);
stripper.setFirstPageRotation(first_page_rotation);
-
+
// stripper.setStartPage(4);
// stripper.setEndPage(4);
logger_.debug("TextualSignator extractTextTextual: Begin stripping text");
@@ -182,8 +179,7 @@ public class TextualSignature
throw new PDFDocumentException(ErrorCode.TEXT_EXTRACTION_EXCEPTION, "Unable to extract textual content.", e);
}
logger_.debug("TextualSignator extractTextTextual: Stripping text ended");
-
- doc.close();
+
//logger_.debug("TextualSignator extractTextTextual="+text);
return text;
@@ -199,39 +195,39 @@ public class TextualSignature
catch (DocumentException e)
{
throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+ } finally {
+ PDFASUtils.closeQuietly(doc);
+ PDFASUtils.closeQuietly(document);
}
}
-
+
/**
* Normalizes a given binary PDF to a version PDFbox can handle correctly.
- *
+ *
* <p>
* PDFbox has serious problems with documents that use incremental updates or
* XObject forms. Therefore, use this to remove incremental updates and create a
* streamlined document.
* </p>
- *
+ *
* <p>
* Note that this has nothing to do with text normalization. It just unifies
* the PDF documents that are fed into PDFbox for text extraction and page
* length determination.
* </p>
- *
+ *
* @param input_pdf
* The input pdf to be normalized.
* @return Returns the normalized pdf.
* @throws IOException
* @throws DocumentException
- * @throws PDFDocumentException
+ * @throws PDFDocumentException
*/
public static byte[] normalizePDF(PdfDataSource pdfDataSource) throws IOException, DocumentException, PDFDocumentException
{
//iText
- byte [] pdf_data = pdfDataSource.getAsByteArray();
- PdfReader reader = new PdfReader(pdf_data);
- PDFASUtils.checkReaderPermissions(reader);
- //input_pdf.close();
-
+ PdfReader reader = PDFASUtils.createPdfReaderCheckingPermissions(pdfDataSource);
+
// PERF: PDF Normalization needs byte array
ByteArrayOutputStream baos = new ByteArrayOutputStream();
// For some reason the Reader -> ImportPage -> Writer mechanism produces