From 1b303f5abfdadd03a9e863ebd3cb8713c4d67cc6 Mon Sep 17 00:00:00 2001 From: ferbas Date: Tue, 5 Jan 2010 14:35:20 +0000 Subject: added text sig version 1.2.0 fixed text extraction encoding bug git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@529 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../pdfas/impl/vfilter/VerificationFilterImpl.java | 46 ++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/main/java/at/gv/egiz/pdfas/impl/vfilter/VerificationFilterImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/vfilter/VerificationFilterImpl.java index acf6622..47f217a 100644 --- a/src/main/java/at/gv/egiz/pdfas/impl/vfilter/VerificationFilterImpl.java +++ b/src/main/java/at/gv/egiz/pdfas/impl/vfilter/VerificationFilterImpl.java @@ -504,14 +504,18 @@ public class VerificationFilterImpl implements VerificationFilter return blockPartitions; } - protected String extractText(PdfDataSource pdf, int endOfDocument) throws PresentableException + protected String extractText(PdfDataSource pdf, int endOfDocument) throws PresentableException { + return extractText(pdf, endOfDocument, "utf8"); + } + protected String extractText(PdfDataSource pdf, int endOfDocument, String encoding) throws PresentableException { + log.debug("EXTRACTING TEXT... end index = " + endOfDocument); DelimitedPdfDataSource dds = new DelimitedPdfDataSource(pdf, endOfDocument); //DelimitedInputStream dis = new DelimitedInputStream(pdf.createInputStream(), endOfDocument); - return PdfAS.extractNormalizedTextTextual(dds); + return PdfAS.extractNormalizedTextTextual(dds, encoding); } @@ -714,6 +718,42 @@ public class VerificationFilterImpl implements VerificationFilter log.debug("Extracting signatures finished."); log.debug("Number of found signatures: " + extractedSignatures.size()); + if (extractedSignatures.size() > 0) { + List cp1252SignaturesPositions = new ArrayList(); + //boolean iscp1252Sig = false; + for (int i = 0; i < extractedSignatures.size(); i++) { + SignatureHolder sh = (SignatureHolder)extractedSignatures.get(i); + PdfASID kzid = sh.getSignatureObject().getKZ(); + if (kzid != null && kzid.isOldCp1252Version()) { + log.debug("found cp1252 signature"); + cp1252SignaturesPositions.add(new Integer(i)); + //iscp1252Sig = true; + //break; + } + } + if (cp1252SignaturesPositions.size() > 0) { + log.debug("redo text and signature extraction with cp1252 encoding"); + extractedText = extractText(pdf, endOfDocument, "cp1252"); + log.debug("Extracting text finished."); + + log.debug("Extracting signatures:"); + List cp1252ExtractedSignatures = extractNewSignaturesFromText(extractedText); + log.debug("Extracting signatures finished."); + log.debug("Number of found signatures: " + extractedSignatures.size()); + + if (cp1252ExtractedSignatures.size() != extractedSignatures.size()) { + log.error("Invalid cp1252 signatures found. Skipping cp1252 compatibility."); + } + // merge signature holders + for (int i = 0; i < cp1252SignaturesPositions.size(); i++) { + int replaceIndex = ((Integer)cp1252SignaturesPositions.get(i)).intValue(); + extractedSignatures.remove(replaceIndex); + extractedSignatures.add(replaceIndex, cp1252ExtractedSignatures.get(replaceIndex)); + } + } + + } + if (log.isDebugEnabled()) { log.debug("extracted signatures:"); @@ -788,7 +828,7 @@ public class VerificationFilterImpl implements VerificationFilter DelimitedPdfDataSource dds = new DelimitedPdfDataSource(pdf, pdf.getLength()); String text = null; try { - text = PdfAS.extractNormalizedTextTextual(dds); + text = PdfAS.extractNormalizedTextTextual(dds, "utf-8"); } catch (PresentableException e) { throw new VerificationFilterException(e); } -- cgit v1.2.3