From f7f25c895855b4fd4f3d778e26242385c58f0829 Mon Sep 17 00:00:00 2001
From: ferbas
Date: Tue, 5 Jan 2010 14:37:21 +0000
Subject: added text sig version 1.2.0 fixed text extraction encoding bug

git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@530 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
---
 src/main/java/at/knowcenter/wag/egov/egiz/PdfAS.java | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/PdfAS.java b/src/main/java/at/knowcenter/wag/egov/egiz/PdfAS.java
index 78e4eec..57e003a 100644
--- a/src/main/java/at/knowcenter/wag/egov/egiz/PdfAS.java
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/PdfAS.java
@@ -1099,10 +1099,24 @@ public abstract class PdfAS
 
     return ObjectExtractor.extractNonTextInfo(pdfDs);
   }
+
+  /**
+   * @deprecated
+   * Use version with explicit encoding {@link PdfAS#extractNormalizedTextTextual(PdfDataSource, String)}.
+   * This one uses cp1252.
+   *
+   * @param pdfDataSource
+   * @return
+   * @throws PresentableException
+   */
+  public static String extractNormalizedTextTextual(PdfDataSource pdfDataSource) throws PresentableException {
+    return extractNormalizedTextTextual(pdfDataSource, "cp1252");
+
+  }
 
-  public static String extractNormalizedTextTextual(PdfDataSource pdfDataSource) throws PresentableException
+  public static String extractNormalizedTextTextual(PdfDataSource pdfDataSource, String encoding) throws PresentableException
   {
-    String raw_document_text = TextualSignature.extractTextTextual(pdfDataSource);
+    String raw_document_text = TextualSignature.extractTextTextual(pdfDataSource, encoding);
     String document_text = normalizeText(raw_document_text);
     return document_text;
   }
@@ -1112,7 +1126,7 @@ public abstract class PdfAS
   public static String extractNormalizedTextTextual(byte [] pdf, int length) throws PresentableException
   {
     ByteArrayPdfDataSourceImpl pdfDataSource = new ByteArrayPdfDataSourceImpl(pdf, length);
-    String raw_document_text = TextualSignature.extractTextTextual(pdfDataSource);
+    String raw_document_text = TextualSignature.extractTextTextual(pdfDataSource, "cp1252");
     String document_text = normalizeText(raw_document_text);
     return document_text;
   }
-- 
cgit v1.2.3