From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../knowcenter/wag/egov/egiz/pdf/Placeholder.java | 572 +++++++++++++++++++++ 1 file changed, 572 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java new file mode 100644 index 0000000..9249985 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java @@ -0,0 +1,572 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: Placeholder.java,v 1.5 2006/10/31 08:17:50 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.pdf; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import javax.sound.midi.SysexMessage; + +import org.apache.commons.codec.net.URLCodec; +import org.apache.log4j.Logger; + +import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; +import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; +import at.knowcenter.wag.egov.egiz.exceptions.PlaceholderException; +import at.knowcenter.wag.exactparser.ByteArrayUtils; + +/** + * Helper class that provides functionality for dealing with placeholders and + * replacements in pdf. + * + * @author wprinz + */ +public abstract class Placeholder +{ + /** + * The logger definition. + */ + private static final Logger logger_ = ConfigLogger.getLogger(Placeholder.class); + + /** + * Escapes the String to be a suitable Literal String.. + * + * @param data + * The String to be escaped. + * @return Returns the escaped PDF String. + */ + public static byte[] escapePDFString(byte[] data) + { + try + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (int i = 0; i < data.length; i++) + { + byte[] escaped_bytes = escapeByte(data[i]); + baos.write(escaped_bytes); + } + return baos.toByteArray(); + } + catch (IOException e) + { + logger_.error(e.getMessage(), e); + return null; + } + } + + /** + * Unescapes the PDF String. + * + * @param data + * The escaped String. + * @return Returns the unescaped String. + */ + public static byte[] unescapePDFString(byte[] data) + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (int i = 0; i < data.length; i++) + { + if (data[i] == '\\' && data[i + 1] == '\\') + { + baos.write('\\'); + i++; + continue; + } + if (data[i] == '\\' && data[i + 1] == '(') + + { + baos.write('('); + i++; + continue; + } + if (data[i] == '\\' && data[i + 1] == ')') + { + baos.write(')'); + i++; + continue; + } + baos.write(data[i]); + } + return baos.toByteArray(); + } + + /** + * Reconstructs the string from a partition of placeholders. + * + * @param pdf + * The PDF to read the string from. + * @param sis + * The list of StringInfo objects that specify the bytes of the + * string in the pdf. + * @return Returns the extracted and reconverted string. + * @throws IOException + * Forwarded exception. + */ + public static String reconstructStringFromPartition(byte[] pdf, List sis, + byte[] enc) throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + Iterator it = sis.iterator(); + while (it.hasNext()) + { + StringInfo si = (StringInfo) it.next(); + + for (int i = si.string_start; i < si.string_start + si.string_length; i++) + { + if (pdf[i] != 0) + { + baos.write(pdf[i]); + } + } + } + + baos.close(); + byte[] bytes = baos.toByteArray(); + + byte[] unescaped_bytes = unescapePDFString(bytes); + + if (!ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_WIN) && !ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL)) + { + String enc_str = new String(enc, "US-ASCII"); + logger_.warn("The encoding " + enc_str + " is not known by this application - trying to proceed anyways."); + } + + String text = new String(unescaped_bytes, "windows-1252"); + + String str = text; + if (ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL)) + { + str = unapplyURLEncoding(str); + } + + return str; + } + + /** + * Prepares the given String to a byte array that can be substituted into the + * placeholder. + * + * @param text + * The text to be prepared for substitution. + * @return Returns the prepared byte array. + */ + public static byte[] applyWinAnsiEncoding(String text) + { + // text = text.replace("\\", "\\\\"); + // text = text.replace("(", "\\("); + // text = text.replace(")", "\\)"); + + byte[] replace_bytes; + try + { + replace_bytes = text.getBytes("windows-1252");// CP1252 = WinAnsiEncoding + + // test the opposite way: + // String restored_string = new String (replace_bytes, "windows-1252"); + // if (!restored_string.equals(text)) + // { + // String url_encoded = URLEncoder.encode(text); + // replace_bytes = url_encoded.getBytes("windows-1252"); + // } + } + catch (UnsupportedEncodingException e) + { + logger_.error(e.getMessage(), e); + return null; + } + return replace_bytes; + } + + /** + * Unapplies the WinAnsi encoding. + * + * @param replace_bytes + * The bytes. + * @return Returns the decoded String. + */ + public static String unapplyWinAnsiEncoding(byte[] replace_bytes) + { + try + { + String text = new String(replace_bytes, "windows-1252"); + + return text; + } + catch (UnsupportedEncodingException e) + { + logger_.error(e.getMessage(), e); + return null; + } + + } + + /** + * Applies the URL encoding to the text. + * + * @param text + * The text + * @return Returns the URL and WinAnsi encoded text. + */ + public static byte[] applyURLEncoding(String text) + { + URLCodec utf8_url_codec = new URLCodec("UTF-8"); + String url_encoded = null; + try + { + url_encoded = utf8_url_codec.encode(text, "UTF-8"); + } + catch (UnsupportedEncodingException e) + { + throw new RuntimeException("Couldn't url encode : " + text, e); + } + // String url_encoded = URLEncoder.encode(text); + return applyWinAnsiEncoding(url_encoded); + } + + /** + * Unapplies the WinAnsi and URL encoding. + * + * @param winansi_str + * The Winansi and URL text. + * @return Returns the decoded text. + */ + public static String unapplyURLEncoding(String winansi_str) + { + URLCodec utf8_url_codec = new URLCodec("UTF-8"); + String url_decoded = null; + try + { + url_decoded = utf8_url_codec.decode(winansi_str, "UTF-8"); + } + catch (Exception e) + { + throw new RuntimeException("Couldn't url decode : " + winansi_str, e); + } + // String url_decoded = URLDecoder.decode(winansi_str); + return url_decoded; + } + + /** + * Restores the String from a previously prepared byte array. + * + * @param pdf_string + * The byte array. + * @return Returns the unprepared String. + */ + public static String unprepareAndUnescapeString(byte[] pdf_string) + { + try + { + String text = new String(pdf_string, "windows-1252"); + + // This makes problems when "+" appears. + // if (isURLEncoded(text)) + // { + // text = URLDecoder.decode(text); + // } + +// text = text.replace("\\)", ")"); +// text = text.replace("\\(", "("); +// text = text.replace("\\\\", "\\"); + + // TODO: replace jdk1.5-code with jdf1.4-code (should be tested) + /* */ + text = text.replaceAll("\\\\\\)", ")"); + text = text.replaceAll("\\\\\\(", "("); + text = text.replaceAll("\\\\\\\\", "\\\\"); + + + return text; + } + catch (UnsupportedEncodingException e) + { + logger_.error(e.getMessage(), e); + return null; + } + } + + /** + * Checks the presence of typical URL encoded characters to tell if the string + * is URL encoded. + * + *

+ * This heuristic checks if there are any non URL encoded characters in the + * String, like ASCII control characters, which aren't allowed in the + * URLEncoding characterset. + *

+ * + * @param text + * The text under suspicion. + * @return Returns true if the String is URL encoded, false otherwise. + */ + protected static boolean isURLEncoded(String text) + { + if (text.indexOf(' ') >= 0) + { + return false; + } + for (int i = 0; i < text.length(); i++) + { + char c = text.charAt(i); + if (0x00 <= c && c <= 0x1f) + { + return false; + } + if (c == 0x7F) + { + return false; + } + if (0x80 <= c) + { + return false; + } + } + return true; + } + + /** + * Tells, if a break can occur behind the given character. + * + * @param character + * The character. + * @return Returns true, if a break may occur behind the character, false + * otherwise. + */ + protected static boolean canBreakAfter(byte character) + { + return (character == ' ' || character == '.' || character == ',' || character == ';' || character == '-' || character == '\n') ; + } + + /** + * Scans the given PDF content stream for literal PDF strings. + * + * @param pdf + * The PDF. + * @param stream_start + * The start of the content stream to be scanned. + * @param stream_next + * The end of the content stream. + * @return Returns a list of StringInfo objects specifying the strings that + * could be found. + */ + public static List parseStrings(byte[] pdf, int stream_start, int stream_next) + { + List strings = new ArrayList(); + StringInfo cur_string = null; + for (int i = stream_start; i < stream_next; i++) + { + byte cur_byte = pdf[i]; + + if (cur_byte == '(' && pdf[i - 1] != '\\') + { + cur_string = new StringInfo(); + cur_string.pdf = pdf; + cur_string.string_start = i + 1; + cur_string.string_length = -1; + // logger_.debug("String start = " + cur_string.string_start); + continue; + } + if (cur_byte == ')' && pdf[i - 1] != '\\') + { + cur_string.string_length = i - cur_string.string_start; + // logger_.debug("String length = " + cur_string.string_length); + strings.add(cur_string); + + cur_string = null; + continue; + } + } + + return strings; + } + + /** + * Escapes the data byte if necessary. + * + *

+ * Before bytes can be written into the pdf Strings, they have to be escaped. + * Special care has to be taken that escaped sequences are not split due to + * line breaks. This could have fatal consequences and usually renders the + * whole document invalid. + *

+ * + * @param data + * The data byte to be escaped. + * @return Returns a new byte array escaping the data byte. If the byte needs + * not to be escaped, this new array will contain only the original + * data byte. + */ + public static byte[] escapeByte(byte data) + { + if (data == '\\') + { + return new byte[] { '\\', '\\' }; + } + if (data == '(') + { + return new byte[] { '\\', '(' }; + } + if (data == ')') + { + return new byte[] { '\\', ')' }; + } + return new byte[] { data }; + } + + /** + * Replaces the placeholder with the given String breaking lines with a given + * tolerance. + * + * @param pdf + * The PDF. + * @param sis + * The list of StringInfo objects describing the positions where the + * String should be filled in. + * @param replace_bytes + * The unescaped bytes to be filled in. Escaping is performed by this + * method. + * @param tolerance + * The tolerance for line wrapping. The tolerance counts from the end + * of a StringInfo backwards to its start. If a word that starts + * within the tolerance doesn't fit, it is wrapped into the next + * line. + * @throws PDFDocumentException + * Forwarded exception. + */ + public static void replacePlaceholderWithTolerance(byte[] pdf, List sis, + byte[] replace_bytes, int tolerance) throws PDFDocumentException + { + try + { + // String rep_str = new String(replace_bytes); + + SplitStrings ss = new SplitStrings(pdf, sis); + + int read_index = 0; + while (read_index < replace_bytes.length) + { + if (!ss.isValidLine()) + { + break; + } + + byte[] token = readToken(replace_bytes, read_index); + // String token_str = new String(token); + byte[] escaped_token = escapeToken(token); + + if (ss.fits(escaped_token)) + { + ss.write(escaped_token); + read_index += token.length; + continue; + } + else + { + if (ss.getAvailable() < tolerance) + { + ss.newline(); + continue; + } + else + { + // break the token + for (; read_index < replace_bytes.length; read_index++) + { + byte data = replace_bytes[read_index]; + + byte[] escaped_data = escapeByte(data); + + if (ss.fits(escaped_data)) + { + ss.write(escaped_data); + } + else + { + ss.newline(); + break; + } + } + continue; + + } + } + } + ss.fillRest(); + + if (read_index < replace_bytes.length) + { + logger_.error("The replace string was longer than the reserved placeholder."); + throw new PlaceholderException(null, replace_bytes.length - read_index); + } + + } + catch (IOException e) + { + throw new PDFDocumentException(201, e); + } + + } + + protected static byte[] readToken(byte[] bytes, int index) + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + for (; index < bytes.length; index++) + { + byte data = bytes[index]; + + + // byte [] escaped_data = escapeByte(data); + baos.write(data); + + if (canBreakAfter(data)) + { + break; + } + } + + return baos.toByteArray(); + } + + + +protected static byte[] escapeToken(byte[] token) throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + for (int i = 0; i < token.length; i++) + { + byte[] escaped_data = escapeByte(token[i]); + baos.write(escaped_data); + } + + return baos.toByteArray(); + } +} -- cgit v1.2.3