/** * Copyright (c) 2006 by Know-Center, Graz, Austria * * This software is the confidential and proprietary information of Know-Center, * Graz, Austria. You shall not disclose such Confidential Information and shall * use it only in accordance with the terms of the license agreement you entered * into with Know-Center. * * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR * NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY * LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS * DERIVATIVES. * * $Id: Placeholder.java,v 1.5 2006/10/31 08:17:50 wprinz Exp $ */ package at.knowcenter.wag.egov.egiz.pdf; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.codec.net.URLCodec; import org.apache.log4j.Logger; import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; import at.knowcenter.wag.egov.egiz.exceptions.PlaceholderException; import at.knowcenter.wag.exactparser.ByteArrayUtils; /** * Helper class that provides functionality for dealing with placeholders and * replacements in pdf. * * @author wprinz */ public abstract class Placeholder { /** * The logger definition. */ private static final Logger logger_ = ConfigLogger.getLogger(Placeholder.class); /** * Escapes the String to be a suitable Literal String.. * * @param data * The String to be escaped. * @return Returns the escaped PDF String. */ public static byte[] escapePDFString(byte[] data) { try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); for (int i = 0; i < data.length; i++) { byte[] escaped_bytes = escapeByte(data[i]); baos.write(escaped_bytes); } return baos.toByteArray(); } catch (IOException e) { logger_.error(e.getMessage(), e); return null; } } /** * Unescapes the PDF String. * * @param data * The escaped String. * @return Returns the unescaped String. */ public static byte[] unescapePDFString(byte[] data) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); for (int i = 0; i < data.length; i++) { if (data[i] == '\\' && data[i + 1] == '\\') { baos.write('\\'); i++; continue; } if (data[i] == '\\' && data[i + 1] == '(') { baos.write('('); i++; continue; } if (data[i] == '\\' && data[i + 1] == ')') { baos.write(')'); i++; continue; } baos.write(data[i]); } return baos.toByteArray(); } /** * Reconstructs the string from a partition of placeholders. * * @param pdf * The PDF to read the string from. * @param sis * The list of StringInfo objects that specify the bytes of the * string in the pdf. * @return Returns the extracted and reconverted string. * @throws IOException * Forwarded exception. */ public static String reconstructStringFromPartition(byte[] pdf, List sis, byte[] enc) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); Iterator it = sis.iterator(); while (it.hasNext()) { StringInfo si = (StringInfo) it.next(); for (int i = si.string_start; i < si.string_start + si.string_length; i++) { if (pdf[i] != 0) { baos.write(pdf[i]); } } } baos.close(); byte[] bytes = baos.toByteArray(); byte[] unescaped_bytes = unescapePDFString(bytes); if (!ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_WIN) && !ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL)) { String enc_str = new String(enc, "US-ASCII"); logger_.warn("The encoding " + enc_str + " is not known by this application - trying to proceed anyways."); } String text = new String(unescaped_bytes, "windows-1252"); String str = text; if (ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL)) { str = unapplyURLEncoding(str); } return str; } /** * Prepares the given String to a byte array that can be substituted into the * placeholder. * * @param text * The text to be prepared for substitution. * @return Returns the prepared byte array. */ public static byte[] applyWinAnsiEncoding(String text) { // text = text.replace("\\", "\\\\"); // text = text.replace("(", "\$"); // text = text.replace(")", "\$"); byte[] replace_bytes; try { replace_bytes = text.getBytes("windows-1252");// CP1252 = WinAnsiEncoding // test the opposite way: // String restored_string = new String (replace_bytes, "windows-1252"); // if (!restored_string.equals(text)) // { // String url_encoded = URLEncoder.encode(text); // replace_bytes = url_encoded.getBytes("windows-1252"); // } } catch (UnsupportedEncodingException e) { logger_.error(e.getMessage(), e); return null; } return replace_bytes; } /** * Unapplies the WinAnsi encoding. * * @param replace_bytes * The bytes. * @return Returns the decoded String. */ public static String unapplyWinAnsiEncoding(byte[] replace_bytes) { try { String text = new String(replace_bytes, "windows-1252"); return text; } catch (UnsupportedEncodingException e) { logger_.error(e.getMessage(), e); return null; } } /** * Applies the URL encoding to the text. * * @param text * The text * @return Returns the URL and WinAnsi encoded text. */ public static byte[] applyURLEncoding(String text) { URLCodec utf8_url_codec = new URLCodec("UTF-8"); String url_encoded = null; try { url_encoded = utf8_url_codec.encode(text, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Couldn't url encode : " + text, e); } // String url_encoded = URLEncoder.encode(text); return applyWinAnsiEncoding(url_encoded); } /** * Unapplies the WinAnsi and URL encoding. * * @param winansi_str * The Winansi and URL text. * @return Returns the decoded text. */ public static String unapplyURLEncoding(String winansi_str) { URLCodec utf8_url_codec = new URLCodec("UTF-8"); String url_decoded = null; try { url_decoded = utf8_url_codec.decode(winansi_str, "UTF-8"); } catch (Exception e) { throw new RuntimeException("Couldn't url decode : " + winansi_str, e); } // String url_decoded = URLDecoder.decode(winansi_str); return url_decoded; } /** * Restores the String from a previously prepared byte array. * * @param pdf_string * The byte array. * @return Returns the unprepared String. */ public static String unprepareAndUnescapeString(byte[] pdf_string) { try { String text = new String(pdf_string, "windows-1252"); // This makes problems when "+" appears. // if (isURLEncoded(text)) // { // text = URLDecoder.decode(text); // } // text = text.replace("\\)", ")"); // text = text.replace("\$", "("); // text = text.replace("\\\\", "\\"); // TODO: replace jdk1.5-code with jdf1.4-code (should be tested) /* */ text = text.replaceAll("\\\\\$", ")"); text = text.replaceAll("\\\\\\(", "("); text = text.replaceAll("\\\\\\\\", "\\\\"); return text; } catch (UnsupportedEncodingException e) { logger_.error(e.getMessage(), e); return null; } } /** * Checks the presence of typical URL encoded characters to tell if the string * is URL encoded. * *

* This heuristic checks if there are any non URL encoded characters in the * String, like ASCII control characters, which aren't allowed in the * URLEncoding characterset. *

* * @param text * The text under suspicion. * @return Returns true if the String is URL encoded, false otherwise. */ protected static boolean isURLEncoded(String text) { if (text.indexOf(' ') >= 0) { return false; } for (int i = 0; i < text.length(); i++) { char c = text.charAt(i); if (0x00 <= c && c <= 0x1f) { return false; } if (c == 0x7F) { return false; } if (0x80 <= c) { return false; } } return true; } /** * Tells, if a break can occur behind the given character. * * @param character * The character. * @return Returns true, if a break may occur behind the character, false * otherwise. */ protected static boolean canBreakAfter(byte character) { return (character == ' ' || character == ',' || character == ';' || character == '-'); } /** * Scans the given PDF content stream for literal PDF strings. * * @param pdf * The PDF. * @param stream_start * The start of the content stream to be scanned. * @param stream_next * The end of the content stream. * @return Returns a list of StringInfo objects specifying the strings that * could be found. */ public static List parseStrings(byte[] pdf, int stream_start, int stream_next) { List strings = new ArrayList(); StringInfo cur_string = null; for (int i = stream_start; i < stream_next; i++) { byte cur_byte = pdf[i]; if (cur_byte == '(' && pdf[i - 1] != '\\') { cur_string = new StringInfo(); cur_string.pdf = pdf; cur_string.string_start = i + 1; cur_string.string_length = -1; // logger_.debug("String start = " + cur_string.string_start); continue; } if (cur_byte == ')' && pdf[i - 1] != '\\') { cur_string.string_length = i - cur_string.string_start; // logger_.debug("String length = " + cur_string.string_length); strings.add(cur_string); cur_string = null; continue; } } return strings; } /** * Escapes the data byte if necessary. * *

* Before bytes can be written into the pdf Strings, they have to be escaped. * Special care has to be taken that escaped sequences are not split due to * line breaks. This could have fatal consequences and usually renders the * whole document invalid. *

* * @param data * The data byte to be escaped. * @return Returns a new byte array escaping the data byte. If the byte needs * not to be escaped, this new array will contain only the original * data byte. */ public static byte[] escapeByte(byte data) { if (data == '\\') { return new byte[] { '\\', '\\' }; } if (data == '(') { return new byte[] { '\\', '(' }; } if (data == ')') { return new byte[] { '\\', ')' }; } return new byte[] { data }; } /** * Replaces the placeholder with the given String breaking lines with a given * tolerance. * * @param pdf * The PDF. * @param sis * The list of StringInfo objects describing the positions where the * String should be filled in. * @param replace_bytes * The unescaped bytes to be filled in. Escaping is performed by this * method. * @param tolerance * The tolerance for line wrapping. The tolerance counts from the end * of a StringInfo backwards to its start. If a word that starts * within the tolerance doesn't fit, it is wrapped into the next * line. * @throws PDFDocumentException * Forwarded exception. */ public static void replacePlaceholderWithTolerance(byte[] pdf, List sis, byte[] replace_bytes, int tolerance) throws PDFDocumentException { try { // String rep_str = new String(replace_bytes); SplitStrings ss = new SplitStrings(pdf, sis); int read_index = 0; while (read_index < replace_bytes.length) { if (!ss.isValidLine()) { break; } byte[] token = readToken(replace_bytes, read_index); // String token_str = new String(token); byte[] escaped_token = escapeToken(token); if (ss.fits(escaped_token)) { ss.write(escaped_token); read_index += token.length; continue; } else { if (ss.getAvailable() < tolerance) { ss.newline(); continue; } else { // break the token for (; read_index < replace_bytes.length; read_index++) { byte data = replace_bytes[read_index]; byte[] escaped_data = escapeByte(data); if (ss.fits(escaped_data)) { ss.write(escaped_data); } else { ss.newline(); break; } } continue; } } } ss.fillRest(); if (read_index < replace_bytes.length) { logger_.error("The replace string was longer than the reserved placeholder."); throw new PlaceholderException(null, replace_bytes.length - read_index); } } catch (IOException e) { throw new PDFDocumentException(201, e); } } protected static byte[] readToken(byte[] bytes, int index) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); for (; index < bytes.length; index++) { byte data = bytes[index]; // byte [] escaped_data = escapeByte(data); baos.write(data); if (canBreakAfter(data)) { break; } } return baos.toByteArray(); } protected static byte[] escapeToken(byte[] token) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); for (int i = 0; i < token.length; i++) { byte[] escaped_data = escapeByte(token[i]); baos.write(escaped_data); } return baos.toByteArray(); } }