/*
 * Provenance: this source was recovered from the deletion diff of commit
 * 535a04fa05f739ec16dd81666e3b0f82dfbd442d by tknall
 * (Wed, 9 Jan 2013 15:41:29 +0000), subject
 * "pdf-as-lib maven project files moved to pdf-as-lib"
 * (git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926
 * 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c), which removed
 * src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
 * (blob 9249985, 572 lines). Page rendered by cgit v1.2.3.
 */
/**
 * Copyright 2006 by Know-Center, Graz, Austria
 * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
 * joint initiative of the Federal Chancellery Austria and Graz University of
 * Technology.
 *
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * http://www.osor.eu/eupl/
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 *
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 *
 * $Id: Placeholder.java,v 1.5 2006/10/31 08:17:50 wprinz Exp $
 */
package at.knowcenter.wag.egov.egiz.pdf;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// NOTE(review): this import appears to be unused in this file - confirm and remove.
import javax.sound.midi.SysexMessage;

import org.apache.commons.codec.net.URLCodec;
import org.apache.log4j.Logger;

import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
import at.knowcenter.wag.egov.egiz.exceptions.PlaceholderException;
import at.knowcenter.wag.exactparser.ByteArrayUtils;

/**
 * Helper class that provides functionality for dealing with placeholders and
 * replacements in pdf.
 *
 * <p>
 * All methods are static. Escaping and unescaping only cover the three
 * characters that this class itself escapes: backslash, opening parenthesis
 * and closing parenthesis.
 * </p>
 *
 * @author wprinz
 */
public abstract class Placeholder
{
  /**
   * The logger definition.
   */
  private static final Logger logger_ = ConfigLogger.getLogger(Placeholder.class);

  /**
   * Name of the charset used for all byte/String conversions (CP1252 =
   * WinAnsiEncoding).
   */
  private static final String WIN_ANSI_CHARSET = "windows-1252";

  /**
   * Escapes the String to be a suitable Literal String.
   *
   * @param data
   *          The String to be escaped.
   * @return Returns the escaped PDF String.
   */
  public static byte[] escapePDFString(byte[] data)
  {
    return escapeAll(data);
  }

  /**
   * Unescapes the PDF String.
   *
   * <p>
   * The sequences backslash-backslash, backslash-"(" and backslash-")" are
   * reduced to their second byte. Every other byte is copied unchanged; this
   * includes a backslash that is followed by any other byte and a lone
   * backslash at the very end of the data.
   * </p>
   *
   * @param data
   *          The escaped String.
   * @return Returns the unescaped String.
   */
  public static byte[] unescapePDFString(byte[] data)
  {
    ByteArrayOutputStream baos = new ByteArrayOutputStream(data.length);
    for (int i = 0; i < data.length; i++)
    {
      byte current = data[i];
      // Only look ahead when there actually is a following byte; a trailing
      // backslash would otherwise read past the end of the array.
      if (current == '\\' && i + 1 < data.length)
      {
        byte next = data[i + 1];
        if (next == '\\' || next == '(' || next == ')')
        {
          baos.write(next);
          i++;
          continue;
        }
      }
      baos.write(current);
    }
    return baos.toByteArray();
  }

  /**
   * Reconstructs the string from a partition of placeholders.
   *
   * <p>
   * The bytes of all StringInfo ranges are concatenated (zero bytes are
   * skipped), unescaped, decoded as windows-1252 and - if the encoding marker
   * equals {@code BinarySignature.ENCODING_URL} - URL decoded.
   * </p>
   *
   * @param pdf
   *          The PDF to read the string from.
   * @param sis
   *          The list of StringInfo objects that specify the bytes of the
   *          string in the pdf.
   * @param enc
   *          The encoding marker. It is compared against
   *          {@code BinarySignature.ENCODING_WIN} and
   *          {@code BinarySignature.ENCODING_URL}; any other value only
   *          causes a warning to be logged.
   * @return Returns the extracted and reconverted string.
   * @throws IOException
   *           Forwarded exception.
   */
  public static String reconstructStringFromPartition(byte[] pdf, List sis,
      byte[] enc) throws IOException
  {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    Iterator it = sis.iterator();
    while (it.hasNext())
    {
      StringInfo si = (StringInfo) it.next();

      int end = si.string_start + si.string_length;
      for (int i = si.string_start; i < end; i++)
      {
        if (pdf[i] != 0)
        {
          baos.write(pdf[i]);
        }
      }
    }

    byte[] unescaped_bytes = unescapePDFString(baos.toByteArray());

    if (!ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_WIN) && !ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL))
    {
      String enc_str = new String(enc, "US-ASCII");
      logger_.warn("The encoding " + enc_str + " is not known by this application - trying to proceed anyways.");
    }

    String str = new String(unescaped_bytes, WIN_ANSI_CHARSET);
    if (ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL))
    {
      str = unapplyURLEncoding(str);
    }

    return str;
  }

  /**
   * Prepares the given String to a byte array that can be substituted into the
   * placeholder.
   *
   * <p>
   * No escaping is performed here; the text is only encoded as windows-1252.
   * </p>
   *
   * @param text
   *          The text to be prepared for substitution.
   * @return Returns the prepared byte array, or null if the charset is not
   *         supported by the runtime (the error is logged).
   */
  public static byte[] applyWinAnsiEncoding(String text)
  {
    try
    {
      return text.getBytes(WIN_ANSI_CHARSET);
    }
    catch (UnsupportedEncodingException e)
    {
      logger_.error(e.getMessage(), e);
      return null;
    }
  }

  /**
   * Unapplies the WinAnsi encoding.
   *
   * @param replace_bytes
   *          The bytes.
   * @return Returns the decoded String, or null if the charset is not
   *         supported by the runtime (the error is logged).
   */
  public static String unapplyWinAnsiEncoding(byte[] replace_bytes)
  {
    try
    {
      return new String(replace_bytes, WIN_ANSI_CHARSET);
    }
    catch (UnsupportedEncodingException e)
    {
      logger_.error(e.getMessage(), e);
      return null;
    }
  }

  /**
   * Applies the URL encoding to the text.
   *
   * @param text
   *          The text
   * @return Returns the URL and WinAnsi encoded text.
   * @throws RuntimeException
   *           If the text cannot be URL encoded.
   */
  public static byte[] applyURLEncoding(String text)
  {
    URLCodec utf8_url_codec = new URLCodec("UTF-8");
    String url_encoded;
    try
    {
      url_encoded = utf8_url_codec.encode(text, "UTF-8");
    }
    catch (UnsupportedEncodingException e)
    {
      throw new RuntimeException("Couldn't url encode : " + text, e);
    }
    return applyWinAnsiEncoding(url_encoded);
  }

  /**
   * Unapplies the WinAnsi and URL encoding.
   *
   * @param winansi_str
   *          The Winansi and URL text.
   * @return Returns the decoded text.
   * @throws RuntimeException
   *           If the text cannot be URL decoded.
   */
  public static String unapplyURLEncoding(String winansi_str)
  {
    URLCodec utf8_url_codec = new URLCodec("UTF-8");
    try
    {
      return utf8_url_codec.decode(winansi_str, "UTF-8");
    }
    catch (Exception e)
    {
      throw new RuntimeException("Couldn't url decode : " + winansi_str, e);
    }
  }

  /**
   * Restores the String from a previously prepared byte array.
   *
   * <p>
   * The bytes are decoded as windows-1252 and the escape sequences for ")",
   * "(" and backslash are removed, in that order. URL decoding is
   * intentionally not attempted here because a heuristic detection makes
   * problems when "+" appears in the text.
   * </p>
   *
   * @param pdf_string
   *          The byte array.
   * @return Returns the unprepared String, or null if the charset is not
   *         supported by the runtime (the error is logged).
   */
  public static String unprepareAndUnescapeString(byte[] pdf_string)
  {
    try
    {
      String text = new String(pdf_string, WIN_ANSI_CHARSET);

      // Regular expressions are used instead of String.replace(CharSequence,
      // CharSequence) so that the code does not depend on JDK 1.5.
      text = text.replaceAll("\\\\\\)", ")");
      text = text.replaceAll("\\\\\\(", "(");
      text = text.replaceAll("\\\\\\\\", "\\\\");

      return text;
    }
    catch (UnsupportedEncodingException e)
    {
      logger_.error(e.getMessage(), e);
      return null;
    }
  }

  /**
   * Checks the presence of typical URL encoded characters to tell if the string
   * is URL encoded.
   *
   * <p>
   * This heuristic checks if there are any non URL encoded characters in the
   * String, like ASCII control characters, which aren't allowed in the
   * URLEncoding characterset.
   * </p>
   *
   * @param text
   *          The text under suspicion.
   * @return Returns true if the String is URL encoded, false otherwise.
   */
  protected static boolean isURLEncoded(String text)
  {
    for (int i = 0; i < text.length(); i++)
    {
      char c = text.charAt(i);
      // Rejects control characters (0x00-0x1f), the space (0x20), DEL (0x7f)
      // and everything outside of 7 bit ASCII.
      if (c <= 0x20 || c >= 0x7F)
      {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells, if a break can occur behind the given character.
   *
   * @param character
   *          The character.
   * @return Returns true, if a break may occur behind the character, false
   *         otherwise.
   */
  protected static boolean canBreakAfter(byte character)
  {
    switch (character)
    {
      case ' ':
      case '.':
      case ',':
      case ';':
      case '-':
      case '\n':
        return true;
      default:
        return false;
    }
  }

  /**
   * Scans the given PDF content stream for literal PDF strings.
   *
   * <p>
   * A string starts behind an unescaped "(" and ends before the next unescaped
   * ")". A parenthesis counts as escaped if it is preceded by an odd number of
   * backslashes, so a string that ends with an escaped backslash is terminated
   * correctly. An unescaped "(" inside an open string restarts the string at
   * that position (nested parentheses are not supported), an unescaped ")"
   * without an open string is ignored, and a string that is still open at the
   * end of the range is dropped.
   * </p>
   *
   * @param pdf
   *          The PDF.
   * @param stream_start
   *          The start of the content stream to be scanned.
   * @param stream_next
   *          The end of the content stream.
   * @return Returns a list of StringInfo objects specifying the strings that
   *         could be found.
   */
  public static List parseStrings(byte[] pdf, int stream_start, int stream_next)
  {
    List strings = new ArrayList();
    StringInfo cur_string = null;
    for (int i = stream_start; i < stream_next; i++)
    {
      byte cur_byte = pdf[i];
      if (cur_byte != '(' && cur_byte != ')')
      {
        continue;
      }
      if (isEscaped(pdf, i))
      {
        continue;
      }

      if (cur_byte == '(')
      {
        cur_string = new StringInfo();
        cur_string.pdf = pdf;
        cur_string.string_start = i + 1;
        cur_string.string_length = -1;
      }
      else if (cur_string != null)
      {
        cur_string.string_length = i - cur_string.string_start;
        strings.add(cur_string);
        cur_string = null;
      }
    }

    return strings;
  }

  /**
   * Tells if the byte at the given index is escaped, i.e. directly preceded by
   * an odd number of backslashes.
   *
   * @param pdf
   *          The PDF.
   * @param index
   *          The index of the byte under inspection.
   * @return Returns true if the byte is escaped, false otherwise.
   */
  private static boolean isEscaped(byte[] pdf, int index)
  {
    int backslashes = 0;
    for (int i = index - 1; i >= 0 && pdf[i] == '\\'; i--)
    {
      backslashes++;
    }
    return (backslashes % 2) == 1;
  }

  /**
   * Escapes the data byte if necessary.
   *
   * <p>
   * Before bytes can be written into the pdf Strings, they have to be escaped.
   * Special care has to be taken that escaped sequences are not split due to
   * line breaks. This could have fatal consequences and usually renders the
   * whole document invalid.
   * </p>
   *
   * @param data
   *          The data byte to be escaped.
   * @return Returns a new byte array escaping the data byte. If the byte needs
   *         not to be escaped, this new array will contain only the original
   *         data byte.
   */
  public static byte[] escapeByte(byte data)
  {
    if (data == '\\' || data == '(' || data == ')')
    {
      return new byte[] { '\\', data };
    }
    return new byte[] { data };
  }

  /**
   * Replaces the placeholder with the given String breaking lines with a given
   * tolerance.
   *
   * @param pdf
   *          The PDF.
   * @param sis
   *          The list of StringInfo objects describing the positions where the
   *          String should be filled in.
   * @param replace_bytes
   *          The unescaped bytes to be filled in. Escaping is performed by this
   *          method.
   * @param tolerance
   *          The tolerance for line wrapping. The tolerance counts from the end
   *          of a StringInfo backwards to its start. If a word that starts
   *          within the tolerance doesn't fit, it is wrapped into the next
   *          line.
   * @throws PDFDocumentException
   *           Forwarded exception. A PlaceholderException carrying the number
   *           of bytes that could not be written is thrown if the replace
   *           string is longer than the reserved placeholder.
   */
  public static void replacePlaceholderWithTolerance(byte[] pdf, List sis,
      byte[] replace_bytes, int tolerance) throws PDFDocumentException
  {
    try
    {
      SplitStrings ss = new SplitStrings(pdf, sis);

      int read_index = 0;
      while (read_index < replace_bytes.length && ss.isValidLine())
      {
        byte[] token = readToken(replace_bytes, read_index);
        byte[] escaped_token = escapeToken(token);

        if (ss.fits(escaped_token))
        {
          ss.write(escaped_token);
          read_index += token.length;
        }
        else if (ss.getAvailable() < tolerance)
        {
          // The token starts within the tolerance: wrap it as a whole.
          ss.newline();
        }
        else
        {
          // Too much space would be wasted: break the token. Bytes are
          // escaped one by one so that an escape sequence is never split.
          for (; read_index < replace_bytes.length; read_index++)
          {
            byte[] escaped_data = escapeByte(replace_bytes[read_index]);
            if (!ss.fits(escaped_data))
            {
              ss.newline();
              break;
            }
            ss.write(escaped_data);
          }
        }
      }
      ss.fillRest();

      if (read_index < replace_bytes.length)
      {
        logger_.error("The replace string was longer than the reserved placeholder.");
        throw new PlaceholderException(null, replace_bytes.length - read_index);
      }
    }
    catch (IOException e)
    {
      throw new PDFDocumentException(201, e);
    }
  }

  /**
   * Reads the next token, i.e. all bytes starting at the index up to and
   * including the first byte after which a break may occur.
   *
   * @param bytes
   *          The bytes to read from.
   * @param index
   *          The index of the first byte of the token.
   * @return Returns the unescaped token. The token is empty if the index is
   *         not smaller than the length of the bytes.
   */
  protected static byte[] readToken(byte[] bytes, int index)
  {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    for (; index < bytes.length; index++)
    {
      byte data = bytes[index];
      baos.write(data);

      if (canBreakAfter(data))
      {
        break;
      }
    }

    return baos.toByteArray();
  }

  /**
   * Escapes all bytes of the token.
   *
   * @param token
   *          The unescaped token.
   * @return Returns the escaped token.
   * @throws IOException
   *           Declared for compatibility with existing callers; the current
   *           implementation does not throw it.
   */
  protected static byte[] escapeToken(byte[] token) throws IOException
  {
    return escapeAll(token);
  }

  /**
   * Escapes every byte of the data using {@link #escapeByte(byte)}.
   *
   * @param data
   *          The unescaped bytes.
   * @return Returns the escaped bytes.
   */
  private static byte[] escapeAll(byte[] data)
  {
    ByteArrayOutputStream baos = new ByteArrayOutputStream(data.length + 8);
    for (int i = 0; i < data.length; i++)
    {
      byte[] escaped_bytes = escapeByte(data[i]);
      // write(byte[], int, int) is used because, unlike write(byte[]), it is
      // not declared to throw IOException.
      baos.write(escaped_bytes, 0, escaped_bytes.length);
    }
    return baos.toByteArray();
  }
}