aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
diff options
context:
space:
mode:
authortknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2013-01-09 15:41:29 +0000
committertknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2013-01-09 15:41:29 +0000
commit535a04fa05f739ec16dd81666e3b0f82dfbd442d (patch)
tree0804f301c1a9ceb303a8441b7b29244fc8eb7ff0 /src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
parent1efaf6fd5619dfa95c9d7e8c71eda4c2ffba4998 (diff)
downloadpdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.gz
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.bz2
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.zip
pdf-as-lib maven project files moved to pdf-as-lib
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java572
1 files changed, 0 insertions, 572 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
deleted file mode 100644
index 9249985..0000000
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
+++ /dev/null
@@ -1,572 +0,0 @@
-/**
- * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
- * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
- * joint initiative of the Federal Chancellery Austria and Graz University of
- * Technology.
- *
- * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
- * the European Commission - subsequent versions of the EUPL (the "Licence");
- * You may not use this work except in compliance with the Licence.
- * You may obtain a copy of the Licence at:
- * http://www.osor.eu/eupl/
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the Licence is distributed on an "AS IS" basis,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the Licence for the specific language governing permissions and
- * limitations under the Licence.
- *
- * This product combines work with different licenses. See the "NOTICE" text
- * file for details on the various modules and licenses.
- * The "NOTICE" text file is part of the distribution. Any derivative works
- * that you distribute must include a readable copy of the "NOTICE" text file.
- *
- * $Id: Placeholder.java,v 1.5 2006/10/31 08:17:50 wprinz Exp $
- */
-package at.knowcenter.wag.egov.egiz.pdf;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import javax.sound.midi.SysexMessage;
-
-import org.apache.commons.codec.net.URLCodec;
-import org.apache.log4j.Logger;
-
-import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
-import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
-import at.knowcenter.wag.egov.egiz.exceptions.PlaceholderException;
-import at.knowcenter.wag.exactparser.ByteArrayUtils;
-
-/**
- * Helper class that provides functionality for dealing with placeholders and
- * replacements in pdf.
- *
- * @author wprinz
- */
-public abstract class Placeholder
-{
- /**
- * The logger definition.
- */
- private static final Logger logger_ = ConfigLogger.getLogger(Placeholder.class);
-
- /**
-  * Escapes raw bytes so that they can be embedded in a PDF literal string.
-  *
-  * <p>
-  * Every byte is passed through {@link #escapeByte(byte)} and the results are
-  * concatenated in the original order.
-  * </p>
-  *
-  * @param data
-  *          The unescaped bytes.
-  * @return Returns a new array holding the escaped bytes.
-  */
- public static byte[] escapePDFString(byte[] data)
- {
-   ByteArrayOutputStream baos = new ByteArrayOutputStream();
-   for (int i = 0; i < data.length; i++)
-   {
-     byte[] escaped_bytes = escapeByte(data[i]);
-     // The three argument write of ByteArrayOutputStream declares no checked
-     // exception. This avoids a catch block that could never be entered and
-     // the null result that came with it.
-     baos.write(escaped_bytes, 0, escaped_bytes.length);
-   }
-   return baos.toByteArray();
- }
-
- /**
-  * Reverses the escaping of a PDF literal string.
-  *
-  * <p>
-  * A backslash that is directly followed by a backslash, an opening
-  * parenthesis or a closing parenthesis is dropped and the following byte is
-  * kept. All other bytes are copied unchanged. This includes a backslash that
-  * is followed by any other byte, and a single backslash at the very end of
-  * the input, which has no following byte to combine with.
-  * </p>
-  *
-  * @param data
-  *          The escaped bytes.
-  * @return Returns a new array holding the unescaped bytes.
-  */
- public static byte[] unescapePDFString(byte[] data)
- {
-   ByteArrayOutputStream baos = new ByteArrayOutputStream();
-   for (int i = 0; i < data.length; i++)
-   {
-     // Only look ahead if there is a next byte. The unguarded access threw an
-     // ArrayIndexOutOfBoundsException for input ending with a backslash.
-     if (data[i] == '\\' && i + 1 < data.length)
-     {
-       byte next = data[i + 1];
-       if (next == '\\' || next == '(' || next == ')')
-       {
-         baos.write(next);
-         i++;
-         continue;
-       }
-     }
-     baos.write(data[i]);
-   }
-   return baos.toByteArray();
- }
-
- /**
-  * Rebuilds the text that is stored in a series of string partitions of the
-  * PDF.
-  *
-  * <p>
-  * The bytes of all partitions are concatenated in list order, skipping bytes
-  * with the value 0. The result is unescaped using
-  * {@link #unescapePDFString(byte[])} and decoded as windows-1252. If
-  * ByteArrayUtils.compareByteArrays reports that the given encoding matches
-  * BinarySignature.ENCODING_URL, the text is additionally passed through
-  * {@link #unapplyURLEncoding(String)}.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF to read the bytes from.
-  * @param sis
-  *          The list of StringInfo objects that specify the byte ranges of the
-  *          partitions in the pdf.
-  * @param enc
-  *          The bytes identifying the encoding of the stored text. If they
-  *          match neither BinarySignature.ENCODING_WIN nor
-  *          BinarySignature.ENCODING_URL, a warning is logged and processing
-  *          continues.
-  * @return Returns the extracted and decoded text.
-  * @throws IOException
-  *           Forwarded exception.
-  */
- public static String reconstructStringFromPartition(byte[] pdf, List sis,
-     byte[] enc) throws IOException
- {
-   ByteArrayOutputStream collected = new ByteArrayOutputStream();
-   for (Iterator it = sis.iterator(); it.hasNext();)
-   {
-     StringInfo part = (StringInfo) it.next();
-
-     int pos = part.string_start;
-     while (pos < part.string_start + part.string_length)
-     {
-       byte value = pdf[pos];
-       if (value != 0)
-       {
-         collected.write(value);
-       }
-       pos++;
-     }
-   }
-
-   // Closing a ByteArrayOutputStream has no effect, so the buffer is simply
-   // read out here.
-   byte[] unescaped_bytes = unescapePDFString(collected.toByteArray());
-
-   boolean known_encoding = ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_WIN) || ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL);
-   if (!known_encoding)
-   {
-     logger_.warn("The encoding " + new String(enc, "US-ASCII") + " is not known by this application - trying to proceed anyways.");
-   }
-
-   String text = new String(unescaped_bytes, "windows-1252");
-   if (!ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL))
-   {
-     return text;
-   }
-   return unapplyURLEncoding(text);
- }
-
- /**
-  * Encodes the given text as windows-1252 so that it can be substituted into
-  * a placeholder.
-  *
-  * <p>
-  * No escaping of backslashes or parentheses is performed by this method.
-  * </p>
-  *
-  * @param text
-  *          The text to be encoded.
-  * @return Returns the encoded bytes, or null if the runtime does not support
-  *         the charset (the problem is logged in this case).
-  */
- public static byte[] applyWinAnsiEncoding(String text)
- {
-   try
-   {
-     // CP1252 = WinAnsiEncoding
-     return text.getBytes("windows-1252");
-   }
-   catch (UnsupportedEncodingException e)
-   {
-     logger_.error(e.getMessage(), e);
-     return null;
-   }
- }
-
- /**
-  * Decodes windows-1252 encoded bytes into a String.
-  *
-  * @param replace_bytes
-  *          The windows-1252 encoded bytes.
-  * @return Returns the decoded String, or null if the runtime does not
-  *         support the charset (the problem is logged in this case).
-  */
- public static String unapplyWinAnsiEncoding(byte[] replace_bytes)
- {
-   String decoded = null;
-   try
-   {
-     decoded = new String(replace_bytes, "windows-1252");
-   }
-   catch (UnsupportedEncodingException e)
-   {
-     logger_.error(e.getMessage(), e);
-   }
-   return decoded;
- }
-
- /**
-  * URL encodes the text using UTF-8 and converts the result to bytes.
-  *
-  * <p>
-  * The URL encoded text is turned into bytes by
-  * {@link #applyWinAnsiEncoding(String)}.
-  * </p>
-  *
-  * @param text
-  *          The text to be encoded.
-  * @return Returns the bytes of the URL encoded text as delivered by
-  *         {@link #applyWinAnsiEncoding(String)}.
-  * @throws RuntimeException
-  *           If the URL codec reports an unsupported encoding.
-  */
- public static byte[] applyURLEncoding(String text)
- {
-   String url_encoded;
-   try
-   {
-     url_encoded = new URLCodec("UTF-8").encode(text, "UTF-8");
-   }
-   catch (UnsupportedEncodingException e)
-   {
-     throw new RuntimeException("Couldn't url encode : " + text, e);
-   }
-   return applyWinAnsiEncoding(url_encoded);
- }
-
- /**
-  * URL decodes the given text using UTF-8.
-  *
-  * <p>
-  * This method only reverses the URL encoding. The conversion from bytes to
-  * the String passed in here has to be done by the caller.
-  * </p>
-  *
-  * @param winansi_str
-  *          The URL encoded text.
-  * @return Returns the decoded text.
-  * @throws RuntimeException
-  *           If the text cannot be decoded; the original exception is
-  *           attached as cause.
-  */
- public static String unapplyURLEncoding(String winansi_str)
- {
-   URLCodec codec = new URLCodec("UTF-8");
-   try
-   {
-     return codec.decode(winansi_str, "UTF-8");
-   }
-   catch (Exception e)
-   {
-     throw new RuntimeException("Couldn't url decode : " + winansi_str, e);
-   }
- }
-
- /**
-  * Decodes windows-1252 bytes into a String and removes the PDF string
-  * escaping from it.
-  *
-  * <p>
-  * The replacements are applied one after the other on the whole text: first
-  * an escaped closing parenthesis, then an escaped opening parenthesis and
-  * finally an escaped backslash is replaced by the plain character.
-  * </p>
-  *
-  * @param pdf_string
-  *          The escaped, windows-1252 encoded bytes.
-  * @return Returns the decoded and unescaped String, or null if the runtime
-  *         does not support the charset (the problem is logged in this
-  *         case).
-  */
- public static String unprepareAndUnescapeString(byte[] pdf_string)
- {
-   String decoded;
-   try
-   {
-     decoded = new String(pdf_string, "windows-1252");
-   }
-   catch (UnsupportedEncodingException e)
-   {
-     logger_.error(e.getMessage(), e);
-     return null;
-   }
-
-   // URL decoding is deliberately not attempted here, it makes problems when
-   // "+" appears in the text.
-
-   // NOTE: the regular expression based replaceAll is used instead of
-   // String.replace(CharSequence, CharSequence); the original author marked
-   // this variant as still to be tested.
-   String closing_restored = decoded.replaceAll("\\\\\\)", ")");
-   String parens_restored = closing_restored.replaceAll("\\\\\\(", "(");
-   return parens_restored.replaceAll("\\\\\\\\", "\\\\");
- }
-
- /**
-  * Heuristically tells if the text may be URL encoded.
-  *
-  * <p>
-  * The text is accepted if it consists only of characters in the range 0x21
-  * to 0x7E. A space, an ASCII control character (0x00 to 0x1F), the DEL
-  * character (0x7F) or any character from 0x80 upwards makes the check fail.
-  * The empty text is accepted.
-  * </p>
-  *
-  * @param text
-  *          The text under suspicion.
-  * @return Returns true if no character outside the accepted range was found,
-  *         false otherwise.
-  */
- protected static boolean isURLEncoded(String text)
- {
-   int length = text.length();
-   int pos = 0;
-   while (pos < length)
-   {
-     char c = text.charAt(pos);
-     boolean accepted = 0x20 < c && c < 0x7F;
-     if (!accepted)
-     {
-       return false;
-     }
-     pos++;
-   }
-   return true;
- }
-
- /**
-  * Tells if a line break may occur behind the given character.
-  *
-  * <p>
-  * Breaks are allowed behind a space, a full stop, a comma, a semicolon, a
-  * hyphen and a line feed.
-  * </p>
-  *
-  * @param character
-  *          The character.
-  * @return Returns true, if a break may occur behind the character, false
-  *         otherwise.
-  */
- protected static boolean canBreakAfter(byte character)
- {
-   switch (character)
-   {
-     case ' ':
-     case '.':
-     case ',':
-     case ';':
-     case '-':
-     case '\n':
-       return true;
-     default:
-       return false;
-   }
- }
-
- /**
-  * Scans the given PDF content stream for literal PDF strings.
-  *
-  * <p>
-  * A string starts behind an unescaped opening parenthesis and ends in front
-  * of the next unescaped closing parenthesis. A parenthesis is escaped if it
-  * is directly preceded by an odd number of backslashes; an even number means
-  * that the backslashes escape each other, so a string whose content ends
-  * with an escaped backslash is terminated correctly.
-  * </p>
-  *
-  * <p>
-  * An opening parenthesis that is found while a string is already open
-  * restarts the string at the new position. A closing parenthesis without an
-  * open string is ignored. NOTE: unescaped balanced parentheses nested inside
-  * a string, which PDF permits, are not tracked by this scanner.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF.
-  * @param stream_start
-  *          The start of the content stream to be scanned.
-  * @param stream_next
-  *          The end of the content stream (exclusive).
-  * @return Returns a list of StringInfo objects specifying the strings that
-  *         could be found. A string that is still open at the end of the
-  *         scanned range is not part of the list.
-  */
- public static List parseStrings(byte[] pdf, int stream_start, int stream_next)
- {
-   List strings = new ArrayList();
-   StringInfo cur_string = null;
-   for (int i = stream_start; i < stream_next; i++)
-   {
-     byte cur_byte = pdf[i];
-     if (cur_byte != '(' && cur_byte != ')')
-     {
-       continue;
-     }
-     if (isEscapedAt(pdf, i))
-     {
-       continue;
-     }
-
-     if (cur_byte == '(')
-     {
-       cur_string = new StringInfo();
-       cur_string.pdf = pdf;
-       cur_string.string_start = i + 1;
-       cur_string.string_length = -1;
-       continue;
-     }
-
-     // Closing parenthesis: without an open string there is nothing to
-     // terminate (this used to cause a NullPointerException).
-     if (cur_string == null)
-     {
-       continue;
-     }
-     cur_string.string_length = i - cur_string.string_start;
-     strings.add(cur_string);
-     cur_string = null;
-   }
-
-   return strings;
- }
-
- /**
-  * Tells if the byte at the given index is escaped, i.e. directly preceded by
-  * an odd number of backslashes. The look-behind stops at the start of the
-  * array, so index 0 is never escaped.
-  *
-  * @param pdf
-  *          The PDF.
-  * @param index
-  *          The index of the byte under test.
-  * @return Returns true if the byte is escaped, false otherwise.
-  */
- private static boolean isEscapedAt(byte[] pdf, int index)
- {
-   int backslashes = 0;
-   for (int i = index - 1; i >= 0 && pdf[i] == '\\'; i--)
-   {
-     backslashes++;
-   }
-   return (backslashes % 2) == 1;
- }
-
- /**
-  * Escapes the data byte if necessary.
-  *
-  * <p>
-  * A backslash, an opening parenthesis and a closing parenthesis are prefixed
-  * with a backslash. All other bytes are returned as they are. Callers that
-  * wrap lines must take care not to split the two bytes of an escape
-  * sequence, because a split sequence usually renders the whole document
-  * invalid.
-  * </p>
-  *
-  * @param data
-  *          The data byte to be escaped.
-  * @return Returns a new array with two bytes (backslash and data byte) if
-  *         the byte had to be escaped, otherwise a new array that contains
-  *         only the data byte.
-  */
- public static byte[] escapeByte(byte data)
- {
-   switch (data)
-   {
-     case '\\':
-     case '(':
-     case ')':
-       return new byte[] { '\\', data };
-     default:
-       return new byte[] { data };
-   }
- }
-
- /**
-  * Fills the given bytes into the placeholder strings, wrapping lines with a
-  * given tolerance.
-  *
-  * <p>
-  * The bytes are consumed token by token (see readToken). A token whose
-  * escaped form fits into the current line is written as a whole. If it does
-  * not fit and the space reported by SplitStrings.getAvailable() is less than
-  * the tolerance, the token is retried on the next line. Otherwise the token
-  * is broken: the remaining bytes are written one by one until one does not
-  * fit any more, then the next line is started. Processing stops when all
-  * bytes are written or no valid line is left; afterwards fillRest() is
-  * called.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF.
-  * @param sis
-  *          The list of StringInfo objects describing the positions where the
-  *          bytes should be filled in.
-  * @param replace_bytes
-  *          The unescaped bytes to be filled in. Escaping is performed by this
-  *          method.
-  * @param tolerance
-  *          The tolerance for line wrapping. A token that does not fit is
-  *          moved to the next line as a whole only if less than this many
-  *          bytes are available in the current line.
-  * @throws PDFDocumentException
-  *           With error code 201 if an IOException occurs. A
-  *           PlaceholderException carrying the number of unwritten bytes is
-  *           thrown if the bytes did not fit into the placeholder.
-  */
- public static void replacePlaceholderWithTolerance(byte[] pdf, List sis,
-     byte[] replace_bytes, int tolerance) throws PDFDocumentException
- {
-   try
-   {
-     SplitStrings lines = new SplitStrings(pdf, sis);
-
-     int pos = 0;
-     while (pos < replace_bytes.length && lines.isValidLine())
-     {
-       byte[] token = readToken(replace_bytes, pos);
-       byte[] escaped_token = escapeToken(token);
-
-       if (lines.fits(escaped_token))
-       {
-         lines.write(escaped_token);
-         pos += token.length;
-         continue;
-       }
-
-       if (lines.getAvailable() < tolerance)
-       {
-         // Too little room left to be worth splitting: retry on a new line.
-         lines.newline();
-         continue;
-       }
-
-       // Break the token: write byte by byte until the line is full.
-       while (pos < replace_bytes.length)
-       {
-         byte[] escaped_data = escapeByte(replace_bytes[pos]);
-         if (!lines.fits(escaped_data))
-         {
-           lines.newline();
-           break;
-         }
-         lines.write(escaped_data);
-         pos++;
-       }
-     }
-     lines.fillRest();
-
-     if (pos < replace_bytes.length)
-     {
-       logger_.error("The replace string was longer than the reserved placeholder.");
-       throw new PlaceholderException(null, replace_bytes.length - pos);
-     }
-   }
-   catch (IOException e)
-   {
-     throw new PDFDocumentException(201, e);
-   }
- }
-
- /**
-  * Reads the next token from the given bytes.
-  *
-  * <p>
-  * The token starts at the given index and extends up to and including the
-  * first byte for which {@link #canBreakAfter(byte)} returns true. If there
-  * is no such byte, the token extends to the end of the array.
-  * </p>
-  *
-  * @param bytes
-  *          The bytes to read from.
-  * @param index
-  *          The index of the first byte of the token.
-  * @return Returns a new array holding the bytes of the token. The array is
-  *         empty if the index is not smaller than the length of the bytes.
-  */
- protected static byte[] readToken(byte[] bytes, int index)
- {
-   int end = index;
-   while (end < bytes.length)
-   {
-     byte data = bytes[end];
-     end++;
-     if (canBreakAfter(data))
-     {
-       break;
-     }
-   }
-
-   byte[] token = new byte[end - index];
-   if (token.length > 0)
-   {
-     System.arraycopy(bytes, index, token, 0, token.length);
-   }
-   return token;
- }
-
-
-
- /**
-  * Escapes all bytes of the given token.
-  *
-  * <p>
-  * Every byte is passed through {@link #escapeByte(byte)} and the results are
-  * concatenated in the original order.
-  * </p>
-  *
-  * @param token
-  *          The unescaped bytes of the token.
-  * @return Returns a new array holding the escaped bytes.
-  * @throws IOException
-  *           Forwarded exception of the stream that collects the bytes.
-  */
- protected static byte[] escapeToken(byte[] token) throws IOException
- {
-   ByteArrayOutputStream escaped = new ByteArrayOutputStream();
-   int pos = 0;
-   while (pos < token.length)
-   {
-     escaped.write(escapeByte(token[pos]));
-     pos++;
-   }
-   return escaped.toByteArray();
- }
-}