/**
* Copyright (c) 2006 by Know-Center, Graz, Austria
*
* This software is the confidential and proprietary information of Know-Center,
* Graz, Austria. You shall not disclose such Confidential Information and shall
* use it only in accordance with the terms of the license agreement you entered
* into with Know-Center.
*
* KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
* THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR
* NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY
* LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS
* DERIVATIVES.
*
* $Id: Placeholder.java,v 1.5 2006/10/31 08:17:50 wprinz Exp $
*/
package at.knowcenter.wag.egov.egiz.pdf;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.codec.net.URLCodec;
import org.apache.log4j.Logger;
import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
import at.knowcenter.wag.egov.egiz.exceptions.PlaceholderException;
import at.knowcenter.wag.exactparser.ByteArrayUtils;
/**
* Helper class that provides functionality for dealing with placeholders and
* replacements in pdf.
*
* @author wprinz
*/
public abstract class Placeholder
{
/**
 * The logger used by the static helper methods of this class. It is obtained
 * from ConfigLogger for the Placeholder class.
 */
private static final Logger logger_ = ConfigLogger.getLogger(Placeholder.class);
/**
 * Produces the escaped form of raw string bytes so that they can be embedded
 * into a PDF literal string.
 *
 * Every input byte is passed through {@link #escapeByte(byte)} and the
 * results are concatenated in input order.
 *
 * @param data
 *          The raw bytes to be escaped.
 * @return Returns a new array holding the escaped bytes.
 */
public static byte[] escapePDFString(byte[] data)
{
  ByteArrayOutputStream escaped = new ByteArrayOutputStream();
  int pos = 0;
  while (pos < data.length)
  {
    byte[] chunk = escapeByte(data[pos]);
    // This overload of ByteArrayOutputStream declares no IOException, so no
    // error path is needed for writing into memory.
    escaped.write(chunk, 0, chunk.length);
    pos++;
  }
  return escaped.toByteArray();
}
/**
 * Unescapes the bytes of a PDF literal string.
 *
 * The two byte sequences backslash-backslash, backslash-"(" and
 * backslash-")" are replaced by their second byte. All other bytes are
 * copied unchanged; this includes a backslash that is followed by any other
 * byte and a single backslash at the very end of the data.
 *
 * @param data
 *          The escaped String.
 * @return Returns the unescaped String.
 */
public static byte[] unescapePDFString(byte[] data)
{
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  for (int i = 0; i < data.length; i++)
  {
    byte current = data[i];
    // Only look ahead if there is a following byte. Without this check a
    // trailing backslash caused an ArrayIndexOutOfBoundsException.
    if (current == '\\' && i + 1 < data.length)
    {
      byte next = data[i + 1];
      if (next == '\\' || next == '(' || next == ')')
      {
        baos.write(next);
        // skip the byte that has just been consumed as part of the escape
        i++;
        continue;
      }
    }
    baos.write(current);
  }
  return baos.toByteArray();
}
/**
 * Rebuilds the text that is stored in a sequence of placeholder strings.
 *
 * The bytes of all given StringInfo ranges are concatenated in list order,
 * leaving out bytes with the value 0. The result is unescaped with
 * {@link #unescapePDFString(byte[])} and decoded as windows-1252. If the
 * encoding marker matches BinarySignature.ENCODING_URL, the decoded text is
 * additionally passed through {@link #unapplyURLEncoding(String)}.
 *
 * @param pdf
 *          The PDF to read the string bytes from.
 * @param sis
 *          The list of StringInfo objects that specify the bytes of the
 *          string in the pdf.
 * @param enc
 *          The encoding marker. It is compared against
 *          BinarySignature.ENCODING_WIN and BinarySignature.ENCODING_URL; a
 *          marker that matches neither only causes a warning and the text is
 *          still decoded as windows-1252.
 * @return Returns the extracted and reconverted string.
 * @throws IOException
 *           Forwarded exception.
 */
public static String reconstructStringFromPartition(byte[] pdf, List sis,
    byte[] enc) throws IOException
{
  ByteArrayOutputStream collected = new ByteArrayOutputStream();
  for (Iterator parts = sis.iterator(); parts.hasNext();)
  {
    StringInfo part = (StringInfo) parts.next();
    int pos = part.string_start;
    while (pos < part.string_start + part.string_length)
    {
      byte value = pdf[pos];
      // NOTE(review): zero bytes are presumably filler of a partially used
      // placeholder - confirm against the code that writes the placeholder.
      if (value != 0)
      {
        collected.write(value);
      }
      pos++;
    }
  }
  collected.close();

  byte[] unescaped = unescapePDFString(collected.toByteArray());

  boolean known_encoding = ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_WIN)
      || ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL);
  if (!known_encoding)
  {
    String enc_name = new String(enc, "US-ASCII");
    logger_.warn("The encoding " + enc_name + " is not known by this application - trying to proceed anyways.");
  }

  String decoded = new String(unescaped, "windows-1252");
  if (ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL))
  {
    return unapplyURLEncoding(decoded);
  }
  return decoded;
}
/**
 * Encodes the given text as windows-1252 bytes so that it can be substituted
 * into a placeholder.
 *
 * No PDF escaping is applied by this method.
 *
 * @param text
 *          The text to be prepared for substitution.
 * @return Returns the encoded bytes, or null if the runtime does not provide
 *         the windows-1252 charset (the problem is logged in that case).
 */
public static byte[] applyWinAnsiEncoding(String text)
{
  try
  {
    // CP1252 = WinAnsiEncoding
    return text.getBytes("windows-1252");
  }
  catch (UnsupportedEncodingException unsupported)
  {
    logger_.error(unsupported.getMessage(), unsupported);
    return null;
  }
}
/**
 * Decodes windows-1252 bytes back into a String.
 *
 * @param replace_bytes
 *          The windows-1252 encoded bytes.
 * @return Returns the decoded String, or null if the runtime does not provide
 *         the windows-1252 charset (the problem is logged in that case).
 */
public static String unapplyWinAnsiEncoding(byte[] replace_bytes)
{
  String decoded = null;
  try
  {
    decoded = new String(replace_bytes, "windows-1252");
  }
  catch (UnsupportedEncodingException unsupported)
  {
    // decoded stays null, which is what the caller receives in this case
    logger_.error(unsupported.getMessage(), unsupported);
  }
  return decoded;
}
/**
 * URL encodes the text using UTF-8 and converts the result to bytes with
 * {@link #applyWinAnsiEncoding(String)}.
 *
 * @param text
 *          The text
 * @return Returns the URL and WinAnsi encoded text.
 * @throws RuntimeException
 *           If the codec reports the encoding as unsupported. The original
 *           exception is attached as cause.
 */
public static byte[] applyURLEncoding(String text)
{
  URLCodec codec = new URLCodec("UTF-8");
  String encoded;
  try
  {
    encoded = codec.encode(text, "UTF-8");
  }
  catch (UnsupportedEncodingException unsupported)
  {
    throw new RuntimeException("Couldn't url encode : " + text, unsupported);
  }
  return applyWinAnsiEncoding(encoded);
}
/**
 * Reverts the URL encoding of the given text using UTF-8.
 *
 * The argument is expected to be a String already, so the WinAnsi decoding
 * of the bytes has to be done by the caller.
 *
 * @param winansi_str
 *          The URL encoded text.
 * @return Returns the decoded text.
 * @throws RuntimeException
 *           If decoding fails for any reason. The original exception is
 *           attached as cause.
 */
public static String unapplyURLEncoding(String winansi_str)
{
  URLCodec codec = new URLCodec("UTF-8");
  try
  {
    return codec.decode(winansi_str, "UTF-8");
  }
  catch (Exception failure)
  {
    throw new RuntimeException("Couldn't url decode : " + winansi_str, failure);
  }
}
/**
 * Restores the String from a previously prepared and escaped byte array.
 *
 * The bytes are decoded as windows-1252 and the PDF escapes for ")", "(" and
 * the backslash are removed afterwards. URL decoding is deliberately not
 * attempted here because detecting it heuristically made problems when "+"
 * appears in the text.
 *
 * @param pdf_string
 *          The byte array.
 * @return Returns the unprepared String, or null if the runtime does not
 *         provide the windows-1252 charset (the problem is logged in that
 *         case).
 */
public static String unprepareAndUnescapeString(byte[] pdf_string)
{
  String decoded;
  try
  {
    decoded = new String(pdf_string, "windows-1252");
  }
  catch (UnsupportedEncodingException unsupported)
  {
    logger_.error(unsupported.getMessage(), unsupported);
    return null;
  }
  // The regular expression based replaceAll is used instead of
  // String.replace(CharSequence, CharSequence) to stay compatible with
  // JDK 1.4. The order of the three replacements must not be changed.
  String without_closing = decoded.replaceAll("\\\\\\)", ")");
  String without_parens = without_closing.replaceAll("\\\\\\(", "(");
  return without_parens.replaceAll("\\\\\\\\", "\\\\");
}
/**
 * Heuristically tells if the given text could be URL encoded.
 *
 * The text is rejected as soon as it contains a character that cannot be
 * part of URL encoded data: a space, an ASCII control character
 * (0x00-0x1F), DEL (0x7F) or any character at or above 0x80. A text without
 * such characters, including the empty text, is reported as URL encoded.
 *
 * @param text
 *          The text under suspicion.
 * @return Returns true if the String is URL encoded, false otherwise.
 */
protected static boolean isURLEncoded(String text)
{
  int length = text.length();
  for (int pos = 0; pos < length; pos++)
  {
    char current = text.charAt(pos);
    // Everything up to and including the space (0x20) and everything from
    // DEL (0x7F) upwards is not allowed.
    if (current <= ' ' || current >= 0x7F)
    {
      return false;
    }
  }
  return true;
}
/**
 * Tells, if a line break may be placed directly behind the given character.
 *
 * A break is allowed behind a space, a comma, a semicolon and a hyphen.
 *
 * @param character
 *          The character.
 * @return Returns true, if a break may occur behind the character, false
 *         otherwise.
 */
protected static boolean canBreakAfter(byte character)
{
  switch (character)
  {
    case ' ':
    case ',':
    case ';':
    case '-':
      return true;
    default:
      return false;
  }
}
/**
 * Scans the given PDF content stream for literal PDF strings.
 *
 * A string starts behind an unescaped "(" and ends before the next unescaped
 * ")". A parenthesis counts as escaped if it is preceded by an odd number of
 * backslashes, so that an escaped backslash directly in front of a
 * parenthesis does not hide the parenthesis. Only backslashes inside the
 * scanned range are considered.
 *
 * Nested, unescaped parentheses are not supported: another "(" simply
 * restarts the current string. A ")" without a preceding "(" is ignored.
 *
 * @param pdf
 *          The PDF.
 * @param stream_start
 *          The start of the content stream to be scanned.
 * @param stream_next
 *          The end of the content stream (exclusive).
 * @return Returns a list of StringInfo objects specifying the strings that
 *         could be found. The positions exclude the enclosing parentheses.
 */
public static List parseStrings(byte[] pdf, int stream_start, int stream_next)
{
  List strings = new ArrayList();
  StringInfo cur_string = null;
  for (int i = stream_start; i < stream_next; i++)
  {
    byte cur_byte = pdf[i];
    if (cur_byte != '(' && cur_byte != ')')
    {
      continue;
    }

    // Count the backslashes directly in front of the parenthesis. The lower
    // bound also prevents reading pdf[-1] for a parenthesis at index 0.
    int backslashes = 0;
    for (int j = i - 1; j >= stream_start && pdf[j] == '\\'; j--)
    {
      backslashes++;
    }
    if ((backslashes & 1) != 0)
    {
      // odd number of backslashes: the parenthesis itself is escaped
      continue;
    }

    if (cur_byte == '(')
    {
      cur_string = new StringInfo();
      cur_string.pdf = pdf;
      cur_string.string_start = i + 1;
      cur_string.string_length = -1;
      continue;
    }

    // cur_byte is an unescaped ')'
    if (cur_string == null)
    {
      // closing parenthesis without an open string - nothing to finish
      continue;
    }
    cur_string.string_length = i - cur_string.string_start;
    strings.add(cur_string);
    cur_string = null;
  }
  return strings;
}
/**
 * Escapes a single data byte for the use inside a PDF literal string.
 *
 * A backslash, "(" and ")" are prefixed with a backslash, every other byte is
 * returned as it is. Callers that wrap lines have to make sure that the two
 * bytes of an escape sequence are never separated by a line break, because a
 * split sequence usually renders the whole document invalid.
 *
 * @param data
 *          The data byte to be escaped.
 * @return Returns a new byte array escaping the data byte. If the byte needs
 *         not to be escaped, this new array will contain only the original
 *         data byte.
 */
public static byte[] escapeByte(byte data)
{
  switch (data)
  {
    case '\\':
    case '(':
    case ')':
      return new byte[] { '\\', data };
    default:
      return new byte[] { data };
  }
}
/**
 * Writes the replacement bytes into the placeholder strings, wrapping lines
 * at word boundaries where the tolerance allows it.
 *
 * The replacement is processed token by token (see
 * {@link #readToken(byte[], int)}). A token that fits into the current line
 * is written as a whole. A token that does not fit is moved to the next line
 * if less than the tolerance is left on the current line; otherwise it is
 * written byte by byte until the line is full and continued on the next
 * line. Escape sequences are always written as a unit, so they are never
 * split by a line break. After the text has been written, fillRest() is
 * invoked on the SplitStrings helper.
 *
 * @param pdf
 *          The PDF.
 * @param sis
 *          The list of StringInfo objects describing the positions where the
 *          String should be filled in.
 * @param replace_bytes
 *          The unescaped bytes to be filled in. Escaping is performed by this
 *          method.
 * @param tolerance
 *          The tolerance for line wrapping. The tolerance counts from the end
 *          of a StringInfo backwards to its start. If a word that starts
 *          within the tolerance doesn't fit, it is wrapped into the next
 *          line.
 * @throws PDFDocumentException
 *           With code 201 if an IOException occurs. A PlaceholderException
 *           is thrown if the replacement did not fit into the placeholder;
 *           it is created with the number of bytes that could not be
 *           written.
 */
public static void replacePlaceholderWithTolerance(byte[] pdf, List sis,
    byte[] replace_bytes, int tolerance) throws PDFDocumentException
{
  try
  {
    SplitStrings target = new SplitStrings(pdf, sis);
    final int total = replace_bytes.length;
    int consumed = 0;

    // Stop when everything is written or no further line is available.
    while (consumed < total && target.isValidLine())
    {
      byte[] word = readToken(replace_bytes, consumed);
      byte[] escaped_word = escapeToken(word);

      if (target.fits(escaped_word))
      {
        target.write(escaped_word);
        // advance by the unescaped length, as the index refers to
        // replace_bytes
        consumed += word.length;
        continue;
      }

      if (target.getAvailable() < tolerance)
      {
        // the word starts within the tolerance: retry it on the next line
        target.newline();
        continue;
      }

      // Too much space would be wasted: break the word byte by byte.
      while (consumed < total)
      {
        byte[] escaped_byte = escapeByte(replace_bytes[consumed]);
        if (!target.fits(escaped_byte))
        {
          // the byte is not consumed and gets written on the next line
          target.newline();
          break;
        }
        target.write(escaped_byte);
        consumed++;
      }
    }

    target.fillRest();

    if (consumed < total)
    {
      logger_.error("The replace string was longer than the reserved placeholder.");
      throw new PlaceholderException(null, total - consumed);
    }
  }
  catch (IOException e)
  {
    throw new PDFDocumentException(201, e);
  }
}
/**
 * Reads the next token from the given bytes.
 *
 * A token reaches from the given index up to and including the first byte
 * behind which a break may occur (see {@link #canBreakAfter(byte)}), or up to
 * the end of the array if there is no such byte. The bytes are not escaped.
 *
 * @param bytes
 *          The bytes to read the token from.
 * @param index
 *          The index of the first byte of the token.
 * @return Returns a new array with the bytes of the token. The array is empty
 *         if the index is at or behind the end of the bytes.
 */
protected static byte[] readToken(byte[] bytes, int index)
{
  // find the exclusive end of the token first
  int end = index;
  while (end < bytes.length)
  {
    byte current = bytes[end];
    end++;
    if (canBreakAfter(current))
    {
      break;
    }
  }

  byte[] token = new byte[end - index];
  if (token.length > 0)
  {
    System.arraycopy(bytes, index, token, 0, token.length);
  }
  return token;
}
/**
 * Escapes all bytes of a token.
 *
 * Each byte is escaped with {@link #escapeByte(byte)} and the results are
 * concatenated in order.
 *
 * @param token
 *          The unescaped bytes of the token.
 * @return Returns a new array with the escaped bytes of the token.
 * @throws IOException
 *           Forwarded exception of the underlying stream.
 */
protected static byte[] escapeToken(byte[] token) throws IOException
{
  ByteArrayOutputStream escaped = new ByteArrayOutputStream();
  int pos = 0;
  while (pos < token.length)
  {
    escaped.write(escapeByte(token[pos]));
    pos++;
  }
  return escaped.toByteArray();
}
}