1 files changed, 572 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
new file mode 100644
index 0000000..9249985
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/Placeholder.java
@@ -0,0 +1,572 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: Placeholder.java,v 1.5 2006/10/31 08:17:50 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import javax.sound.midi.SysexMessage;
+
+import org.apache.commons.codec.net.URLCodec;
+import org.apache.log4j.Logger;
+
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+import at.knowcenter.wag.egov.egiz.exceptions.PlaceholderException;
+import at.knowcenter.wag.exactparser.ByteArrayUtils;
+
+/**
+ * Helper class that provides functionality for dealing with placeholders and
+ * replacements in pdf.
+ * 
+ * @author wprinz
+ */
+public abstract class Placeholder
+{
+  /**
+   * The log4j logger of this class, obtained through ConfigLogger.
+   */
+  private static final Logger logger_ = ConfigLogger.getLogger(Placeholder.class);
+
+  /**
+   * Escapes the String to be a suitable Literal String..
+   * 
+   * @param data
+   *          The String to be escaped.
+   * @return Returns the escaped PDF String.
+   */
+  public static byte[] escapePDFString(byte[] data)
+  {
+    try
+    {
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      for (int i = 0; i < data.length; i++)
+      {
+        byte[] escaped_bytes = escapeByte(data[i]);
+        baos.write(escaped_bytes);
+      }
+      return baos.toByteArray();
+    }
+    catch (IOException e)
+    {
+      logger_.error(e.getMessage(), e);
+      return null;
+    }
+  }
+
+  /**
+   * Unescapes the PDF String.
+   * 
+   * @param data
+   *          The escaped String.
+   * @return Returns the unescaped String.
+   */
+  public static byte[] unescapePDFString(byte[] data)
+  {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    for (int i = 0; i < data.length; i++)
+    {
+      if (data[i] == '\\' && data[i + 1] == '\\')
+      {
+        baos.write('\\');
+        i++;
+        continue;
+      }
+      if (data[i] == '\\' && data[i + 1] == '(')
+
+      {
+        baos.write('(');
+        i++;
+        continue;
+      }
+      if (data[i] == '\\' && data[i + 1] == ')')
+      {
+        baos.write(')');
+        i++;
+        continue;
+      }
+      baos.write(data[i]);
+    }
+    return baos.toByteArray();
+  }
+
+  /**
+   * Rebuilds the text that is stored in a partition of placeholder strings.
+   *
+   * <p>
+   * The non-zero bytes of all string ranges are concatenated, unescaped and
+   * decoded as windows-1252. If enc matches BinarySignature.ENCODING_URL, the
+   * text is URL decoded afterwards. An unknown encoding is only logged as a
+   * warning.
+   * </p>
+   * @param pdf
+   *          The PDF to read the string from.
+   * @param sis
+   *          The list of StringInfo objects that specify the bytes of the
+   *          string in the pdf.
+   * @param enc
+   *          The bytes naming the encoding; compared with
+   *          BinarySignature.ENCODING_WIN and BinarySignature.ENCODING_URL.
+   * @return Returns the extracted and reconverted string.
+   * @throws IOException
+   *           Forwarded exception.
+   */
+  public static String reconstructStringFromPartition(byte[] pdf, List sis,
+      byte[] enc) throws IOException
+  {
+    ByteArrayOutputStream collected = new ByteArrayOutputStream();
+    for (Iterator it = sis.iterator(); it.hasNext();)
+    {
+      StringInfo si = (StringInfo) it.next();
+      int string_end = si.string_start + si.string_length;
+      for (int pos = si.string_start; pos < string_end; pos++)
+      {
+        // zero bytes are skipped
+        if (pdf[pos] != 0)
+        {
+          collected.write(pdf[pos]);
+        }
+      }
+    }
+    collected.close();
+    byte[] unescaped_bytes = unescapePDFString(collected.toByteArray());
+    if (!(ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_WIN) || ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL)))
+    {
+      logger_.warn("The encoding " + new String(enc, "US-ASCII") + " is not known by this application - trying to proceed anyways.");
+    }
+
+    String text = new String(unescaped_bytes, "windows-1252");
+    if (ByteArrayUtils.compareByteArrays(enc, 0, BinarySignature.ENCODING_URL))
+    {
+      return unapplyURLEncoding(text);
+    }
+    return text;
+  }
+
+  /**
+   * Converts the given text into the bytes that can be substituted into the
+   * placeholder.
+   *
+   * <p>
+   * The text is encoded with the windows-1252 charset (CP1252), which is the
+   * WinAnsiEncoding. No PDF escaping is performed by this method.
+   * </p>
+   *
+   * @param text
+   *          The text to be prepared for substitution.
+   * @return Returns the prepared byte array, or null if the windows-1252
+   *         charset is not supported (the error is logged).
+   *
+   * @see #unapplyWinAnsiEncoding(byte[])
+   */
+  public static byte[] applyWinAnsiEncoding(String text)
+  {
+    // Stays null if the charset lookup fails.
+    byte[] win_ansi_bytes = null;
+
+    try
+    {
+      // CP1252 = WinAnsiEncoding
+      win_ansi_bytes = text.getBytes("windows-1252");
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      // Best effort: report the problem and hand null back to the caller.
+      logger_.error(e.getMessage(), e);
+    }
+
+    return win_ansi_bytes;
+  }
+
+  /**
+   * Unapplies the WinAnsi encoding, i.e. decodes the bytes as windows-1252.
+   *
+   * @param replace_bytes
+   *          The windows-1252 encoded bytes.
+   * @return Returns the decoded String, or null if the windows-1252 charset
+   *         is not supported (the error is logged).
+   */
+  public static String unapplyWinAnsiEncoding(byte[] replace_bytes)
+  {
+    String decoded = null;
+    try
+    {
+      decoded = new String(replace_bytes, "windows-1252");
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      logger_.error(e.getMessage(), e);
+    }
+
+    return decoded;
+  }
+
+  /**
+   * URL encodes the text using UTF-8 and converts the result to WinAnsi bytes.
+   *
+   * @param text
+   *          The text
+   * @return Returns the URL and WinAnsi encoded text.
+   * @throws RuntimeException
+   *           If the URL encoding fails because the charset is not supported.
+   */
+  public static byte[] applyURLEncoding(String text)
+  {
+    String url_encoded;
+    try
+    {
+      url_encoded = new URLCodec("UTF-8").encode(text, "UTF-8");
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      throw new RuntimeException("Couldn't url encode : " + text, e);
+    }
+    return applyWinAnsiEncoding(url_encoded);
+  }
+
+  /**
+   * URL decodes the given (already WinAnsi decoded) text using UTF-8.
+   *
+   * @param winansi_str
+   *          The Winansi and URL text.
+   * @return Returns the decoded text.
+   * @throws RuntimeException
+   *           If the text cannot be URL decoded; the cause is attached.
+   */
+  public static String unapplyURLEncoding(String winansi_str)
+  {
+    URLCodec utf8_url_codec = new URLCodec("UTF-8");
+    try
+    {
+      return utf8_url_codec.decode(winansi_str, "UTF-8");
+    }
+    catch (Exception e)
+    {
+      // deliberately broad: every decoding problem is reported the same way
+      throw new RuntimeException("Couldn't url decode : " + winansi_str, e);
+    }
+  }
+
+  /**
+   * Restores the String from a previously prepared byte array.
+   *
+   * <p>
+   * The bytes are decoded as windows-1252 and the escape sequences for ')',
+   * '(' and the backslash are resolved, in this order. URL decoding is not
+   * performed here.
+   * </p>
+   *
+   * @param pdf_string
+   *          The byte array.
+   * @return Returns the unprepared String, or null if the windows-1252 charset
+   *         is not supported (the error is logged).
+   */
+  public static String unprepareAndUnescapeString(byte[] pdf_string)
+  {
+    String decoded;
+    try
+    {
+      decoded = new String(pdf_string, "windows-1252");
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      logger_.error(e.getMessage(), e);
+      return null;
+    }
+
+    // String.replaceAll (available since JDK 1.4) takes a regular expression
+    // as its first argument, hence the doubled escaping of the backslashes.
+    // escaped closing parenthesis
+    decoded = decoded.replaceAll("\\\\\\)", ")");
+    // escaped opening parenthesis
+    decoded = decoded.replaceAll("\\\\\\(", "(");
+    // escaped backslash - resolved last so its result is not unescaped again
+    decoded = decoded.replaceAll("\\\\\\\\", "\\\\");
+
+    return decoded;
+  }
+
+  /**
+   * Heuristically tells whether the text may be URL encoded.
+   *
+   * <p>
+   * The text is rejected as soon as it contains a character that is not
+   * allowed in the URLEncoding characterset: a space, an ASCII control
+   * character (0x00 - 0x1F and 0x7F) or any character of 0x80 and above. A
+   * text without such characters, including the empty text, is accepted.
+   * </p>
+   *
+   * <p>
+   * This is only a plausibility check: a true result does not prove that the
+   * text has actually been URL encoded.
+   * </p>
+   *
+   * @param text
+   *          The text under suspicion.
+   * @return Returns true if the String is URL encoded, false otherwise.
+   */
+  protected static boolean isURLEncoded(String text)
+  {
+    int length = text.length();
+    int pos = 0;
+    while (pos < length)
+    {
+      char c = text.charAt(pos++);
+      // 0x00 - 0x1F are control characters and 0x20 is the space
+      boolean control_or_space = c <= 0x20;
+      // 0x7F is DEL, everything above is outside of ASCII
+      boolean del_or_non_ascii = c >= 0x7F;
+      if (control_or_space || del_or_non_ascii)
+      {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Tells, if a break can occur behind the given character.
+   *
+   * @param character
+   *          The character.
+   * @return Returns true for blank, '.', ',', ';', '-' and line feed, false
+   *         for every other character.
+   */
+  protected static boolean canBreakAfter(byte character)
+  {
+    return " .,;-\n".indexOf(character) >= 0;
+  }
+
+  /**
+   * Scans the given PDF content stream for literal PDF strings. The byte after
+   * an unescaped backslash is string data; nested parentheses are not tracked
+   * and a ')' without an open string is ignored.
+   *
+   * @param pdf The PDF.
+   * @param stream_start The start of the content stream to be scanned.
+   * @param stream_next The end of the content stream.
+   * @return Returns a list of StringInfo objects for the strings found.
+   */
+  public static List parseStrings(byte[] pdf, int stream_start, int stream_next)
+  {
+    List strings = new ArrayList();
+    StringInfo cur_string = null;
+    boolean escaped = false; // previous byte was an unescaped backslash
+    for (int i = stream_start; i < stream_next; i++)
+    {
+      byte cur_byte = pdf[i];
+      if (escaped)
+      {
+        escaped = false;
+      }
+      else if (cur_byte == '\\')
+      {
+        escaped = true;
+      }
+      else if (cur_byte == '(')
+      {
+        cur_string = new StringInfo();
+        cur_string.pdf = pdf;
+        cur_string.string_start = i + 1;
+        cur_string.string_length = -1;
+      }
+      else if (cur_byte == ')' && cur_string != null)
+      {
+        cur_string.string_length = i - cur_string.string_start;
+        strings.add(cur_string);
+        cur_string = null;
+      }
+    }
+    return strings;
+  }
+
+  /**
+   * Escapes the data byte if necessary.
+   * 
+   * <p>
+   * Before bytes can be written into the pdf Strings, they have to be escaped.
+   * Special care has to be taken that escaped sequences are not split due to
+   * line breaks. This could have fatal consequences and usually renders the
+   * whole document invalid.
+   * </p>
+   * 
+   * @param data
+   *          The data byte to be escaped.
+   * @return Returns a new byte array escaping the data byte. If the byte needs
+   *         not to be escaped, this new array will contain only the original
+   *         data byte.
+   */
+  public static byte[] escapeByte(byte data)
+  {
+    if (data == '\\')
+    {
+      return new byte[] { '\\', '\\' };
+    }
+    if (data == '(')
+    {
+      return new byte[] { '\\', '(' };
+    }
+    if (data == ')')
+    {
+      return new byte[] { '\\', ')' };
+    }
+    return new byte[] { data };
+  }
+
+  /**
+   * Fills the given bytes into the placeholder, wrapping lines with a tolerance.
+   *
+   * <p>
+   * The bytes are processed token by token (see readToken). A token whose
+   * escaped form fits into the current line is written as a whole. If it does
+   * not fit and less than tolerance bytes are available, a new line is started
+   * and the token is tried again. Otherwise the token is broken: bytes are
+   * written one by one until the line is full, then a new line is started.
+   * At the end SplitStrings.fillRest() is called.
+   * </p>
+   *
+   * @param pdf
+   *          The PDF.
+   * @param sis
+   *          The list of StringInfo objects describing the positions where the
+   *          String should be filled in.
+   * @param replace_bytes
+   *          The unescaped bytes to be filled in. Escaping is performed by this
+   *          method.
+   * @param tolerance
+   *          The tolerance for line wrapping. The tolerance counts from the end
+   *          of a StringInfo backwards to its start. If a word that starts
+   *          within the tolerance doesn't fit, it is wrapped into the next
+   *          line.
+   * @throws PDFDocumentException
+   *           A PlaceholderException if not all bytes could be written; an
+   *           IOException is wrapped (constructed with 201 and the cause).
+   */
+  public static void replacePlaceholderWithTolerance(byte[] pdf, List sis,
+      byte[] replace_bytes, int tolerance) throws PDFDocumentException
+  {
+    try
+    {
+      SplitStrings ss = new SplitStrings(pdf, sis);
+      final int total = replace_bytes.length;
+
+      // Stop when everything is written or isValidLine() reports false.
+      int read_index = 0;
+      while (read_index < total && ss.isValidLine())
+      {
+        byte[] token = readToken(replace_bytes, read_index);
+        byte[] escaped_token = escapeToken(token);
+
+        if (ss.fits(escaped_token))
+        {
+          // the whole token fits into the current line
+          ss.write(escaped_token);
+          read_index += token.length;
+        }
+        else if (ss.getAvailable() < tolerance)
+        {
+          // little room is left: try the same token again on the next line
+          ss.newline();
+        }
+        else
+        {
+          // break the token: write it byte by byte (each byte together with
+          // its escaping) until the current line is full
+          boolean line_full = false;
+          while (!line_full && read_index < total)
+          {
+            byte[] escaped_data = escapeByte(replace_bytes[read_index]);
+            if (ss.fits(escaped_data))
+            {
+              ss.write(escaped_data);
+              read_index++;
+            }
+            else
+            {
+              ss.newline();
+              line_full = true;
+            }
+          }
+        }
+      }
+
+      ss.fillRest();
+
+      // bytes that could not be placed mean that the placeholder is too small
+      int remaining = total - read_index;
+      if (remaining > 0)
+      {
+        logger_.error("The replace string was longer than the reserved placeholder.");
+        throw new PlaceholderException(null, remaining);
+      }
+
+    }
+    catch (IOException e)
+    {
+      throw new PDFDocumentException(201, e);
+    }
+  }
+
+  /**
+   * Reads the next token, i.e. the bytes from index up to and including the
+   * first byte after which a break can occur, or up to the end of the bytes.
+   *
+   * @param bytes The bytes to read from.
+   * @param index The index of the first byte of the token.
+   * @return Returns the (unescaped) bytes of the token.
+   */
+  protected static byte[] readToken(byte[] bytes, int index)
+  {
+    ByteArrayOutputStream token = new ByteArrayOutputStream();
+    boolean at_break = false;
+    for (int pos = index; !at_break && pos < bytes.length; pos++)
+    {
+      token.write(bytes[pos]);
+      at_break = canBreakAfter(bytes[pos]);
+    }
+    return token.toByteArray();
+  }
+
+  /**
+   * Escapes every byte of the token using escapeByte.
+   * @param token The unescaped bytes of the token.
+   * @return Returns the escaped bytes of the token.
+   */
+  protected static byte[] escapeToken(byte[] token) throws IOException
+  {
+    ByteArrayOutputStream escaped = new ByteArrayOutputStream();
+    for (int pos = 0; pos < token.length; pos++)
+    {
+      escaped.write(escapeByte(token[pos]));
+    }
+    return escaped.toByteArray();
+  }
+}