aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools')
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java301
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java90
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java125
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java55
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java184
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java280
6 files changed, 1035 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java
new file mode 100644
index 0000000..5132021
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java
@@ -0,0 +1,301 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: CodingHelper.java,v 1.6 2006/10/11 07:52:36 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+import org.apache.commons.codec.binary.Base64;
+
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper;
+
+/**
+ * This class provides encoding and decoding methods and other coding methods.
+ * All methods are static!
+ *
+ * @author wlackner
+ */
+public class CodingHelper
+{
+
+ /**
+ * Static Base64 object
+ */
+ private static Base64 b64 = new Base64();
+
+ /**
+ * This method encodes a given Unicode (Java) String to UTF-8 bytes and then
+ * encodes these UTF-8 bytes to a Base64 US-ASCII (Java) String.
+ *
+ * @param plain_string
+ * to be encoded
+ * @return the UTF-8 and Base64 encoded string
+ */
+ public static String encodeUTF8AsBase64(String plain_string)
+ {
+ try
+ {
+ byte[] utf8_bytes = plain_string.getBytes("UTF-8");
+ byte[] base64_bytes = b64.encode(utf8_bytes);
+ String encoded_string = new String(base64_bytes, "US-ASCII");
+ return encoded_string;
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ e.printStackTrace();
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * This method decodes the UTF-8 bytes from a Base64 US-ASCII (Java) String
+ * and decodes the UTF-8 bytes to a unicode (Java) String.
+ *
+ * @param encoded_string
+ * to be decoded
+ * @return the Base64 and UTF-8 decoded string
+ */
+ public static String decodeUTF8FromBase64(String encoded_string)
+ {
+ try
+ {
+ byte[] base64_bytes = encoded_string.getBytes("US-ASCII");
+ byte[] utf8_bytes = b64.decode(base64_bytes);
+ String plain_string = new String(utf8_bytes, "UTF-8");
+ return plain_string;
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ e.printStackTrace();
+ throw new RuntimeException(e);
+ }
+ }
+
+ // /**
+ // * This method encodes a given string UTF-8
+ // *
+ // * @param theString to be encoded
+ // * @return the UTF-8 encoded string
+ // */
+ // public static byte[] encodeUTF8(String theString) {
+ // byte[] utf8 = null;
+ // try {
+ // utf8 = theString.getBytes("UTF-8");
+ // } catch (UnsupportedEncodingException e) {
+ // e.printStackTrace();
+ // }
+ // return utf8;
+ // }
+
+ // /**
+ // * This method decodes a given UTF-8 string
+ // *
+ // * @param theString to be decoded
+ // * @return the decoded UTF-8 string
+ // */
+ // public static String decodeUTF8(String theString) {
+ // byte[] ba = theString.getBytes();
+ // String the_string = decodeUTF8(ba);
+ // if (the_string != null) {
+ // return the_string;
+ // }
+ // return theString;
+ // }
+
+ // /**
+ // * This method decodes a given UTF-8 byte array
+ // *
+ // * @param ba the byte array to be decoded
+ // * @return the decoded UTF-8 string
+ // */
+ // public static String decodeUTF8(byte[] ba) {
+ // String the_string = null;
+ // try {
+ // the_string = new String(ba, "UTF-8");
+ // } catch (UnsupportedEncodingException e) {
+ // e.printStackTrace();
+ // }
+ // return the_string;
+ // }
+
+ /**
+ * This method decodes a given Base64 string.
+ *
+ * <p>
+ * Note that the given String must only contain Base64 characters. (The string
+ * will be converted to a byte array of "US-ASCII" (7 bit) bytes and then this
+ * byte array will be decoded using the Base64 algorithm.
+ * </p>
+ *
+ * @param theString
+ * to be decoded
+ * @return a Base64 decoded byte array
+ */
+ public static byte[] decodeBase64(String theString)
+ {
+ try
+ {
+ byte[] base64_bytes = theString.getBytes("US-ASCII");
+ return b64.decode(base64_bytes);
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ e.printStackTrace();
+ throw new RuntimeException("Very Strange: US-ASCII encoding not supported???", e);
+ }
+ }
+
+ /**
+ * This method decodes a given Base64 byte array
+ *
+ * @param ba
+ * the byte array to be decoded
+ * @return a Base64 decoded byte array
+ */
+ public static byte[] decodeBase64(byte[] ba)
+ {
+ return b64.decode(ba);
+ }
+
+ /**
+ * This method encodes a given byte array Base64
+ *
+ * @param plainString
+ * the byte array to be encoded
+ * @return the Base64 encoded string
+ */
+ public static String encodeBase64(byte[] plainString)
+ {
+ try
+ {
+ byte[] base64_bytes = b64.encode(plainString);
+ return new String(base64_bytes, "US-ASCII");
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ e.printStackTrace();
+ throw new RuntimeException("Very Strange: US-ASCII encoding not supported???", e);
+ }
+ }
+
+ // dferbas
+ /**
+ * This method builds an hash value of a given byte array.
+ *
+ * @param data
+ * the byte array to build the hash value for
+ * @param hashAlg hash algorithm for {@link MessageDigest} e.g. "SHA-1"
+ * @return the calculated hash value as a byte array
+ * @see MessageDigest
+ */
+ public static byte[] buildDigest(byte[] data, String hashAlg)
+ {
+ MessageDigest digester = null;
+ try
+ {
+ digester = MessageDigest.getInstance(hashAlg);
+ digester.update(data);
+ return digester.digest();
+ }
+ catch (NoSuchAlgorithmException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+ // dferbas
+ /**
+ * This method builds an hash value of a given byte array.
+ * @param input
+ * @param hashAlg hashAlg hash algorithm for {@link MessageDigest} e.g. "SHA-1"
+ * @return the calculated hash value as a byte array
+ * @throws IOException
+ */
+ public static byte[] buildDigest(DataSource input, String hashAlg) throws IOException
+ {
+ // PERF: digesting needs data source.
+ byte [] data = DataSourceHelper.convertDataSourceToByteArray(input);
+ return buildDigest(data, hashAlg);
+ }
+
+ /**
+ * This method escapes a given string with HTML entities.
+ *
+ * @param rawString
+ * the string to escaped
+ * @return the HTML escaped string
+ */
+ public static String htmlEscape(String rawString)
+ {
+ rawString = rawString.replaceAll("\\&", "&amp;");
+ rawString = rawString.replaceAll("\\<", "&lt;");
+ rawString = rawString.replaceAll("\\>", "&gt;");
+ rawString = rawString.replaceAll("\">", "&quot;");
+ return rawString;
+ }
+
+ /**
+ * This method checks, if a byte array contains chars that are not base64
+ * conform.
+ *
+ * @param byteArray
+ * the array to test
+ * @return boolean, if a byte array is base64 conform, false otherwise
+ */
+ public static boolean isB64(byte[] byteArray)
+ {
+ try
+ {
+ return Base64.isArrayByteBase64(byteArray);
+ }
+ catch (ArrayIndexOutOfBoundsException e)
+ {
+ return false;
+ }
+ }
+
+ /**
+ * This method checks, if a string contains chars that are not base64 conform.
+ *
+ * @param string
+ * the chars to test
+ * @return boolean, if the given string is base64 conform, false otherwise
+ */
+ public static boolean isB64(String string)
+ {
+ try
+ {
+ return Base64.isArrayByteBase64(string.getBytes());
+ }
+ catch (ArrayIndexOutOfBoundsException e)
+ {
+ return false;
+ }
+ }
+} \ No newline at end of file
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java
new file mode 100644
index 0000000..762cb71
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java
@@ -0,0 +1,90 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.knowcenter.wag.egov.egiz.sig.connectors.moa.EnvelopingBase64MOAConnector;
+
+/**
+ * Contains useful helper methods for debugging.
+ *
+ * @author wprinz
+ */
+public final class DebugHelper
+{
+ /**
+ * The log.
+ */
+ private static Log log = LogFactory.getLog(DebugHelper.class);
+
+ /**
+ * Tells, if Strings should be debugged to a file.
+ *
+ * <p>
+ * If set to false, the corresponding methods will simply do nothing.
+ * </p>
+ */
+ public static boolean debug_to_file = true;
+
+ /**
+ * The directory under which the debug files are to be stored.
+ */
+ public static File debug_dir = new File("C:\\wprinz\\Filer\\egiz2"); //$NON-NLS-1$
+
+ /**
+ * Writes the given String to a file with the given file name in UTF-8
+ * encoding.
+ *
+ * @param str
+ * The String to be written.
+ * @param file_name
+ * The file the String will be written to.
+ */
+ public static void debugStringToFile(String str, String file_name)
+ {
+ if (!debug_to_file)
+ {
+ return;
+ }
+
+ try
+ {
+ File file = new File(debug_dir, file_name);
+ FileOutputStream fos = new FileOutputStream(file);
+ OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8"); //$NON-NLS-1$
+ osw.write(str);
+ osw.close();
+ }
+ catch (Exception e)
+ {
+ log.error(e.getMessage(), e);
+ }
+ }
+}
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java
new file mode 100644
index 0000000..17b98d7
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java
@@ -0,0 +1,125 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: FileHelper.java,v 1.2 2006/05/15 12:05:21 wlackner Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.apache.log4j.Logger;
+
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+
+/**
+ * This class provides file reader and writer methods. All methods are static!
+ *
+ * @author wlackner
+ */
+public class FileHelper {
+ /**
+ * The logger definition.
+ */
+ private static final Logger logger_ = ConfigLogger.getLogger(FileHelper.class);
+
+ /**
+ * This method reads a file by reading line by line.
+ *
+ * @param fileName the file to be read
+ * @return the content string of the file
+ */
+ public static String readFromFile(String fileName) {
+ String file_string = null;
+ logger_.trace("Looking for file: " + fileName);
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
+ logger_.trace("Reading file: " + fileName);
+ String line = null;
+ file_string = "";
+ while ((line = reader.readLine()) != null) {
+ file_string += line;
+ }
+ reader.close();
+ logger_.debug("File successfully read: " + fileName);
+ } catch (FileNotFoundException e) {
+ logger_.debug("File not found: " + fileName);
+ } catch (IOException e) {
+ logger_.debug("Error reading file: " + fileName);
+ }
+ return file_string;
+ }
+
+ /**
+ * This method reads a file by reading line by line.
+ *
+ * @param fileName the file to be read
+ * @return the content string of the file
+ */
+ public static String readFromInputStream(InputStream inputStream) {
+ String file_string = null;
+ if (inputStream == null) {
+ return null;
+ }
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
+ String line = null;
+ file_string = "";
+ while ((line = reader.readLine()) != null) {
+ file_string += line;
+ }
+ reader.close();
+ } catch (IOException e) {
+ logger_.debug("Error reading inputstream.");
+ }
+ return file_string;
+ }
+
+ /**
+ * This method writes a file line by line.
+ *
+ * @param fileName the file to be written
+ * @param fileString the content to be written
+ * @return true if the file could be written sucessfully, false otherwise
+ */
+ public static boolean writeToFile(String fileName, String fileString) {
+ BufferedWriter writer;
+ try {
+ FileWriter fwriter = new FileWriter(fileName);
+ writer = new BufferedWriter(fwriter);
+ writer.write(fileString);
+ writer.close();
+ } catch (IOException e) {
+ logger_.info("File:" + fileName + " can not be written. Cause:" + e.getMessage());
+ return false;
+ }
+ return true;
+ }
+} \ No newline at end of file
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java
new file mode 100644
index 0000000..2b0b8c2
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java
@@ -0,0 +1,55 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: Normalize.java,v 1.2 2006/05/15 12:05:21 wlackner Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+
+/**
+ * Defines an interface to get access to different normalizer implementations.
+ *
+ * @author wlackner
+ */
+public interface Normalize {
+
+ /**
+ * Normalize a given text.
+ * @param rawText the raw text to normalize
+ * @param keepMultipleLineBreaks if true multiple line breaks in a row will not be normalized to a single line break
+ * @return the normalized string
+ */
+ public String normalize(String rawText, boolean keepMultipleLineBreaks);
+ /**
+ * Return the current normalizer version string.
+ * @return the version string
+ */
+ public String getVersion();
+
+ /**
+ * Returns the normalizer line separator string.
+ * @return the line separator string
+ */
+ public String getNormCR();
+
+}
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
new file mode 100644
index 0000000..57b8e6f
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
@@ -0,0 +1,184 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.Serializable;
+
+/**
+ * This ist the first version implementing a normalizer method. The normalize statements are
+ * performed by using regular expressions.
+ *
+ * @author wlackner
+ */
+public class NormalizeV01 implements Normalize, Serializable {
+// 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method
+// to allow multiple line breaks to not be normalized to a single one
+ /**
+ * SVUID.
+ */
+ private static final long serialVersionUID = 2302956630639871601L;
+
+ /**
+ * The space string
+ */
+ private final static String NORM_SP = " "; //\u0020
+ /**
+ * The line break string --> use only \n because XML-Parser ignores \r\n
+ */
+ private final static String NORM_CR = "\n"; //
+ /**
+ * The apostrophe string
+ */
+ private final static String NORM_AP = "'"; //\u0027
+ /**
+ * The quotation mark string
+ */
+ private final static String NORM_QU = "\""; //\u0022
+ /**
+ * The hypens string
+ */
+ private final static String NORM_HY = "-"; //\u002D
+ /**
+ * The current version string
+ */
+ protected static final String VERSION = "V01";
+
+ /**
+ * The empty constructor.
+ */
+ public NormalizeV01() {
+ }
+
+ /**
+ * The normalizer implementation. <br>
+ * Normalizer algorithums:
+ * <ol>
+ * <li>code all multiple line breaks as \n\n</li>
+ * <li>replace all Tabs and form feeds with spaces</li>
+ * <li>code line breaks as \n</li>
+ * <li>reduce all multiple line breaks into one line break (only if keepMultipleLineBreaks == false), code line break as \r</li>
+ * <li>replace all single line breaks with space</li>
+ * <li>normalize spaces</li>
+ * <li>remove spaces before and after a line break</li>
+ * <li>remove leading and trailing space or line break in the string</li>
+ * <li>normalize line breaks</li>
+ * <li>normalize apostrophes</li>
+ * <li>normalize quotations</li>
+ * <li>normalize hypens</li>
+ * </ol>
+ *
+ * @param rawText the text to normalize
+ * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized to a single line break
+ * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
+ */
+ public String normalize(String rawText, boolean keepMultipleLineBreaks) {
+ if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
+ return "";
+ }
+ String normText = rawText;
+
+ // replace all null values
+ normText = normText.replaceAll("\u0000+", "");
+
+ // replace all Tabs and form feeds with spaces
+ normText = normText.replaceAll("[\t\f]", NORM_SP);
+
+ // replace all non breaking spaces with normal spaces
+ normText = normText.replaceAll("\u00a0+", NORM_SP);
+
+ // code all windows line breaks as \n
+ normText = normText.replaceAll("\r\n", "\n");
+
+ // code all mac line breaks as \n
+ normText = normText.replace('\r', '\n');
+
+ if (!keepMultipleLineBreaks)
+ {
+ // reduce all multiple line breaks into two line breaks, code muliple line break as \r\r
+ normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
+ }
+
+ // replace all single line breaks with one line break
+ normText = normText.replace('\n', '\r');
+
+ // normalize spaces
+ normText = normText.replaceAll(" +", NORM_SP);
+
+ // remove spaces before and after a single line break
+ normText = normText.replaceAll(" ?\r ?", "\r");
+
+ if (keepMultipleLineBreaks)
+ {
+ // remove spaces before and after a multiple line breaks
+ normText = normText.replaceAll(" ?\r\r ?", "\r\r");
+ } else
+ {
+ normText = normText.replaceAll(" ?\r\r ?", "\r");
+ }
+
+ // remove leading and trailing space or line break in the string
+ int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
+ int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
+ if (end_idx < start_idx) {
+ end_idx = start_idx;
+ }
+
+ // System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" +
+ // normText_.length());
+ normText = normText.substring(start_idx, end_idx);
+
+ // normalize line breaks
+ normText = normText.replaceAll("\r", NORM_CR);
+
+ // normalize apostrophes
+ normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
+
+ // normalize quotations
+ normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
+
+ // normalize hypens
+ normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
+
+ return normText;
+ }
+
+ /**
+ * Return the version string.
+ *
+ * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
+ */
+ public String getVersion() {
+ return VERSION;
+ }
+
+ /**
+ * Returns the normalizer line separator string.
+ * @return the line separator string
+ */
+ public String getNormCR() {
+ return NORM_CR;
+ }
+} \ No newline at end of file
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java
new file mode 100644
index 0000000..a08c12e
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java
@@ -0,0 +1,280 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: Normalizer.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.Serializable;
+
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.NormalizeException;
+import at.knowcenter.wag.egov.egiz.exceptions.SettingsException;
+
+/**
+ * This class provides wrapper methods to get an access to different normalizer implementations.
+ * <br>
+ * This class is to load the corresponding implementation of a normalizer class. Therefor it seams
+ * to be a factory. The factory settings are read from the configuration file calling the
+ * SettingsReader.
+ *
+ * @author wlackner
+ * @see at.knowcenter.wag.egov.egiz.tools.Normalizer
+ * @see at.knowcenter.wag.egov.egiz.tools.NormalizeV01
+ * @see at.knowcenter.wag.egov.egiz.cfg.SettingsReader
+ */
+public class Normalizer implements Serializable {
+// 04.11.2010 changed by exthex - normalize methods use and propagate the keepMultipleNewlines parameter
+
+ /**
+ * SVUID.
+ */
+ private static final long serialVersionUID = 4201772508393848555L;
+
+ /**
+ * The current raw string to normalize
+ */
+ private String rawString_ = null;
+ /**
+ * The current normalisation version string
+ */
+ private String normVersion_ = null;
+ /**
+ * The normalized string cache
+ */
+ private String normString_ = null;
+ /**
+ * The reference to the normalizer implementation
+ */
+ private Normalize normalize_ = null;
+// /**
+// * A given Encoding, not used now
+// */
+// private String encoding_ = null;
+ /**
+ * The SettingsReader instance
+ */
+ private SettingsReader settings_ = null;
+ /**
+ * The factory class prefix
+ */
+ private final static String CLASS_PREFIX = ".Normalize";
+ /**
+ * The default version string
+ */
+ protected final static String DEFAULT_VERSION = "V01";
+ /**
+ * The settings key defined in the settings file
+ *
+ * @see SettingsReader
+ */
+ protected final static String SETTINGS_VERSION_KEY = "normalizer.version";
+ /**
+ * The logger definition.
+ */
+ private static final Logger logger_ = ConfigLogger.getLogger(Normalizer.class);
+
+ /**
+ * New Normalizer init by the raw string and a normalizer version.
+ *
+ * @param rawString the raw string to normalize
+ * @param normVersion the nomalizer version that should be used
+ * @throws NormalizeException ErrorCode:400
+ */
+ public Normalizer(String rawString, String normVersion) throws NormalizeException {
+ rawString_ = rawString;
+ normVersion_ = normVersion;
+ init();
+ }
+
+ /**
+ * New Normalizer init by the raw string.
+ *
+ * @param rawString the raw string to normalize
+ * @throws NormalizeException ErrorCode:400
+ */
+ public Normalizer(String rawString) throws NormalizeException {
+ rawString_ = rawString;
+ init();
+ }
+
+ /**
+ * The empty constructor.
+ *
+ * @throws NormalizeException ErrorCode:400
+ */
+ public Normalizer() throws NormalizeException {
+ init();
+ }
+
+ /**
+ * Load the factory implementation. This method trys to load the configured normalizer library.
+ *
+ * @throws NormalizeException
+ */
+ public void init() throws NormalizeException {
+ loadSettings();
+ String class_name = this.getClass().getPackage().getName() + getClassName();
+ Class normalize_class = null;
+ try {
+ normalize_class = Class.forName(class_name);
+ } catch (ClassNotFoundException e) {
+ if (logger_.isEnabledFor(Level.FATAL)) {
+ logger_.fatal("Class not found:" + class_name);
+ }
+ throw new NormalizeException("Can not load normalizer library", e);
+ }
+ try {
+ normalize_ = (Normalize) normalize_class.newInstance();
+ } catch (InstantiationException e) {
+ if (logger_.isEnabledFor(Level.FATAL)) {
+ logger_.fatal("Can not instantiate:" + class_name);
+ }
+ throw new NormalizeException("Can not load normalizer library", e);
+ } catch (IllegalAccessException e) {
+ if (logger_.isEnabledFor(Level.FATAL)) {
+ logger_.fatal("Can not access:" + class_name);
+ }
+ throw new NormalizeException("Can not load normalizer library", e);
+ }
+ }
+
+ /**
+ * Returns the underlying normalizer instance.
+ * @author tknall
+ */
+ public Normalize getInstance() {
+ return this.normalize_;
+ }
+
+ /**
+ * Read the class postfix from the configuration file
+ *
+ * @return the full qualified class name
+ */
+ private String getClassName() {
+ if (normVersion_ == null) {
+ normVersion_ = settings_.getSetting(SETTINGS_VERSION_KEY, DEFAULT_VERSION);
+ }
+ return CLASS_PREFIX + normVersion_;
+ }
+
+ /*
+ * public void setEncoding(String encoding) { encoding_ = encoding; }
+ */
+
+ /**
+ * Set the raw string to normalize
+ */
+ public void setRawString(String rawString) {
+ rawString_ = rawString;
+ }
+
+// /**
+// * Return the normalized string. If the chached value does not exist the normalize method from the
+// * current normalizer implementation is called.
+// *
+// * @return the normalized string
+// */
+// public String getNormalizedString() {
+// if (normString_ == null) {
+// normalize();
+// }
+// return normString_;
+// }
+
+ /**
+ * Set a normalizer version. This activity load the new requested normalizer implementation.
+ *
+ * @param normVersion the normalizer version to be use
+ * @throws NormalizeException ErrorCode:400
+ */
+ public void setVersion(String normVersion) throws NormalizeException {
+ normVersion_ = normVersion;
+ init();
+ }
+
+ /**
+ * Return the current version string.
+ *
+ * @return the normaliser version string
+ */
+ public String getVersion() {
+ return normVersion_;
+ }
+
+ /**
+ * Wrapper method. Call the normalizer implementation method.
+ *
+ * @param rawString the raw string to normalize
+ * @param keepMultipleNewlines
+ * @return the normalized string
+ * @see NormalizeV01
+ */
+ public String normalize(String rawString, boolean keepMultipleNewlines) {
+ return normalize_.normalize(rawString, keepMultipleNewlines);
+ }
+
+ /**
+ * Wrapper method. Call the normalizer implementation method. Normalize the current raw string.
+ *
+ * @return the normalized string
+ * @see NormalizeV01
+ */
+ public String normalize(boolean keepMultipleNewlines) {
+ if (normString_ == null) {
+ normString_ = normalize(rawString_, keepMultipleNewlines);
+ }
+ return normString_;
+ }
+
+ /**
+ * Returns the normalizer line separator string.
+ * @return the line separator string
+ */
+ public String getNormCR() {
+ return normalize_.getNormCR();
+ }
+
+ /**
+ * load the class settings
+ *
+ * @throws NormalizeException
+ * @see SettingsReader
+ */
+ private void loadSettings() throws NormalizeException {
+ if (settings_ == null) {
+ try {
+ settings_ = SettingsReader.getInstance();
+ } catch (SettingsException e) {
+ String log_message = "Can not load normalizer settings. Cause:\n" + e.getMessage();
+ logger_.error(log_message, e);
+ throw new NormalizeException(log_message, e);
+ }
+ }
+ }
+} \ No newline at end of file