From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/tools/CodingHelper.java | 301 +++++++++++++++++++++ .../wag/egov/egiz/tools/DebugHelper.java | 90 ++++++ .../knowcenter/wag/egov/egiz/tools/FileHelper.java | 125 +++++++++ .../knowcenter/wag/egov/egiz/tools/Normalize.java | 55 ++++ .../wag/egov/egiz/tools/NormalizeV01.java | 184 +++++++++++++ .../knowcenter/wag/egov/egiz/tools/Normalizer.java | 280 +++++++++++++++++++ 6 files changed, 1035 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java new file mode 100644 index 0000000..5132021 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/CodingHelper.java @@ -0,0 +1,301 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. 
+ * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: CodingHelper.java,v 1.6 2006/10/11 07:52:36 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.tools; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +import org.apache.commons.codec.binary.Base64; + +import at.gv.egiz.pdfas.framework.input.DataSource; +import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; + +/** + * This class provides encoding and decoding methods and other coding methods. + * All methods are static! + * + * @author wlackner + */ +public class CodingHelper +{ + + /** + * Static Base64 object + */ + private static Base64 b64 = new Base64(); + + /** + * This method encodes a given Unicode (Java) String to UTF-8 bytes and then + * encodes these UTF-8 bytes to a Base64 US-ASCII (Java) String. 
+ * + * @param plain_string + * to be encoded + * @return the UTF-8 and Base64 encoded string + */ + public static String encodeUTF8AsBase64(String plain_string) + { + try + { + byte[] utf8_bytes = plain_string.getBytes("UTF-8"); + byte[] base64_bytes = b64.encode(utf8_bytes); + String encoded_string = new String(base64_bytes, "US-ASCII"); + return encoded_string; + } + catch (UnsupportedEncodingException e) + { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + + /** + * This method decodes the UTF-8 bytes from a Base64 US-ASCII (Java) String + * and decodes the UTF-8 bytes to a unicode (Java) String. + * + * @param encoded_string + * to be decoded + * @return the Base64 and UTF-8 decoded string + */ + public static String decodeUTF8FromBase64(String encoded_string) + { + try + { + byte[] base64_bytes = encoded_string.getBytes("US-ASCII"); + byte[] utf8_bytes = b64.decode(base64_bytes); + String plain_string = new String(utf8_bytes, "UTF-8"); + return plain_string; + } + catch (UnsupportedEncodingException e) + { + e.printStackTrace(); + throw new RuntimeException(e); + } + } + + // /** + // * This method encodes a given string UTF-8 + // * + // * @param theString to be encoded + // * @return the UTF-8 encoded string + // */ + // public static byte[] encodeUTF8(String theString) { + // byte[] utf8 = null; + // try { + // utf8 = theString.getBytes("UTF-8"); + // } catch (UnsupportedEncodingException e) { + // e.printStackTrace(); + // } + // return utf8; + // } + + // /** + // * This method decodes a given UTF-8 string + // * + // * @param theString to be decoded + // * @return the decoded UTF-8 string + // */ + // public static String decodeUTF8(String theString) { + // byte[] ba = theString.getBytes(); + // String the_string = decodeUTF8(ba); + // if (the_string != null) { + // return the_string; + // } + // return theString; + // } + + // /** + // * This method decodes a given UTF-8 byte array + // * + // * @param ba the byte array to be decoded + 
// * @return the decoded UTF-8 string + // */ + // public static String decodeUTF8(byte[] ba) { + // String the_string = null; + // try { + // the_string = new String(ba, "UTF-8"); + // } catch (UnsupportedEncodingException e) { + // e.printStackTrace(); + // } + // return the_string; + // } + + /** + * This method decodes a given Base64 string. + * + *

+ * Note that the given String must only contain Base64 characters. (The string + * will be converted to a byte array of "US-ASCII" (7 bit) bytes and then this + * byte array will be decoded using the Base64 algorithm.) + *

+ * + * @param theString + * to be decoded + * @return a Base64 decoded byte array + */ + public static byte[] decodeBase64(String theString) + { + try + { + byte[] base64_bytes = theString.getBytes("US-ASCII"); + return b64.decode(base64_bytes); + } + catch (UnsupportedEncodingException e) + { + e.printStackTrace(); + throw new RuntimeException("Very Strange: US-ASCII encoding not supported???", e); + } + } + + /** + * This method decodes a given Base64 byte array + * + * @param ba + * the byte array to be decoded + * @return a Base64 decoded byte array + */ + public static byte[] decodeBase64(byte[] ba) + { + return b64.decode(ba); + } + + /** + * This method encodes a given byte array Base64 + * + * @param plainString + * the byte array to be encoded + * @return the Base64 encoded string + */ + public static String encodeBase64(byte[] plainString) + { + try + { + byte[] base64_bytes = b64.encode(plainString); + return new String(base64_bytes, "US-ASCII"); + } + catch (UnsupportedEncodingException e) + { + e.printStackTrace(); + throw new RuntimeException("Very Strange: US-ASCII encoding not supported???", e); + } + } + + // dferbas + /** + * This method builds an hash value of a given byte array. + * + * @param data + * the byte array to build the hash value for + * @param hashAlg hash algorithm for {@link MessageDigest} e.g. "SHA-1" + * @return the calculated hash value as a byte array + * @see MessageDigest + */ + public static byte[] buildDigest(byte[] data, String hashAlg) + { + MessageDigest digester = null; + try + { + digester = MessageDigest.getInstance(hashAlg); + digester.update(data); + return digester.digest(); + } + catch (NoSuchAlgorithmException e) + { + throw new RuntimeException(e); + } + } + + // dferbas + /** + * This method builds an hash value of a given byte array. + * @param input + * @param hashAlg hashAlg hash algorithm for {@link MessageDigest} e.g. 
"SHA-1"
+   * @return the calculated hash value as a byte array
+   * @throws IOException
+   */
+  public static byte[] buildDigest(DataSource input, String hashAlg) throws IOException
+  {
+    // PERF: digesting needs data source.
+    byte [] data = DataSourceHelper.convertDataSourceToByteArray(input);
+    return buildDigest(data, hashAlg);
+  }
+
+  /**
+   * This method escapes a given string with HTML entities. The ampersand,
+   * the less-than sign, the greater-than sign and the double quote are
+   * replaced by their character entity references (amp, lt, gt, quot).
+   *
+   * @param rawString
+   *          the string to be escaped, must not be null
+   * @return the HTML escaped string
+   */
+  public static String htmlEscape(String rawString)
+  {
+    // The ampersand has to be escaped first; otherwise the ampersands
+    // introduced by the following replacements would be escaped again.
+    rawString = rawString.replaceAll("\\&", "&amp;");
+    rawString = rawString.replaceAll("\\<", "&lt;");
+    rawString = rawString.replaceAll("\\>", "&gt;");
+    // Match the bare quote character. The former pattern "\">" required a
+    // following '>' which can no longer occur at this point, so quotes were
+    // never escaped.
+    rawString = rawString.replaceAll("\"", "&quot;");
+    return rawString;
+  }
+
+  /**
+   * This method checks, if a byte array contains chars that are not base64
+   * conform.
+   *
+   * @param byteArray
+   *          the array to test
+   * @return true, if the byte array is base64 conform, false otherwise
+   */
+  public static boolean isB64(byte[] byteArray)
+  {
+    try
+    {
+      return Base64.isArrayByteBase64(byteArray);
+    }
+    catch (ArrayIndexOutOfBoundsException e)
+    {
+      // Treated as "not Base64" rather than propagated to the caller.
+      return false;
+    }
+  }
+
+  /**
+   * This method checks, if a string contains chars that are not base64 conform.
+ * + * @param string + * the chars to test + * @return boolean, if the given string is base64 conform, false otherwise + */ + public static boolean isB64(String string) + { + try + { + return Base64.isArrayByteBase64(string.getBytes()); + } + catch (ArrayIndexOutOfBoundsException e) + { + return false; + } + } +} \ No newline at end of file diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java new file mode 100644 index 0000000..762cb71 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/DebugHelper.java @@ -0,0 +1,90 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. 
+ */ +package at.knowcenter.wag.egov.egiz.tools; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStreamWriter; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import at.knowcenter.wag.egov.egiz.sig.connectors.moa.EnvelopingBase64MOAConnector; + +/** + * Contains useful helper methods for debugging. + * + * @author wprinz + */ +public final class DebugHelper +{ + /** + * The log. + */ + private static Log log = LogFactory.getLog(DebugHelper.class); + + /** + * Tells, if Strings should be debugged to a file. + * + *

+ * If set to false, the corresponding methods will simply do nothing. + *

+   */
+  public static boolean debug_to_file = true;
+
+  /**
+   * The directory under which the debug files are to be stored.
+   */
+  // NOTE(review): this is a developer-specific absolute Windows path and
+  // debug_to_file defaults to true, so on any other machine every call to
+  // debugStringToFile presumably ends in the error log - confirm that callers
+  // set both fields before use.
+  public static File debug_dir = new File("C:\\wprinz\\Filer\\egiz2"); //$NON-NLS-1$
+
+  /**
+   * Writes the given String to a file with the given file name in UTF-8
+   * encoding. Failures are logged and never propagated to the caller.
+   *
+   * @param str
+   *          The String to be written.
+   * @param file_name
+   *          The file the String will be written to, resolved against
+   *          {@link #debug_dir}.
+   */
+  public static void debugStringToFile(String str, String file_name)
+  {
+    if (!debug_to_file)
+    {
+      return;
+    }
+
+    FileOutputStream fos = null;
+    try
+    {
+      File file = new File(debug_dir, file_name);
+      fos = new FileOutputStream(file);
+      OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8"); //$NON-NLS-1$
+      osw.write(str);
+      // Push the encoded characters into fos; the stream itself is closed
+      // in the finally block so that it is released on every path.
+      osw.flush();
+    }
+    catch (Exception e)
+    {
+      log.error(e.getMessage(), e);
+    }
+    finally
+    {
+      if (fos != null)
+      {
+        try
+        {
+          fos.close();
+        }
+        catch (Exception e)
+        {
+          log.error(e.getMessage(), e);
+        }
+      }
+    }
+  }
+}
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java
new file mode 100644
index 0000000..17b98d7
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/FileHelper.java
@@ -0,0 +1,125 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: FileHelper.java,v 1.2 2006/05/15 12:05:21 wlackner Exp $ + */ +package at.knowcenter.wag.egov.egiz.tools; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.apache.log4j.Logger; + +import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; + +/** + * This class provides file reader and writer methods. All methods are static! + * + * @author wlackner + */ +public class FileHelper { + /** + * The logger definition. + */ + private static final Logger logger_ = ConfigLogger.getLogger(FileHelper.class); + + /** + * This method reads a file by reading line by line. 
+   *
+   * @param fileName the file to be read
+   * @return the content of the file decoded as UTF-8 with all line terminators
+   *         removed, or null if the file could not be opened
+   */
+  public static String readFromFile(String fileName) {
+    StringBuffer content = null;
+    BufferedReader reader = null;
+    logger_.trace("Looking for file: " + fileName);
+    try {
+      reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
+      logger_.trace("Reading file: " + fileName);
+      content = new StringBuffer();
+      appendLines(reader, content);
+      reader.close();
+      reader = null;
+      logger_.debug("File successfully read: " + fileName);
+    } catch (FileNotFoundException e) {
+      logger_.debug("File not found: " + fileName);
+    } catch (IOException e) {
+      // Whatever was read before the failure is still returned.
+      logger_.debug("Error reading file: " + fileName);
+    } finally {
+      // Only non-null when an error occurred before the regular close above.
+      closeQuietly(reader);
+    }
+    return content == null ? null : content.toString();
+  }
+
+  /**
+   * This method reads an input stream line by line, decoding it as UTF-8.
+   * The stream is closed before this method returns.
+   *
+   * @param inputStream the stream to be read, may be null
+   * @return the content of the stream with all line terminators removed, or
+   *         null if the given stream is null
+   */
+  public static String readFromInputStream(InputStream inputStream) {
+    if (inputStream == null) {
+      return null;
+    }
+    StringBuffer content = null;
+    BufferedReader reader = null;
+    try {
+      reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
+      content = new StringBuffer();
+      appendLines(reader, content);
+      reader.close();
+      reader = null;
+    } catch (IOException e) {
+      // Whatever was read before the failure is still returned.
+      logger_.debug("Error reading inputstream.");
+    } finally {
+      // Only non-null when an error occurred before the regular close above.
+      closeQuietly(reader);
+    }
+    return content == null ? null : content.toString();
+  }
+
+  /**
+   * Appends every line of the reader to the target buffer. The line
+   * terminators are dropped and no separator is inserted between lines.
+   */
+  private static void appendLines(BufferedReader reader, StringBuffer target) throws IOException {
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      target.append(line);
+    }
+  }
+
+  /**
+   * Closes the reader if it is not null, ignoring any failure of the close.
+   */
+  private static void closeQuietly(BufferedReader reader) {
+    if (reader == null) {
+      return;
+    }
+    try {
+      reader.close();
+    } catch (IOException ignored) {
+      // The primary error has already been logged by the caller.
+    }
+  }
+
+  /**
+   * This method writes the given string to a file, replacing any existing
+   * content of that file.
+   *
+   * NOTE(review): FileWriter encodes with the platform default charset while
+   * the read methods of this class decode UTF-8 - confirm whether files
+   * written here are expected to be read back with this class.
+   *
+   * @param fileName the file to be written
+   * @param fileString the content to be written
+   * @return true if the file could be written successfully, false otherwise
+   */
+  public static boolean writeToFile(String fileName, String fileString) {
+    BufferedWriter writer = null;
+    try {
+      writer = new BufferedWriter(new FileWriter(fileName));
+      writer.write(fileString);
+      // close() flushes the buffer, so a failure here is a write failure.
+      writer.close();
+      writer = null;
+    } catch (IOException e) {
+      logger_.info("File:" + fileName + " can not be written. Cause:" + e.getMessage());
+      return false;
+    } finally {
+      // Only non-null when the regular close above was not reached or failed.
+      if (writer != null) {
+        try {
+          writer.close();
+        } catch (IOException ignored) {
+          // The primary error has already been reported.
+        }
+      }
+    }
+    return true;
+  }
+}
\ No newline at end of file
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java
new file mode 100644
index 0000000..2b0b8c2
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalize.java
@@ -0,0 +1,55 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: Normalize.java,v 1.2 2006/05/15 12:05:21 wlackner Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+
+/**
+ * Defines an interface to get access to different normalizer implementations.
+ *
+ * @author wlackner
+ */
+public interface Normalize {
+
+  /**
+   * Normalize a given text.
+ * @param rawText the raw text to normalize + * @param keepMultipleLineBreaks if true multiple line breaks in a row will not be normalized to a single line break + * @return the normalized string + */ + public String normalize(String rawText, boolean keepMultipleLineBreaks); + /** + * Return the current normalizer version string. + * @return the version string + */ + public String getVersion(); + + /** + * Returns the normalizer line separator string. + * @return the line separator string + */ + public String getNormCR(); + +} diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java new file mode 100644 index 0000000..57b8e6f --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java @@ -0,0 +1,184 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. 
Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.tools; + +import java.io.Serializable; + +/** + * This ist the first version implementing a normalizer method. The normalize statements are + * performed by using regular expressions. + * + * @author wlackner + */ +public class NormalizeV01 implements Normalize, Serializable { +// 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method +// to allow multiple line breaks to not be normalized to a single one + /** + * SVUID. + */ + private static final long serialVersionUID = 2302956630639871601L; + + /** + * The space string + */ + private final static String NORM_SP = " "; //\u0020 + /** + * The line break string --> use only \n because XML-Parser ignores \r\n + */ + private final static String NORM_CR = "\n"; // + /** + * The apostrophe string + */ + private final static String NORM_AP = "'"; //\u0027 + /** + * The quotation mark string + */ + private final static String NORM_QU = "\""; //\u0022 + /** + * The hypens string + */ + private final static String NORM_HY = "-"; //\u002D + /** + * The current version string + */ + protected static final String VERSION = "V01"; + + /** + * The empty constructor. + */ + public NormalizeV01() { + } + + /** + * The normalizer implementation.
+   * Normalizer algorithms:
+   * <ol>
+   * <li>code all multiple line breaks as \n\n</li>
+   * <li>replace all Tabs and form feeds with spaces</li>
+   * <li>code line breaks as \n</li>
+   * <li>reduce all multiple line breaks into one line break (only if keepMultipleLineBreaks == false), code line break as \r</li>
+   * <li>replace all single line breaks with space</li>
+   * <li>normalize spaces</li>
+   * <li>remove spaces before and after a line break</li>
+   * <li>remove leading and trailing space or line break in the string</li>
+   * <li>normalize line breaks</li>
+   * <li>normalize apostrophes</li>
+   * <li>normalize quotations</li>
+   * <li>normalize hyphens</li>
+   * </ol>
+   *
+   * @param rawText the text to normalize
+   * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized to a single line break
+   * @return the normalized text; the empty string if rawText is null, empty,
+   *         equal to the literal "null" or consists of null characters only
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String, boolean)
+   */
+  public String normalize(String rawText, boolean keepMultipleLineBreaks) {
+    if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
+      return "";
+    }
+    String normText = rawText;
+
+    // replace all null values
+    normText = normText.replaceAll("\u0000+", "");
+
+    // replace all Tabs and form feeds with spaces
+    normText = normText.replaceAll("[\t\f]", NORM_SP);
+
+    // replace all non breaking spaces with normal spaces
+    normText = normText.replaceAll("\u00a0+", NORM_SP);
+
+    // code all windows line breaks as \n
+    normText = normText.replaceAll("\r\n", "\n");
+
+    // code all mac line breaks as \n
+    normText = normText.replace('\r', '\n');
+
+    if (!keepMultipleLineBreaks)
+    {
+      // reduce all multiple line breaks into two line breaks, code multiple line break as \r\r
+      normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
+    }
+
+    // replace all single line breaks with one line break
+    normText = normText.replace('\n', '\r');
+
+    // normalize spaces
+    normText = normText.replaceAll(" +", NORM_SP);
+
+    // remove spaces before and after a single line break
+    normText = normText.replaceAll(" ?\r ?", "\r");
+
+    if (keepMultipleLineBreaks)
+    {
+      // remove spaces before and after a multiple line breaks
+      normText = normText.replaceAll(" ?\r\r ?", "\r\r");
+    } else
+    {
+      normText = normText.replaceAll(" ?\r\r ?", "\r");
+    }
+
+    // Removing the null characters above can leave nothing behind (e.g. for
+    // the input "\u0000"); charAt(0) below would then throw a
+    // StringIndexOutOfBoundsException.
+    if (normText.length() == 0) {
+      return "";
+    }
+
+    // remove leading and trailing space or line break in the string
+    // (at most one character is removed on either side)
+    int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
+    int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
+    // a single space or line break would otherwise yield end_idx < start_idx
+    if (end_idx < start_idx) {
+      end_idx = start_idx;
+    }
+
+    // System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" +
+    // normText_.length());
+    normText = normText.substring(start_idx, end_idx);
+
+    // normalize line breaks
+    normText = normText.replaceAll("\r", NORM_CR);
+
+    // normalize apostrophes
+    normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
+
+    // normalize quotations
+    normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
+
+    // normalize hyphens
+    normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
+
+    return normText;
+  }
+
+  /**
+   * Return the version string.
+   *
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
+   */
+  public String getVersion() {
+    return VERSION;
+  }
+
+  /**
+   * Returns the normalizer line separator string.
+   * @return the line separator string
+   */
+  public String getNormCR() {
+    return NORM_CR;
+  }
+}
\ No newline at end of file
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java
new file mode 100644
index 0000000..a08c12e
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/Normalizer.java
@@ -0,0 +1,280 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: Normalizer.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.tools; + +import java.io.Serializable; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; + +import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger; +import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; +import at.knowcenter.wag.egov.egiz.exceptions.NormalizeException; +import at.knowcenter.wag.egov.egiz.exceptions.SettingsException; + +/** + * This class provides wrapper methods to get an access to different normalizer implementations. + *
+ * This class is to load the corresponding implementation of a normalizer class. Therefor it seams + * to be a factory. The factory settings are read from the configuration file calling the + * SettingsReader. + * + * @author wlackner + * @see at.knowcenter.wag.egov.egiz.tools.Normalizer + * @see at.knowcenter.wag.egov.egiz.tools.NormalizeV01 + * @see at.knowcenter.wag.egov.egiz.cfg.SettingsReader + */ +public class Normalizer implements Serializable { +// 04.11.2010 changed by exthex - normalize methods use and propagate the keepMultipleNewlines parameter + + /** + * SVUID. + */ + private static final long serialVersionUID = 4201772508393848555L; + + /** + * The current raw string to normalize + */ + private String rawString_ = null; + /** + * The current normalisation version string + */ + private String normVersion_ = null; + /** + * The normalized string cache + */ + private String normString_ = null; + /** + * The reference to the normalizer implementation + */ + private Normalize normalize_ = null; +// /** +// * A given Encoding, not used now +// */ +// private String encoding_ = null; + /** + * The SettingsReader instance + */ + private SettingsReader settings_ = null; + /** + * The factory class prefix + */ + private final static String CLASS_PREFIX = ".Normalize"; + /** + * The default version string + */ + protected final static String DEFAULT_VERSION = "V01"; + /** + * The settings key defined in the settings file + * + * @see SettingsReader + */ + protected final static String SETTINGS_VERSION_KEY = "normalizer.version"; + /** + * The logger definition. + */ + private static final Logger logger_ = ConfigLogger.getLogger(Normalizer.class); + + /** + * New Normalizer init by the raw string and a normalizer version. 
+ * + * @param rawString the raw string to normalize + * @param normVersion the nomalizer version that should be used + * @throws NormalizeException ErrorCode:400 + */ + public Normalizer(String rawString, String normVersion) throws NormalizeException { + rawString_ = rawString; + normVersion_ = normVersion; + init(); + } + + /** + * New Normalizer init by the raw string. + * + * @param rawString the raw string to normalize + * @throws NormalizeException ErrorCode:400 + */ + public Normalizer(String rawString) throws NormalizeException { + rawString_ = rawString; + init(); + } + + /** + * The empty constructor. + * + * @throws NormalizeException ErrorCode:400 + */ + public Normalizer() throws NormalizeException { + init(); + } + + /** + * Load the factory implementation. This method trys to load the configured normalizer library. + * + * @throws NormalizeException + */ + public void init() throws NormalizeException { + loadSettings(); + String class_name = this.getClass().getPackage().getName() + getClassName(); + Class normalize_class = null; + try { + normalize_class = Class.forName(class_name); + } catch (ClassNotFoundException e) { + if (logger_.isEnabledFor(Level.FATAL)) { + logger_.fatal("Class not found:" + class_name); + } + throw new NormalizeException("Can not load normalizer library", e); + } + try { + normalize_ = (Normalize) normalize_class.newInstance(); + } catch (InstantiationException e) { + if (logger_.isEnabledFor(Level.FATAL)) { + logger_.fatal("Can not instantiate:" + class_name); + } + throw new NormalizeException("Can not load normalizer library", e); + } catch (IllegalAccessException e) { + if (logger_.isEnabledFor(Level.FATAL)) { + logger_.fatal("Can not access:" + class_name); + } + throw new NormalizeException("Can not load normalizer library", e); + } + } + + /** + * Returns the underlying normalizer instance. 
+   * @author tknall
+   */
+  public Normalize getInstance() {
+    return this.normalize_;
+  }
+
+  /**
+   * Builds the class name suffix of the normalizer implementation. If no
+   * version has been set, it is read from the settings (falling back to the
+   * default version) and remembered.
+   *
+   * @return the class prefix followed by the version string; this is the
+   *         suffix appended to the package name, not a fully qualified name
+   */
+  private String getClassName() {
+    if (normVersion_ == null) {
+      normVersion_ = settings_.getSetting(SETTINGS_VERSION_KEY, DEFAULT_VERSION);
+    }
+    return CLASS_PREFIX + normVersion_;
+  }
+
+  /*
+   * public void setEncoding(String encoding) { encoding_ = encoding; }
+   */
+
+  /**
+   * Set the raw string to normalize. The cached normalized string is
+   * discarded so that the next call of normalize(boolean) works on the new
+   * raw string instead of returning the result for the previous one.
+   *
+   * @param rawString the raw string to normalize
+   */
+  public void setRawString(String rawString) {
+    rawString_ = rawString;
+    normString_ = null;
+  }
+
+//  /**
+//   * Return the normalized string. If the chached value does not exist the normalize method from the
+//   * current normalizer implementation is called.
+//   *
+//   * @return the normalized string
+//   */
+//  public String getNormalizedString() {
+//    if (normString_ == null) {
+//      normalize();
+//    }
+//    return normString_;
+//  }
+
+  /**
+   * Set a normalizer version. This loads the requested normalizer
+   * implementation and discards the cached normalized string, which was
+   * produced by the previous implementation.
+   *
+   * @param normVersion the normalizer version to be used
+   * @throws NormalizeException ErrorCode:400
+   */
+  public void setVersion(String normVersion) throws NormalizeException {
+    normVersion_ = normVersion;
+    init();
+    // Reset only after init() succeeded; on failure the previous
+    // implementation stays in place and its cached result stays valid.
+    normString_ = null;
+  }
+
+  /**
+   * Return the current version string.
+   *
+   * @return the normaliser version string
+   */
+  public String getVersion() {
+    return normVersion_;
+  }
+
+  /**
+   * Wrapper method. Call the normalizer implementation method.
+   *
+   * @param rawString the raw string to normalize
+   * @param keepMultipleNewlines passed through unchanged to the normalizer implementation
+   * @return the normalized string
+   * @see NormalizeV01
+   */
+  public String normalize(String rawString, boolean keepMultipleNewlines) {
+    return normalize_.normalize(rawString, keepMultipleNewlines);
+  }
+
+  /**
+   * Wrapper method. Call the normalizer implementation method. Normalize the current raw string.
+ * + * @return the normalized string + * @see NormalizeV01 + */ + public String normalize(boolean keepMultipleNewlines) { + if (normString_ == null) { + normString_ = normalize(rawString_, keepMultipleNewlines); + } + return normString_; + } + + /** + * Returns the normalizer line separator string. + * @return the line separator string + */ + public String getNormCR() { + return normalize_.getNormCR(); + } + + /** + * load the class settings + * + * @throws NormalizeException + * @see SettingsReader + */ + private void loadSettings() throws NormalizeException { + if (settings_ == null) { + try { + settings_ = SettingsReader.getInstance(); + } catch (SettingsException e) { + String log_message = "Can not load normalizer settings. Cause:\n" + e.getMessage(); + logger_.error(log_message, e); + throw new NormalizeException(log_message, e); + } + } + } +} \ No newline at end of file -- cgit v1.2.3