1 files changed, 184 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
new file mode 100644
index 0000000..57b8e6f
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
@@ -0,0 +1,184 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.Serializable;
+
+/**
+ * This is the first version implementing a normalizer method. The normalize statements are
+ * performed by using regular expressions.
+ *
+ * @author wlackner
+ */
+public class NormalizeV01 implements Normalize, Serializable {
+// 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method
+// to allow multiple line breaks to not be normalized to a single one
+  /**
+   * SVUID.
+   */
+  private static final long serialVersionUID = 2302956630639871601L;
+
+  /**
+   * The space string
+   */
+  private final static String NORM_SP = " "; //\u0020
+  /**
+   * The line break string --> use only \n because XML-Parser ignores \r\n
+   */
+  private final static String NORM_CR = "\n"; //
+  /**
+   * The apostrophe string
+   */
+  private final static String NORM_AP = "'"; //\u0027
+  /**
+   * The quotation mark string
+   */
+  private final static String NORM_QU = "\""; //\u0022
+  /**
+   * The hyphen string
+   */
+  private final static String NORM_HY = "-"; //\u002D
+  /**
+   * The current version string
+   */
+  protected static final String VERSION = "V01";
+
+  /**
+   * The empty constructor.
+   */
+  public NormalizeV01() {
+  }
+
+  /**
+   * The normalizer implementation. <br>
+   * Normalizer algorithm, in the order the steps are applied:
+   * <ol>
+   * <li>remove NUL characters (U+0000); replace tabs, form feeds and non breaking spaces with spaces</li>
+   * <li>code windows (\r\n) and mac (\r) line breaks as \n</li>
+   * <li>only if keepMultipleLineBreaks == false: code every run of line breaks (with optional
+   * whitespace in between) as \r\r</li>
+   * <li>code all remaining line breaks as \r and reduce multiple spaces to one space</li>
+   * <li>remove one space before and after each line break</li>
+   * <li>only if keepMultipleLineBreaks == false: reduce each \r\r to a single \r</li>
+   * <li>remove at most one leading and one trailing space or line break character</li>
+   * <li>code line breaks as \n and normalize apostrophes, quotations and hyphens</li>
+   * </ol>
+   * A rawText that is null, empty, equal to the string "null" or made up of NUL characters only
+   * yields the empty string.
+   *
+   * @param rawText the text to normalize
+   * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized to a single line break
+   * @return the normalized text, never null
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
+   */
+  public String normalize(String rawText, boolean keepMultipleLineBreaks) {
+    if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
+      return "";
+    }
+    String normText = rawText;
+
+    // remove all NUL characters
+    normText = normText.replaceAll("\u0000+", "");
+
+    // replace all Tabs and form feeds with spaces
+    normText = normText.replaceAll("[\t\f]", NORM_SP);
+
+    // replace all non breaking spaces with normal spaces
+    normText = normText.replaceAll("\u00a0+", NORM_SP);
+
+    // code all windows line breaks as \n
+    normText = normText.replaceAll("\r\n", "\n");
+
+    // code all mac line breaks as \n
+    normText = normText.replace('\r', '\n');
+
+    if (!keepMultipleLineBreaks) {
+      // code every run of multiple line breaks as \r\r (reduced to one line break below)
+      normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
+    }
+
+    // code all remaining line breaks as \r
+    normText = normText.replace('\n', '\r');
+
+    // normalize spaces
+    normText = normText.replaceAll(" +", NORM_SP);
+
+    // remove spaces before and after a single line break
+    normText = normText.replaceAll(" ?\r ?", "\r");
+
+    if (keepMultipleLineBreaks) {
+      // remove spaces before and after multiple line breaks
+      normText = normText.replaceAll(" ?\r\r ?", "\r\r");
+    } else {
+      // reduce the \r\r coded above to one line break
+      normText = normText.replaceAll(" ?\r\r ?", "\r");
+    }
+
+    // input made up of NUL characters only leaves nothing; charAt(0) below would throw
+    if (normText.length() == 0) {
+      return "";
+    }
+
+    // remove leading and trailing space or line break in the string (one character each)
+    int startIdx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
+    int endIdx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
+    if (endIdx < startIdx) {
+      endIdx = startIdx;
+    }
+    normText = normText.substring(startIdx, endIdx);
+
+    // normalize line breaks
+    normText = normText.replaceAll("\r", NORM_CR);
+
+    // normalize apostrophes
+    normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
+
+    // normalize quotations
+    normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
+
+    // normalize hyphens
+    normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
+
+    return normText;
+  }
+
+  /**
+   * Return the version string.
+   * @return the version string of this normalizer
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
+   */
+  public String getVersion() {
+    return VERSION;
+  }
+
+  /**
+   * Returns the normalizer line separator string.
+   * @return the line separator string
+   */
+  public String getNormCR() {
+    return NORM_CR;
+  }
+}
+\ No newline at end of file