1 files changed, 166 insertions, 0 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
new file mode 100644
index 0000000..d3af9b5
--- /dev/null
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
@@ -0,0 +1,166 @@
+/*
+ * <copyright>
+ *  Copyright (c) 2006 by Know-Center, Graz, Austria
+ * </copyright>
+ *
+ *  This software is the confidential and proprietary information of Know-Center,
+ *  Graz, Austria. You shall not disclose such Confidential Information and shall 
+ *  use it only in accordance with the terms of the license agreement you entered 
+ *  into with Know-Center.
+ *
+ *  KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
+ *  SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, 
+ *  OR NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES
+ *  SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
+ *  THIS SOFTWARE OR ITS DERIVATIVES.
+ * 
+ * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.Serializable;
+
+/**
+ * This is the first version implementing a normalizer method. The normalize statements are
+ * performed by using regular expressions.
+ *
+ * @author wlackner
+ */
+public class NormalizeV01 implements Normalize, Serializable {
+
+  /**
+   * Serial version UID.
+   */
+  private static final long serialVersionUID = 2302956630639871601L;
+
+  /**
+   * The space string
+   */
+  private final static String NORM_SP = " "; //\u0020
+  /**
+   * The line break string --> use only \n because XML-Parser ignores \r\n
+   */
+  private final static String NORM_CR = "\n"; //
+  /**
+   * The apostrophe string
+   */
+  private final static String NORM_AP = "'"; //\u0027
+  /**
+   * The quotation mark string
+   */
+  private final static String NORM_QU = "\""; //\u0022
+  /**
+   * The hyphen string
+   */
+  private final static String NORM_HY = "-"; //\u002D
+  /**
+   * The current version string
+   */
+  protected static final String VERSION = "V01";
+
+  /**
+   * The empty constructor.
+   */
+  public NormalizeV01() {
+  }
+
+  /**
+   * Normalizes the given text. A <code>null</code> argument, the literal string "null" and
+   * the empty string all yield the empty string. Otherwise these steps run in order:
+   * <ol>
+   * <li>remove all NUL characters (the empty string is returned if nothing is left)</li>
+   * <li>replace every tab and form feed with a space</li>
+   * <li>replace every run of non-breaking spaces with one space</li>
+   * <li>unify Windows (\r\n) and Mac (\r) line breaks to \n</li>
+   * <li>code line breaks internally as \r; breaks separated only by whitespace as \r\r</li>
+   * <li>collapse every run of spaces into one space</li>
+   * <li>remove one space before and after each line break</li>
+   * <li>reduce each \r\r to a single line break</li>
+   * <li>remove one leading and one trailing space or line break</li>
+   * <li>emit line breaks as \n and normalize apostrophes, quotation marks and hyphens</li>
+   * </ol>
+   *
+   * @param rawText the text to normalize, may be <code>null</code>
+   * @return the normalized text, never <code>null</code>
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
+   */
+  public String normalize(String rawText) {
+    if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
+      return "";
+    }
+
+    // remove all NUL characters; stop if nothing is left, because the trimming
+    // step below reads the first and last character and would throw on ""
+    String normText = rawText.replaceAll("\u0000+", "");
+    if (normText.length() == 0) {
+      return "";
+    }
+
+    // replace all tabs and form feeds with spaces
+    normText = normText.replaceAll("[\t\f]", NORM_SP);
+
+    // replace each run of non breaking spaces with one normal space
+    normText = normText.replaceAll("\u00a0+", NORM_SP);
+
+    // code all windows line breaks as \n
+    normText = normText.replaceAll("\r\n", "\n");
+
+    // code all mac line breaks as \n
+    normText = normText.replace('\r', '\n');
+
+    // code line breaks that are separated only by whitespace as \r\r
+    normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
+
+    // code all remaining single line breaks as \r
+    normText = normText.replace('\n', '\r');
+
+    // normalize spaces
+    normText = normText.replaceAll(" +", NORM_SP);
+
+    // remove spaces before and after a single line break
+    normText = normText.replaceAll(" ?\r ?", "\r");
+
+    // reduce each multiple line break (\r\r) to one line break; paragraph breaks are not kept
+    normText = normText.replaceAll(" ?\r\r ?", "\r");
+
+    // remove one leading and one trailing space or line break; end_idx is clamped for 1-char text
+    int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
+    int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
+    if (end_idx < start_idx) {
+      end_idx = start_idx;
+    }
+    normText = normText.substring(start_idx, end_idx);
+
+    // normalize line breaks
+    normText = normText.replaceAll("\r", NORM_CR);
+
+    // normalize apostrophes
+    normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
+
+    // normalize quotations
+    normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
+
+    // normalize hyphens
+    normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
+
+    return normText;
+  }
+
+  /**
+   * Returns the version string.
+   * @return the version string of this normalizer
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
+   */
+  public String getVersion() {
+    return VERSION;
+  }
+
+  /**
+   * Returns the normalizer line separator string.
+   * @return the line separator string
+   */
+  public String getNormCR() {
+    return NORM_CR;
+  }
+}
+\ No newline at end of file