aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java166
1 files changed, 166 insertions, 0 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
new file mode 100644
index 0000000..d3af9b5
--- /dev/null
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
@@ -0,0 +1,166 @@
+/*
+ * <copyright>
+ * Copyright (c) 2006 by Know-Center, Graz, Austria
+ * </copyright>
+ *
+ * This software is the confidential and proprietary information of Know-Center,
+ * Graz, Austria. You shall not disclose such Confidential Information and shall
+ * use it only in accordance with the terms of the license agreement you entered
+ * into with Know-Center.
+ *
+ * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
+ * SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,
+ * OR NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES
+ * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
+ * THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.Serializable;
+
+/**
+ * First version of the text normalizer. The normalization steps are performed with
+ * regular expressions and plain character replacements.
+ *
+ * @author wlackner
+ */
+public class NormalizeV01 implements Normalize, Serializable {
+
+  /**
+   * SVUID.
+   */
+  private static final long serialVersionUID = 2302956630639871601L;
+
+  /**
+   * The space string
+   */
+  private static final String NORM_SP = " "; //\u0020
+  /**
+   * The line break string --> use only \n because XML-Parser ignores \r\n
+   */
+  private static final String NORM_CR = "\n"; //
+  /**
+   * The apostrophe string
+   */
+  private static final String NORM_AP = "'"; //\u0027
+  /**
+   * The quotation mark string
+   */
+  private static final String NORM_QU = "\""; //\u0022
+  /**
+   * The hyphen string
+   */
+  private static final String NORM_HY = "-"; //\u002D
+  /**
+   * The current version string
+   */
+  protected static final String VERSION = "V01";
+
+  /**
+   * The empty constructor.
+   */
+  public NormalizeV01() {
+  }
+
+  /**
+   * Normalizes the given text. Returns the empty string for {@code null}, the literal text
+   * "null", empty input and input consisting solely of NUL characters. Otherwise, in order:
+   * <ol>
+   * <li>remove NUL characters</li>
+   * <li>replace every tab and form feed, and every run of non-breaking spaces, with a space</li>
+   * <li>code Windows (\r\n) and Mac (\r) line breaks as \n</li>
+   * <li>code a line break followed by optional whitespace and another line break as \r\r</li>
+   * <li>code every remaining line break as \r</li>
+   * <li>reduce runs of spaces to one space</li>
+   * <li>remove one space before and after each \r, then replace each \r\r with a single \r</li>
+   * <li>remove at most one leading and one trailing space or \r</li>
+   * <li>code every \r as \n</li>
+   * <li>normalize apostrophes, quotation marks and hyphens</li>
+   * </ol>
+   *
+   * @param rawText the text to normalize, may be {@code null}
+   * @return the normalized text, never {@code null}
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
+   */
+  public String normalize(String rawText) {
+    if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
+      return "";
+    }
+    String normText = rawText;
+
+    // remove all NUL characters
+    normText = normText.replaceAll("\u0000+", "");
+    // only NUL characters were given: nothing is left and charAt(0) below would throw
+    if (normText.length() == 0) {
+      return "";
+    }
+
+    // replace all tabs and form feeds with spaces
+    normText = normText.replaceAll("[\t\f]", NORM_SP);
+
+    // replace each run of non-breaking spaces with one normal space
+    normText = normText.replaceAll("\u00a0+", NORM_SP);
+
+    // code all Windows (\r\n) and Mac (\r) line breaks as \n
+    normText = normText.replaceAll("\r\n", "\n");
+    normText = normText.replace('\r', '\n');
+
+    // code multiple line breaks (with any whitespace in between) as \r\r
+    normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
+
+    // code all remaining single line breaks as \r
+    normText = normText.replace('\n', '\r');
+
+    // normalize spaces
+    normText = normText.replaceAll(" +", NORM_SP);
+
+    // remove spaces before and after a single line break
+    normText = normText.replaceAll(" ?\r ?", "\r");
+
+    // remove spaces before and after multiple line breaks and reduce them to one line break
+    normText = normText.replaceAll(" ?\r\r ?", "\r");
+
+    // remove at most one leading and one trailing space or line break
+    int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
+    int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
+    // a text that is a single space or line break matches both tests; keep the range valid
+    if (end_idx < start_idx) {
+      end_idx = start_idx;
+    }
+    normText = normText.substring(start_idx, end_idx);
+
+    // normalize line breaks
+    normText = normText.replaceAll("\r", NORM_CR);
+
+    // normalize apostrophes
+    normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
+
+    // normalize quotations
+    normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
+
+    // normalize hyphens
+    normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
+
+    return normText;
+  }
+
+  /**
+   * Returns the version string.
+   * @return the version string of this normalizer
+   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
+   */
+  public String getVersion() {
+    return VERSION;
+  }
+
+  /**
+   * Returns the normalizer line separator string.
+   * @return the line separator string
+   */
+  public String getNormCR() {
+    return NORM_CR;
+  }
+} \ No newline at end of file