aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java')
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java184
1 files changed, 184 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
new file mode 100644
index 0000000..57b8e6f
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
@@ -0,0 +1,184 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.tools;
+
+import java.io.Serializable;
+
+/**
+ * This ist the first version implementing a normalizer method. The normalize statements are
+ * performed by using regular expressions.
+ *
+ * @author wlackner
+ */
+public class NormalizeV01 implements Normalize, Serializable {
+// 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method
+// to allow multiple line breaks to not be normalized to a single one
+ /**
+ * SVUID.
+ */
+ private static final long serialVersionUID = 2302956630639871601L;
+
+ /**
+ * The space string
+ */
+ private final static String NORM_SP = " "; //\u0020
+ /**
+ * The line break string --> use only \n because XML-Parser ignores \r\n
+ */
+ private final static String NORM_CR = "\n"; //
+ /**
+ * The apostrophe string
+ */
+ private final static String NORM_AP = "'"; //\u0027
+ /**
+ * The quotation mark string
+ */
+ private final static String NORM_QU = "\""; //\u0022
+ /**
+ * The hypens string
+ */
+ private final static String NORM_HY = "-"; //\u002D
+ /**
+ * The current version string
+ */
+ protected static final String VERSION = "V01";
+
+ /**
+ * The empty constructor.
+ */
+ public NormalizeV01() {
+ }
+
+ /**
+ * The normalizer implementation. <br>
+ * Normalizer algorithums:
+ * <ol>
+ * <li>code all multiple line breaks as \n\n</li>
+ * <li>replace all Tabs and form feeds with spaces</li>
+ * <li>code line breaks as \n</li>
+ * <li>reduce all multiple line breaks into one line break (only if keepMultipleLineBreaks == false), code line break as \r</li>
+ * <li>replace all single line breaks with space</li>
+ * <li>normalize spaces</li>
+ * <li>remove spaces before and after a line break</li>
+ * <li>remove leading and trailing space or line break in the string</li>
+ * <li>normalize line breaks</li>
+ * <li>normalize apostrophes</li>
+ * <li>normalize quotations</li>
+ * <li>normalize hypens</li>
+ * </ol>
+ *
+ * @param rawText the text to normalize
+ * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized to a single line break
+ * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
+ */
+ public String normalize(String rawText, boolean keepMultipleLineBreaks) {
+ if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
+ return "";
+ }
+ String normText = rawText;
+
+ // replace all null values
+ normText = normText.replaceAll("\u0000+", "");
+
+ // replace all Tabs and form feeds with spaces
+ normText = normText.replaceAll("[\t\f]", NORM_SP);
+
+ // replace all non breaking spaces with normal spaces
+ normText = normText.replaceAll("\u00a0+", NORM_SP);
+
+ // code all windows line breaks as \n
+ normText = normText.replaceAll("\r\n", "\n");
+
+ // code all mac line breaks as \n
+ normText = normText.replace('\r', '\n');
+
+ if (!keepMultipleLineBreaks)
+ {
+ // reduce all multiple line breaks into two line breaks, code muliple line break as \r\r
+ normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
+ }
+
+ // replace all single line breaks with one line break
+ normText = normText.replace('\n', '\r');
+
+ // normalize spaces
+ normText = normText.replaceAll(" +", NORM_SP);
+
+ // remove spaces before and after a single line break
+ normText = normText.replaceAll(" ?\r ?", "\r");
+
+ if (keepMultipleLineBreaks)
+ {
+ // remove spaces before and after a multiple line breaks
+ normText = normText.replaceAll(" ?\r\r ?", "\r\r");
+ } else
+ {
+ normText = normText.replaceAll(" ?\r\r ?", "\r");
+ }
+
+ // remove leading and trailing space or line break in the string
+ int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
+ int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
+ if (end_idx < start_idx) {
+ end_idx = start_idx;
+ }
+
+ // System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" +
+ // normText_.length());
+ normText = normText.substring(start_idx, end_idx);
+
+ // normalize line breaks
+ normText = normText.replaceAll("\r", NORM_CR);
+
+ // normalize apostrophes
+ normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
+
+ // normalize quotations
+ normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
+
+ // normalize hypens
+ normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
+
+ return normText;
+ }
+
+ /**
+ * Return the version string.
+ *
+ * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
+ */
+ public String getVersion() {
+ return VERSION;
+ }
+
+ /**
+ * Returns the normalizer line separator string.
+ * @return the line separator string
+ */
+ public String getNormCR() {
+ return NORM_CR;
+ }
+} \ No newline at end of file