aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
diff options
context:
space:
mode:
authortknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2013-01-09 15:41:29 +0000
committertknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2013-01-09 15:41:29 +0000
commit535a04fa05f739ec16dd81666e3b0f82dfbd442d (patch)
tree0804f301c1a9ceb303a8441b7b29244fc8eb7ff0 /src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
parent1efaf6fd5619dfa95c9d7e8c71eda4c2ffba4998 (diff)
downloadpdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.gz
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.bz2
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.zip
pdf-as-lib maven project files moved to pdf-as-lib
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java184
1 files changed, 0 insertions, 184 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
deleted file mode 100644
index 57b8e6f..0000000
--- a/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/**
- * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
- * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
- * joint initiative of the Federal Chancellery Austria and Graz University of
- * Technology.
- *
- * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
- * the European Commission - subsequent versions of the EUPL (the "Licence");
- * You may not use this work except in compliance with the Licence.
- * You may obtain a copy of the Licence at:
- * http://www.osor.eu/eupl/
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the Licence is distributed on an "AS IS" basis,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the Licence for the specific language governing permissions and
- * limitations under the Licence.
- *
- * This product combines work with different licenses. See the "NOTICE" text
- * file for details on the various modules and licenses.
- * The "NOTICE" text file is part of the distribution. Any derivative works
- * that you distribute must include a readable copy of the "NOTICE" text file.
- *
- * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
- */
-package at.knowcenter.wag.egov.egiz.tools;
-
-import java.io.Serializable;
-
-/**
- * This is the first version implementing a normalizer method. The normalize statements are
- * performed by using regular expressions.
- *
- * @author wlackner
- */
-public class NormalizeV01 implements Normalize, Serializable {
-// 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method
-// to allow multiple line breaks to not be normalized to a single one
- /**
- * Serial version UID.
- */
- private static final long serialVersionUID = 2302956630639871601L;
-
- /**
- * The space string
- */
- private final static String NORM_SP = " "; //\u0020
- /**
- * The line break string --> use only \n because XML-Parser ignores \r\n
- */
- private final static String NORM_CR = "\n"; //
- /**
- * The apostrophe string
- */
- private final static String NORM_AP = "'"; //\u0027
- /**
- * The quotation mark string
- */
- private final static String NORM_QU = "\""; //\u0022
- /**
- * The hyphen string
- */
- private final static String NORM_HY = "-"; //\u002D
- /**
- * The current version string
- */
- protected static final String VERSION = "V01";
-
- /**
- * The empty constructor.
- */
- public NormalizeV01() {
- }
-
- /**
- * The normalizer implementation. <br>
- * Normalizer algorithm:
- * <ol>
- * <li>code all multiple line breaks as \n\n</li>
- * <li>replace all Tabs and form feeds with spaces</li>
- * <li>code line breaks as \n</li>
- * <li>reduce all multiple line breaks into one line break (only if keepMultipleLineBreaks == false), code line break as \r</li>
- * <li>replace all single line breaks with space</li>
- * <li>normalize spaces</li>
- * <li>remove spaces before and after a line break</li>
- * <li>remove leading and trailing space or line break in the string</li>
- * <li>normalize line breaks</li>
- * <li>normalize apostrophes</li>
- * <li>normalize quotations</li>
- * <li>normalize hyphens</li>
- * </ol>
- *
- * @param rawText the text to normalize
- * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized to a single line break
- * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
- */
- public String normalize(String rawText, boolean keepMultipleLineBreaks) {
- if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
- return "";
- }
- String normText = rawText;
-
- // replace all null values
- normText = normText.replaceAll("\u0000+", "");
-
- // replace all Tabs and form feeds with spaces
- normText = normText.replaceAll("[\t\f]", NORM_SP);
-
- // replace all non breaking spaces with normal spaces
- normText = normText.replaceAll("\u00a0+", NORM_SP);
-
- // code all windows line breaks as \n
- normText = normText.replaceAll("\r\n", "\n");
-
- // code all mac line breaks as \n
- normText = normText.replace('\r', '\n');
-
- if (!keepMultipleLineBreaks)
- {
- // reduce all multiple line breaks into two line breaks, code multiple line breaks as \r\r
- normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
- }
-
- // code all remaining line breaks as \r
- normText = normText.replace('\n', '\r');
-
- // normalize spaces
- normText = normText.replaceAll(" +", NORM_SP);
-
- // remove spaces before and after a single line break
- normText = normText.replaceAll(" ?\r ?", "\r");
-
- if (keepMultipleLineBreaks)
- {
- // remove spaces before and after multiple line breaks
- normText = normText.replaceAll(" ?\r\r ?", "\r\r");
- } else
- {
- normText = normText.replaceAll(" ?\r\r ?", "\r");
- }
-
- // remove leading and trailing space or line break in the string
- int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
- int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
- if (end_idx < start_idx) {
- end_idx = start_idx;
- }
-
- // System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" +
- // normText_.length());
- normText = normText.substring(start_idx, end_idx);
-
- // normalize line breaks
- normText = normText.replaceAll("\r", NORM_CR);
-
- // normalize apostrophes
- normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
-
- // normalize quotations
- normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
-
- // normalize hyphens
- normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
-
- return normText;
- }
-
- /**
- * Return the version string.
- *
- * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
- */
- public String getVersion() {
- return VERSION;
- }
-
- /**
- * Returns the normalizer line separator string.
- * @return the line separator string
- */
- public String getNormCR() {
- return NORM_CR;
- }
-} \ No newline at end of file