/**
 * Copyright 2006 by Know-Center, Graz, Austria
 * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
 * joint initiative of the Federal Chancellery Austria and Graz University of
 * Technology.
 *
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * http://www.osor.eu/eupl/
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 *
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 *
 * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
 */
package at.knowcenter.wag.egov.egiz.tools;

import java.io.Serializable;
import java.util.regex.Pattern;

/**
 * This is the first version implementing a normalizer method. The normalize statements are
 * performed by using regular expressions.
 *
 * @author wlackner
 */
public class NormalizeV01 implements Normalize, Serializable {

  // 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method
  // to allow multiple line breaks to not be normalized to a single one

  /**
   * SVUID.
   */
  private static final long serialVersionUID = 2302956630639871601L;

  /**
   * The space string (U+0020).
   */
  private final static String NORM_SP = " ";

  /**
   * The line break string --> use only \n because XML-Parser ignores \r\n
   */
  private final static String NORM_CR = "\n";

  /**
   * The apostrophe string (U+0027).
   */
  private final static String NORM_AP = "'";

  /**
   * The quotation mark string (U+0022).
   */
  private final static String NORM_QU = "\"";

  /**
   * The hyphen string (U+002D).
   */
  private final static String NORM_HY = "-";

  /**
   * The current version string
   */
  protected static final String VERSION = "V01";

  // The regular expressions below are compiled once instead of on every normalize() call.
  // Pattern instances are immutable, so sharing them between calls is safe.

  /** One or more NUL characters (U+0000). */
  private static final Pattern NULL_CHARS = Pattern.compile("\u0000+");

  /** A single tab or form feed. */
  private static final Pattern TABS_AND_FORM_FEEDS = Pattern.compile("[\t\f]");

  /** One or more non breaking spaces (U+00A0). */
  private static final Pattern NON_BREAKING_SPACES = Pattern.compile("\u00a0+");

  /** A windows line break (CR followed by LF). */
  private static final Pattern WINDOWS_LINE_BREAK = Pattern.compile("\r\n");

  /** Two or more line breaks, possibly separated by other whitespace. */
  private static final Pattern MULTIPLE_LINE_BREAKS = Pattern.compile("\n[\\s\n]*\n");

  /** One or more spaces. */
  private static final Pattern SPACES = Pattern.compile(" +");

  /** A line break marker with an optional space on either side. */
  private static final Pattern SPACED_SINGLE_BREAK = Pattern.compile(" ?\r ?");

  /** Two adjacent line break markers with an optional space on either side. */
  private static final Pattern SPACED_DOUBLE_BREAK = Pattern.compile(" ?\r\r ?");

  /** The internal line break marker. */
  private static final Pattern BREAK_MARKER = Pattern.compile("\r");

  /** Grave and acute accent, single quotation mark variants (U+2018 to U+201B). */
  private static final Pattern APOSTROPHES =
      Pattern.compile("[\u0060\u00B4\u2018\u2019\u201A\u201B]");

  /** Double quotation mark variants (U+201C to U+201F). */
  private static final Pattern QUOTATIONS = Pattern.compile("[\u201C\u201D\u201E\u201F]");

  /** Soft hyphen, en dash and em dash. */
  private static final Pattern HYPHENS = Pattern.compile("[\u00AD\u2013\u2014]");

  /**
   * The empty constructor.
   */
  public NormalizeV01() {
  }

  /**
   * The normalizer implementation.
   *
   * Normalizer algorithm, in the order the steps are applied:
   * <ul>
   * <li>remove all NUL characters</li>
   * <li>replace all tabs and form feeds with spaces</li>
   * <li>replace each run of non breaking spaces with one space</li>
   * <li>code all windows and mac line breaks as \n</li>
   * <li>if keepMultipleLineBreaks == false: reduce each run of two or more line breaks
   * (optionally separated by whitespace) to two line break markers (\r\r)</li>
   * <li>code all remaining line breaks as the internal marker \r</li>
   * <li>normalize spaces: reduce each run of spaces to one space</li>
   * <li>remove a space before and after each line break marker</li>
   * <li>if keepMultipleLineBreaks == false: reduce each double marker to a single one</li>
   * <li>remove at most one leading and at most one trailing space or line break marker</li>
   * <li>normalize line breaks: code each marker as \n</li>
   * <li>normalize apostrophes</li>
   * <li>normalize quotations</li>
   * <li>normalize hyphens</li>
   * </ul>
   *
   * The empty string is returned if rawText is null, is empty, equals the literal text
   * "null" or consists of NUL characters only.
   *
   * @param rawText the text to normalize
   * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized
   *        to a single line break
   * @return the normalized text, never null
   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
   */
  public String normalize(String rawText, boolean keepMultipleLineBreaks) {
    if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
      return "";
    }

    // replace all null values
    String normText = NULL_CHARS.matcher(rawText).replaceAll("");
    // replace all Tabs and form feeds with spaces
    normText = TABS_AND_FORM_FEEDS.matcher(normText).replaceAll(NORM_SP);
    // replace all non breaking spaces with normal spaces
    normText = NON_BREAKING_SPACES.matcher(normText).replaceAll(NORM_SP);

    // code all windows line breaks as \n
    normText = WINDOWS_LINE_BREAK.matcher(normText).replaceAll("\n");
    // code all mac line breaks as \n
    normText = normText.replace('\r', '\n');

    if (!keepMultipleLineBreaks) {
      // reduce each run of multiple line breaks to the double marker \r\r;
      // the double marker is reduced to a single one further below
      normText = MULTIPLE_LINE_BREAKS.matcher(normText).replaceAll("\r\r");
    }
    // from here on \r is the internal marker for a line break
    normText = normText.replace('\n', '\r');

    // normalize spaces
    normText = SPACES.matcher(normText).replaceAll(NORM_SP);
    // remove spaces before and after a single line break
    normText = SPACED_SINGLE_BREAK.matcher(normText).replaceAll("\r");
    // remove spaces before and after multiple line breaks and, unless multiple line
    // breaks are to be kept, reduce them to a single one
    normText = SPACED_DOUBLE_BREAK.matcher(normText)
        .replaceAll(keepMultipleLineBreaks ? "\r\r" : "\r");

    // Input made up of NUL characters only is empty by now; without this guard the
    // charAt() calls below would throw a StringIndexOutOfBoundsException.
    int length = normText.length();
    if (length == 0) {
      return "";
    }

    // remove leading and trailing space or line break in the string (one character each)
    char first = normText.charAt(0);
    char last = normText.charAt(length - 1);
    int startIdx = (first == ' ' || first == '\r') ? 1 : 0;
    int endIdx = (last == ' ' || last == '\r') ? length - 1 : length;
    if (endIdx < startIdx) {
      // a single removable character counts as leading and trailing at once
      endIdx = startIdx;
    }
    normText = normText.substring(startIdx, endIdx);

    // normalize line breaks
    normText = BREAK_MARKER.matcher(normText).replaceAll(NORM_CR);
    // normalize apostrophes
    normText = APOSTROPHES.matcher(normText).replaceAll(NORM_AP);
    // normalize quotations
    normText = QUOTATIONS.matcher(normText).replaceAll(NORM_QU);
    // normalize hyphens
    normText = HYPHENS.matcher(normText).replaceAll(NORM_HY);

    return normText;
  }

  /**
   * Return the version string.
   *
   * @return the version string of this normalizer
   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
   */
  public String getVersion() {
    return VERSION;
  }

  /**
   * Returns the normalizer line separator string.
   *
   * @return the line separator string
   */
  public String getNormCR() {
    return NORM_CR;
  }
}