/*
*
* Copyright (c) 2006 by Know-Center, Graz, Austria
*
*
* This software is the confidential and proprietary information of Know-Center,
* Graz, Austria. You shall not disclose such Confidential Information and shall
* use it only in accordance with the terms of the license agreement you entered
* into with Know-Center.
*
* KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE
* SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,
* OR NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES
* SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
* THIS SOFTWARE OR ITS DERIVATIVES.
*
* $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
*/
package at.knowcenter.wag.egov.egiz.tools;
import java.io.Serializable;
/**
* This ist the first version implementing a normalizer method. The normalize statements are
* performed by using regular expressions.
*
* @author wlackner
*/
public class NormalizeV01 implements Normalize, Serializable {
/**
* SVUID.
*/
private static final long serialVersionUID = 2302956630639871601L;
/**
* The space string
*/
private final static String NORM_SP = " "; //\u0020
/**
* The line break string --> use only \n because XML-Parser ignores \r\n
*/
private final static String NORM_CR = "\n"; //
/**
* The apostrophe string
*/
private final static String NORM_AP = "'"; //\u0027
/**
* The quotation mark string
*/
private final static String NORM_QU = "\""; //\u0022
/**
* The hypens string
*/
private final static String NORM_HY = "-"; //\u002D
/**
* The current version string
*/
protected static final String VERSION = "V01";
/**
* The empty constructor.
*/
public NormalizeV01() {
}
/**
* The normalizer implementation.
* Normalizer algorithums:
*
* - code all multiple line breaks as \n\n
* - replace all Tabs and form feeds with spaces
* - code line breaks as \n
* - reduce all multiple line breaks into one line break, code line break as \r
* - replace all single line breaks with space
* - normalize spaces
* - remove spaces before and after a line break
* - remove leading and trailing space or line break in the string
* - normalize line breaks
* - normalize apostrophes
* - normalize quotations
* - normalize hypens
*
*
* @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String)
*/
public String normalize(String rawText) {
if (rawText == null || rawText.equals("null") || rawText.length() == 0) {
return "";
}
String normText = rawText;
// replace all null values
normText = normText.replaceAll("\u0000+", "");
// replace all Tabs and form feeds with spaces
normText = normText.replaceAll("[\t\f]", NORM_SP);
// replace all non breaking spaces with normal spaces
normText = normText.replaceAll("\u00a0+", NORM_SP);
// code all windows line breaks as \n
normText = normText.replaceAll("\r\n", "\n");
// code all mac line breaks as \n
normText = normText.replace('\r', '\n');
// reduce all multiple line breaks into two line breaks, code muliple line break as \r\r
normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
// replace all single line breaks with one line break
normText = normText.replace('\n', '\r');
// normalize spaces
normText = normText.replaceAll(" +", NORM_SP);
// remove spaces before and after a single line break
normText = normText.replaceAll(" ?\r ?", "\r");
// remove spaces before and after a multiple line breaks
normText = normText.replaceAll(" ?\r\r ?", "\r");
// remove leading and trailing space or line break in the string
int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0);
int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length());
if (end_idx < start_idx) {
end_idx = start_idx;
}
// System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" +
// normText_.length());
normText = normText.substring(start_idx, end_idx);
// normalize line breaks
normText = normText.replaceAll("\r", NORM_CR);
// normalize apostrophes
normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);
// normalize quotations
normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);
// normalize hypens
normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);
return normText;
}
/**
* Return the version string.
*
* @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
*/
public String getVersion() {
return VERSION;
}
/**
* Returns the normalizer line separator string.
* @return the line separator string
*/
public String getNormCR() {
return NORM_CR;
}
}