/* * * Copyright (c) 2006 by Know-Center, Graz, Austria * * * This software is the confidential and proprietary information of Know-Center, * Graz, Austria. You shall not disclose such Confidential Information and shall * use it only in accordance with the terms of the license agreement you entered * into with Know-Center. * * KNOW-CENTER MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE * SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, * OR NON-INFRINGEMENT. KNOW-CENTER SHALL NOT BE LIABLE FOR ANY DAMAGES * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING * THIS SOFTWARE OR ITS DERIVATIVES. * * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $ */ package at.knowcenter.wag.egov.egiz.tools; import java.io.Serializable; /** * This ist the first version implementing a normalizer method. The normalize statements are * performed by using regular expressions. * * @author wlackner */ public class NormalizeV01 implements Normalize, Serializable { /** * SVUID. */ private static final long serialVersionUID = 2302956630639871601L; /** * The space string */ private final static String NORM_SP = " "; //\u0020 /** * The line break string --> use only \n because XML-Parser ignores \r\n */ private final static String NORM_CR = "\n"; // /** * The apostrophe string */ private final static String NORM_AP = "'"; //\u0027 /** * The quotation mark string */ private final static String NORM_QU = "\""; //\u0022 /** * The hypens string */ private final static String NORM_HY = "-"; //\u002D /** * The current version string */ protected static final String VERSION = "V01"; /** * The empty constructor. */ public NormalizeV01() { } /** * The normalizer implementation.
* Normalizer algorithums: *

code all multiple line breaks as \n\n
replace all Tabs and form feeds with spaces
code line breaks as \n
reduce all multiple line breaks into one line break, code line break as \r
replace all single line breaks with space
normalize spaces
remove spaces before and after a line break
remove leading and trailing space or line break in the string
normalize line breaks
normalize apostrophes
normalize quotations
normalize hypens

* * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String) */ public String normalize(String rawText) { if (rawText == null || rawText.equals("null") || rawText.length() == 0) { return ""; } String normText = rawText; // replace all null values normText = normText.replaceAll("\u0000+", ""); // replace all Tabs and form feeds with spaces normText = normText.replaceAll("[\t\f]", NORM_SP); // replace all non breaking spaces with normal spaces normText = normText.replaceAll("\u00a0+", NORM_SP); // code all windows line breaks as \n normText = normText.replaceAll("\r\n", "\n"); // code all mac line breaks as \n normText = normText.replace('\r', '\n'); // reduce all multiple line breaks into two line breaks, code muliple line break as \r\r normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r"); // replace all single line breaks with one line break normText = normText.replace('\n', '\r'); // normalize spaces normText = normText.replaceAll(" +", NORM_SP); // remove spaces before and after a single line break normText = normText.replaceAll(" ?\r ?", "\r"); // remove spaces before and after a multiple line breaks normText = normText.replaceAll(" ?\r\r ?", "\r"); // remove leading and trailing space or line break in the string int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0); int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length()); if (end_idx < start_idx) { end_idx = start_idx; } // System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" + // normText_.length()); normText = normText.substring(start_idx, end_idx); // normalize line breaks normText = normText.replaceAll("\r", NORM_CR); // normalize apostrophes normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP); // normalize quotations normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU); // normalize hypens normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY); return normText; } /** * Return the version string. * * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion() */ public String getVersion() { return VERSION; } /** * Returns the normalizer line separator string. * @return the line separator string */ public String getNormCR() { return NORM_CR; } }