/**
 * Copyright 2006 by Know-Center, Graz, Austria
 * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
 * joint initiative of the Federal Chancellery Austria and Graz University of
 * Technology.
 *
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * http://www.osor.eu/eupl/
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 *
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 *
 * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $
 */
package at.knowcenter.wag.egov.egiz.tools;

import java.io.Serializable;

/**
 * This is the first version implementing a normalizer method. The normalize statements are
 * performed by using regular expressions.
 *
 * @author wlackner
 */
public class NormalizeV01 implements Normalize, Serializable
{
  // 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method
  // to allow multiple line breaks to not be normalized to a single one

  /**
   * SVUID.
   */
  private static final long serialVersionUID = 2302956630639871601L;

  /**
   * The space string (U+0020).
   */
  private static final String NORM_SP = " ";

  /**
   * The line break string --> use only \n because XML-Parser ignores \r\n
   */
  private static final String NORM_CR = "\n";

  /**
   * The apostrophe string (U+0027).
   */
  private static final String NORM_AP = "'";

  /**
   * The quotation mark string (U+0022).
   */
  private static final String NORM_QU = "\"";

  /**
   * The hyphen string (U+002D).
   */
  private static final String NORM_HY = "-";

  /**
   * The current version string
   */
  protected static final String VERSION = "V01";

  /**
   * The empty constructor.
   */
  public NormalizeV01()
  {
  }

  /**
   * The normalizer implementation.
   * <p>
   * Normalizer algorithm, in the order the steps are applied:
   * </p>
   * <ol>
   * <li>remove all NUL characters (U+0000); if nothing is left, the empty string is returned</li>
   * <li>replace all tabs and form feeds with spaces</li>
   * <li>replace each run of non breaking spaces (U+00A0) with one space</li>
   * <li>code all line breaks (\r\n and \r) as \n</li>
   * <li>only if keepMultipleLineBreaks == false: code every run of line breaks (including
   * white space between them) as the internal marker \r\r</li>
   * <li>code all remaining line breaks as the internal marker \r</li>
   * <li>normalize spaces: reduce each run of spaces to one space</li>
   * <li>remove one space before and after a line break</li>
   * <li>only if keepMultipleLineBreaks == false: reduce each double line break to a single
   * line break</li>
   * <li>remove one leading and one trailing space or line break in the string</li>
   * <li>normalize line breaks: code the internal marker \r as \n</li>
   * <li>normalize apostrophes</li>
   * <li>normalize quotations</li>
   * <li>normalize hyphens</li>
   * </ol>
   *
   * @param rawText
   *          the text to normalize; <code>null</code>, the empty string and the literal
   *          string "null" all yield the empty string
   * @param keepMultipleLineBreaks
   *          if true, multiple newlines in a row will not be normalized to a single line break
   * @return the normalized text, never <code>null</code>
   * @see at.knowcenter.wag.egov.egiz.tools.Normalize
   */
  public String normalize(String rawText, boolean keepMultipleLineBreaks)
  {
    if (rawText == null || rawText.equals("null") || rawText.length() == 0)
    {
      return "";
    }

    String normText = rawText;

    // remove all null values
    normText = normText.replaceAll("\u0000+", "");
    if (normText.length() == 0)
    {
      // The input consisted of NUL characters only. Without this guard the charAt(0) call in
      // the trimming step below would throw a StringIndexOutOfBoundsException. None of the
      // following replacements can turn a non-empty text into an empty one, so checking once
      // here is sufficient.
      return "";
    }

    // replace all Tabs and form feeds with spaces
    normText = normText.replaceAll("[\t\f]", NORM_SP);

    // replace all non breaking spaces with normal spaces
    normText = normText.replaceAll("\u00a0+", NORM_SP);

    // code all windows line breaks as \n
    normText = normText.replaceAll("\r\n", "\n");

    // code all mac line breaks as \n
    normText = normText.replace('\r', '\n');

    if (!keepMultipleLineBreaks)
    {
      // reduce all multiple line breaks into two line breaks, code multiple line break as \r\r
      normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r");
    }

    // code all remaining (single) line breaks as \r
    normText = normText.replace('\n', '\r');

    // normalize spaces
    normText = normText.replaceAll(" +", NORM_SP);

    // remove spaces before and after a single line break
    normText = normText.replaceAll(" ?\r ?", "\r");

    if (keepMultipleLineBreaks)
    {
      // remove spaces before and after a multiple line breaks
      normText = normText.replaceAll(" ?\r\r ?", "\r\r");
    }
    else
    {
      // reduce the double line break marker to a single line break
      normText = normText.replaceAll(" ?\r\r ?", "\r");
    }

    // remove leading and trailing space or line break in the string
    // (at most one character is removed on each side)
    char firstChar = normText.charAt(0);
    char lastChar = normText.charAt(normText.length() - 1);
    int startIdx = (firstChar == ' ' || firstChar == '\r') ? 1 : 0;
    int endIdx = (lastChar == ' ' || lastChar == '\r') ? normText.length() - 1 : normText.length();
    if (endIdx < startIdx)
    {
      // a text of exactly one removable character: first and last character are the same one
      endIdx = startIdx;
    }
    normText = normText.substring(startIdx, endIdx);

    // normalize line breaks
    normText = normText.replaceAll("\r", NORM_CR);

    // normalize apostrophes
    normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP);

    // normalize quotations
    normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU);

    // normalize hyphens
    normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY);

    return normText;
  }

  /**
   * Return the version string.
   *
   * @return the version string of this normalizer
   * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion()
   */
  public String getVersion()
  {
    return VERSION;
  }

  /**
   * Returns the normalizer line separator string.
   *
   * @return the line separator string
   */
  public String getNormCR()
  {
    return NORM_CR;
  }
}