From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/tools/NormalizeV01.java | 184 +++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java new file mode 100644 index 0000000..57b8e6f --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/tools/NormalizeV01.java @@ -0,0 +1,184 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: NormalizeV01.java,v 1.5 2006/10/31 08:20:56 wprinz Exp $ + */ +package at.knowcenter.wag.egov.egiz.tools; + +import java.io.Serializable; + +/** + * This ist the first version implementing a normalizer method. The normalize statements are + * performed by using regular expressions. + * + * @author wlackner + */ +public class NormalizeV01 implements Normalize, Serializable { +// 04.11.2010 change by exthex - added keepMultipleLineBreaks parameter to normalize method +// to allow multiple line breaks to not be normalized to a single one + /** + * SVUID. + */ + private static final long serialVersionUID = 2302956630639871601L; + + /** + * The space string + */ + private final static String NORM_SP = " "; //\u0020 + /** + * The line break string --> use only \n because XML-Parser ignores \r\n + */ + private final static String NORM_CR = "\n"; // + /** + * The apostrophe string + */ + private final static String NORM_AP = "'"; //\u0027 + /** + * The quotation mark string + */ + private final static String NORM_QU = "\""; //\u0022 + /** + * The hypens string + */ + private final static String NORM_HY = "-"; //\u002D + /** + * The current version string + */ + protected static final String VERSION = "V01"; + + /** + * The empty constructor. + */ + public NormalizeV01() { + } + + /** + * The normalizer implementation.
+ * Normalizer algorithums: + *
    + *
  1. code all multiple line breaks as \n\n
  2. + *
  3. replace all Tabs and form feeds with spaces
  4. + *
  5. code line breaks as \n
  6. + *
  7. reduce all multiple line breaks into one line break (only if keepMultipleLineBreaks == false), code line break as \r
  8. + *
  9. replace all single line breaks with space
  10. + *
  11. normalize spaces
  12. + *
  13. remove spaces before and after a line break
  14. + *
  15. remove leading and trailing space or line break in the string
  16. + *
  17. normalize line breaks
  18. + *
  19. normalize apostrophes
  20. + *
  21. normalize quotations
  22. + *
  23. normalize hypens
  24. + *
+ * + * @param rawText the text to normalize + * @param keepMultipleLineBreaks if true, multiple newlines in a row will not be normalized to a single line break + * @see at.knowcenter.wag.egov.egiz.tools.Normalize#normalize(java.lang.String) + */ + public String normalize(String rawText, boolean keepMultipleLineBreaks) { + if (rawText == null || rawText.equals("null") || rawText.length() == 0) { + return ""; + } + String normText = rawText; + + // replace all null values + normText = normText.replaceAll("\u0000+", ""); + + // replace all Tabs and form feeds with spaces + normText = normText.replaceAll("[\t\f]", NORM_SP); + + // replace all non breaking spaces with normal spaces + normText = normText.replaceAll("\u00a0+", NORM_SP); + + // code all windows line breaks as \n + normText = normText.replaceAll("\r\n", "\n"); + + // code all mac line breaks as \n + normText = normText.replace('\r', '\n'); + + if (!keepMultipleLineBreaks) + { + // reduce all multiple line breaks into two line breaks, code muliple line break as \r\r + normText = normText.replaceAll("\n[\\s\n]*\n", "\r\r"); + } + + // replace all single line breaks with one line break + normText = normText.replace('\n', '\r'); + + // normalize spaces + normText = normText.replaceAll(" +", NORM_SP); + + // remove spaces before and after a single line break + normText = normText.replaceAll(" ?\r ?", "\r"); + + if (keepMultipleLineBreaks) + { + // remove spaces before and after a multiple line breaks + normText = normText.replaceAll(" ?\r\r ?", "\r\r"); + } else + { + normText = normText.replaceAll(" ?\r\r ?", "\r"); + } + + // remove leading and trailing space or line break in the string + int start_idx = (normText.charAt(0) == ' ' || normText.charAt(0) == '\r' ? 1 : 0); + int end_idx = (normText.charAt(normText.length() - 1) == ' ' || normText.charAt(normText.length() - 1) == '\r' ? normText.length() - 1 : normText.length()); + if (end_idx < start_idx) { + end_idx = start_idx; + } + + // System.err.println("Start idx:" + start_idx + " End idx:" + end_idx + " Text length:" + + // normText_.length()); + normText = normText.substring(start_idx, end_idx); + + // normalize line breaks + normText = normText.replaceAll("\r", NORM_CR); + + // normalize apostrophes + normText = normText.replaceAll("[\u0060\u00B4\u2018\u2019\u201A\u201B]", NORM_AP); + + // normalize quotations + normText = normText.replaceAll("[\u201C\u201D\u201E\u201F]", NORM_QU); + + // normalize hypens + normText = normText.replaceAll("[\u00AD\u2013\u2014]", NORM_HY); + + return normText; + } + + /** + * Return the version string. + * + * @see at.knowcenter.wag.egov.egiz.tools.Normalize#getVersion() + */ + public String getVersion() { + return VERSION; + } + + /** + * Returns the normalizer line separator string. + * @return the line separator string + */ + public String getNormCR() { + return NORM_CR; + } +} \ No newline at end of file -- cgit v1.2.3