From 9f441a0aaf0e55b50014e814410c61117f7330c4 Mon Sep 17 00:00:00 2001
From: mcentner
Date: Fri, 12 Sep 2008 13:50:31 +0000
Subject: Add text validation.

git-svn-id: https://joinup.ec.europa.eu/svn/mocca/trunk@35 8a26b1a7-26f0-462f-b9ef-d0e30c41f5a4
---
 .../java/at/gv/egiz/bku/text/TextValidator.java    | 115 ++++++++++++++-
 .../at/gv/egiz/bku/text/TestTextValidator.java     | 155 +++++++++++++++++++++
 2 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 BKUViewer/src/test/java/at/gv/egiz/bku/text/TestTextValidator.java

(limited to 'BKUViewer')

diff --git a/BKUViewer/src/main/java/at/gv/egiz/bku/text/TextValidator.java b/BKUViewer/src/main/java/at/gv/egiz/bku/text/TextValidator.java
index 5108140d..485aa727 100644
--- a/BKUViewer/src/main/java/at/gv/egiz/bku/text/TextValidator.java
+++ b/BKUViewer/src/main/java/at/gv/egiz/bku/text/TextValidator.java
@@ -16,17 +16,130 @@
  */
 package at.gv.egiz.bku.text;
 
+import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 
 import at.gv.egiz.bku.viewer.ValidationException;
 import at.gv.egiz.bku.viewer.Validator;
 
+/**
+ * Validator for plain text that accepts a fixed set of characters only.
+ * <p>
+ * Accepted are TAB, LF, FF and CR, Basic Latin (0x0020-0x007E), Latin-1
+ * Supplement (0x00A1-0x00FF), Latin Extended-A (0x0100-0x017F), the spacing
+ * modifier letters 0x02C7, 0x02D8, 0x02D9, 0x02DB and 0x02DD, the horizontal
+ * bar 0x2015 and the euro sign 0x20AC. Any other character is rejected.
+ */
 public class TextValidator implements Validator {
 
+  /**
+   * Logging facility.
+   */
+  protected static Log log = LogFactory.getLog(TextValidator.class);
+
+  /**
+   * Number of characters requested from the reader per read.
+   */
+  private static final int BUFFER_SIZE = 256;
+
+  /**
+   * Logs the offending character and aborts the validation.
+   *
+   * @param c the character that is not accepted
+   * @throws ValidationException always
+   */
+  private void invalid(char c) throws ValidationException {
+    log.info("Invalid character (0x" + Integer.toHexString(c) + ") found.");
+    // TODO: localize
+    throw new ValidationException();
+  }
+
+  /**
+   * Tells whether a character belongs to the accepted set.
+   *
+   * @param c the character to check
+   * @return <code>true</code> if the character is accepted
+   */
+  private static boolean isValid(char c) {
+    if (c < '\u0020') {
+      // C0 Controls: TAB, LF (0x0009-0x000A) and FF, CR (0x000C-0x000D) only
+      return c == '\t' || c == '\n' || c == '\f' || c == '\r';
+    }
+    if (c <= '\u017F') {
+      // Basic Latin (0x0020-0x007E), Latin-1 Supplement (0x00A1-0x00FF) and
+      // Latin Extended-A (0x0100-0x017F); 0x007F-0x00A0 is not accepted
+      return c <= '\u007E' || c >= '\u00A1';
+    }
+    switch (c) {
+      case '\u02C7': // Spacing Modifier Letters
+      case '\u02D8':
+      case '\u02D9':
+      case '\u02DB':
+      case '\u02DD':
+      case '\u2015': // General Punctuation
+      case '\u20AC': // Currency Symbols (EURO Sign)
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  /**
+   * Decodes the stream and checks every character against the accepted set.
+   * The stream is not closed by this method.
+   *
+   * @param is the text to validate
+   * @param charset the name of the charset used to decode the text, or
+   *          <code>null</code> to decode it as UTF-8
+   * @throws ValidationException if the charset is not supported, the text
+   *           cannot be read or it contains a character that is not accepted
+   */
   @Override
   public void validate(InputStream is, String charset)
       throws ValidationException {
-    // TODO: implement character validation
+
+    InputStreamReader reader;
+    if (charset != null) {
+      try {
+        reader = new InputStreamReader(is, charset);
+      } catch (UnsupportedEncodingException e) {
+        log.info("Charset '" + charset + "' not supported.", e);
+        // TODO: localize
+        throw new ValidationException(e);
+      }
+    } else {
+      reader = new InputStreamReader(is, Charset.forName("UTF-8"));
+    }
+
+    try {
+      CharBuffer cb = CharBuffer.allocate(BUFFER_SIZE);
+      while (reader.read(cb) != -1) {
+        cb.flip();
+        while (cb.hasRemaining()) {
+          char c = cb.get();
+          if (!isValid(c)) {
+            invalid(c);
+          }
+        }
+        // Make the whole buffer writable again; once drained, position
+        // equals limit and the next read would have no room to store data.
+        cb.clear();
+      }
+    } catch (IOException e) {
+      log.info("Failed to read text.", e);
+      // TODO: localize
+      throw new ValidationException(e);
+    }
+
   }
 
 }
diff --git a/BKUViewer/src/test/java/at/gv/egiz/bku/text/TestTextValidator.java
b/BKUViewer/src/test/java/at/gv/egiz/bku/text/TestTextValidator.java
new file mode 100644
index 00000000..7137911d
--- /dev/null
+++ b/BKUViewer/src/test/java/at/gv/egiz/bku/text/TestTextValidator.java
@@ -0,0 +1,155 @@
+/*
+* Copyright 2008 Federal Chancellery Austria and
+* Graz University of Technology
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package at.gv.egiz.bku.text;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import at.gv.egiz.bku.viewer.ValidationException;
+import at.gv.egiz.bku.viewer.Validator;
+import at.gv.egiz.bku.viewer.ValidatorFactory;
+
+/** Tests the validator that ValidatorFactory returns for text/plain. */
+public class TestTextValidator {
+
+  /** Heading of each section written by {@link #generateText(String)}. */
+  private static final String[] TITLES = {
+    "C0 Controls and Basic Latin 0x0009-0x000A",
+    "C0 Controls and Basic Latin 0x000C-0x000D",
+    "C0 Controls and Basic Latin 0x0020-0x007E",
+    "C1 Controls and Latin-1 Supplement 0x00A1-0x00FF",
+    "Latin Extended-A 0x0100-0x017F",
+    "Spacing Modifier Letters 0x02C7",
+    "Spacing Modifier Letters 0x02D8",
+    "Spacing Modifier Letters 0x02D9",
+    "Spacing Modifier Letters 0x02DB",
+    "Spacing Modifier Letters 0x02DD",
+    "General Punctuation 0x2015",
+    "Currency Symbols 0x20AC"
+  };
+
+  /** First and last character of each section, in the order of TITLES. */
+  private static final char[][] RANGES = {
+    { '\t', '\n' },
+    { '\f', '\r' },
+    { '\u0020', '\u007E' },
+    { '\u00A1', '\u00FF' },
+    { '\u0100', '\u017F' },
+    { '\u02C7', '\u02C7' },
+    { '\u02D8', '\u02D8' },
+    { '\u02D9', '\u02D9' },
+    { '\u02DB', '\u02DB' },
+    { '\u02DD', '\u02DD' },
+    { '\u2015', '\u2015' },
+    { '\u20AC', '\u20AC' }
+  };
+
+  /**
+   * Generates a text that consists of every character the text validator
+   * accepts. Each section is a heading line followed by the characters of
+   * the section; the sections are separated by a line feed.
+   *
+   * @param encoding the name of the charset used to encode the text
+   * @return the encoded text
+   * @throws UnsupportedEncodingException if the charset is not supported
+   */
+  public static byte[] generateText(String encoding) throws UnsupportedEncodingException {
+    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+    PrintWriter writer = new PrintWriter(new OutputStreamWriter(bos, encoding));
+    for (int i = 0; i < TITLES.length; i++) {
+      if (i > 0) {
+        writer.write("\n");
+      }
+      writer.write(TITLES[i]);
+      writer.write("\n");
+      for (int c = RANGES[i][0]; c <= RANGES[i][1]; c++) {
+        writer.write(c);
+      }
+    }
+    writer.flush();
+    return bos.toByteArray();
+  }
+
+  /**
+   * Validates the generated text, encoded and decoded with the same charset.
+   *
+   * @param encoding the name of the charset to use
+   */
+  public void testTextValidation(String encoding) throws ValidationException, UnsupportedEncodingException {
+    Validator validator = ValidatorFactory.newValidator("text/plain");
+    assertNotNull(validator);
+    InputStream is = new ByteArrayInputStream(generateText(encoding));
+    validator.validate(is, encoding);
+  }
+
+  /**
+   * A vertical tab (0x000B) is not among the accepted characters.
+   */
+  @Test(expected = ValidationException.class)
+  public void testInvalidCharacter() throws ValidationException, UnsupportedEncodingException {
+    Validator validator = ValidatorFactory.newValidator("text/plain");
+    assertNotNull(validator);
+    validator.validate(new ByteArrayInputStream(new byte[] { 0x0B }), "UTF-8");
+  }
+
+  @Test
+  public void testUTF8() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("UTF-8");
+  }
+
+  @Test
+  public void testISO8859_1() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("ISO-8859-1");
+  }
+
+  @Test
+  public void testISO8859_2() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("ISO-8859-2");
+  }
+
+  @Test
+  public void testISO8859_3() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("ISO-8859-3");
+  }
+
+  @Test
+  public void testISO8859_9() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("ISO-8859-9");
+  }
+
+  // NOTE(review): presumably ignored because not every JRE supports ISO-8859-10 - confirm
+  @Ignore
+  @Test
+  public void testISO8859_10() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("ISO-8859-10");
+  }
+
+  @Test
+  public void testISO8859_15() throws ValidationException, UnsupportedEncodingException {
+    testTextValidation("ISO-8859-15");
+  }
+
+}
-- 
cgit v1.2.3