/*
 * Reconstructed from the patch of commit 9f441a0aaf0e55b50014e814410c61117f7330c4
 * Author:  mcentner
 * Date:    Fri, 12 Sep 2008 13:50:31 +0000
 * Subject: Add text validation.
 * git-svn-id: https://joinup.ec.europa.eu/svn/mocca/trunk@35 8a26b1a7-26f0-462f-b9ef-d0e30c41f5a4
 * File:    BKUViewer/src/main/java/at/gv/egiz/bku/text/TextValidator.java
 */
package at.gv.egiz.bku.text;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import at.gv.egiz.bku.viewer.ValidationException;
import at.gv.egiz.bku.viewer.Validator;

/**
 * Validator that accepts a character stream only if every decoded character
 * belongs to a fixed whitelist (see {@link #isAllowed(char)}).
 */
public class TextValidator implements Validator {

  /**
   * Logging facility.
   */
  protected static Log log = LogFactory.getLog(TextValidator.class);

  /** Number of characters decoded per read. */
  private static final int BUFFER_SIZE = 256;

  /**
   * Logs the offending character and rejects the input.
   *
   * @param c the character that is not part of the whitelist
   * @throws ValidationException always
   */
  private void invalid(char c) throws ValidationException {
    log.info("Invalid character (0x" + Integer.toHexString(c) + ") found.");
    // TODO: localize
    throw new ValidationException();
  }

  /**
   * Tells whether the given character is part of the accepted repertoire.
   *
   * <p>Accepted are: TAB, LF, FF and CR; U+0020 to U+007E; U+00A1 to U+017F
   * (Latin-1 Supplement without the C1 controls, NBSP excluded, plus Latin
   * Extended-A); the spacing modifier letters U+02C7, U+02D8, U+02D9, U+02DB
   * and U+02DD; the horizontal bar U+2015; and the euro sign U+20AC.</p>
   *
   * @param c the character to test
   * @return {@code true} if the character is allowed, {@code false} otherwise
   */
  private static boolean isAllowed(char c) {
    if (c < '\u0020') {
      // Of the C0 controls only TAB, LF, FF and CR are allowed.
      return c == '\t' || c == '\n' || c == '\f' || c == '\r';
    }
    if (c <= '\u007E') {
      // Printable Basic Latin.
      return true;
    }
    if (c >= '\u00A1' && c <= '\u017F') {
      // Latin-1 Supplement (from U+00A1) and Latin Extended-A are contiguous.
      return true;
    }
    switch (c) {
      case '\u02C7': // caron
      case '\u02D8': // breve
      case '\u02D9': // dot above
      case '\u02DB': // ogonek
      case '\u02DD': // double acute accent
      case '\u2015': // horizontal bar
      case '\u20AC': // euro sign
        return true;
      default:
        return false;
    }
  }

  /**
   * Decodes the given stream and checks every character against the
   * whitelist. The stream is read until end of input or until the first
   * rejected character; it is not closed by this method.
   *
   * @param is      the stream providing the text to validate
   * @param charset name of the charset used to decode {@code is}; if
   *                {@code null}, UTF-8 is used
   * @throws ValidationException if the charset is not supported, if reading
   *                             the stream fails, or if a character outside
   *                             the whitelist is found
   */
  @Override
  public void validate(InputStream is, String charset)
      throws ValidationException {

    InputStreamReader reader;
    if (charset != null) {
      try {
        reader = new InputStreamReader(is, charset);
      } catch (UnsupportedEncodingException e) {
        log.info("Charset '" + charset + "' not supported.", e);
        // TODO: localize
        throw new ValidationException(e);
      }
    } else {
      reader = new InputStreamReader(is, Charset.forName("UTF-8"));
    }

    try {
      CharBuffer cb = CharBuffer.allocate(BUFFER_SIZE);
      while (reader.read(cb) != -1) {
        // Switch the buffer from filling to draining.
        cb.flip();
        while (cb.hasRemaining()) {
          char c = cb.get();
          if (!isAllowed(c)) {
            invalid(c);
          }
        }
        // Reset position and limit so the next read can use the whole
        // buffer again. Without this the next read is a zero-length read
        // and the usable buffer shrinks with every short read.
        cb.clear();
      }
    } catch (IOException e) {
      // TODO: localize
      throw new ValidationException(e);
    }

  }

}