aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/com/lowagie/text/pdf/hyphenation
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/com/lowagie/text/pdf/hyphenation')
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/ByteVector.java124
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/CharVector.java134
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/Hyphen.java68
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenation.java83
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationException.java28
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationTree.java454
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenator.java240
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/PatternConsumer.java55
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/SimplePatternParser.java278
-rw-r--r--src/main/java/com/lowagie/text/pdf/hyphenation/TernaryTree.java667
10 files changed, 2131 insertions, 0 deletions
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/ByteVector.java b/src/main/java/com/lowagie/text/pdf/hyphenation/ByteVector.java
new file mode 100644
index 0000000..f3a83ec
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/ByteVector.java
@@ -0,0 +1,124 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.io.Serializable;
+
+/**
+ * This class implements a simple byte vector with access to the
+ * underlying array.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public class ByteVector implements Serializable {
+
+ /**
+ * Capacity increment size
+ */
+ private static final int DEFAULT_BLOCK_SIZE = 2048;
+ private int blockSize;
+
+ /**
+ * The encapsulated array
+ */
+ private byte[] array;
+
+ /**
+ * Points to next free item
+ */
+ private int n;
+
+ public ByteVector() {
+ this(DEFAULT_BLOCK_SIZE);
+ }
+
+ public ByteVector(int capacity) {
+ if (capacity > 0) {
+ blockSize = capacity;
+ } else {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = new byte[blockSize];
+ n = 0;
+ }
+
+ public ByteVector(byte[] a) {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ array = a;
+ n = 0;
+ }
+
+ public ByteVector(byte[] a, int capacity) {
+ if (capacity > 0) {
+ blockSize = capacity;
+ } else {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = a;
+ n = 0;
+ }
+
+ public byte[] getArray() {
+ return array;
+ }
+
+ /**
+ * return number of items in array
+ */
+ public int length() {
+ return n;
+ }
+
+ /**
+ * returns current capacity of array
+ */
+ public int capacity() {
+ return array.length;
+ }
+
+ public void put(int index, byte val) {
+ array[index] = val;
+ }
+
+ public byte get(int index) {
+ return array[index];
+ }
+
+ /**
+ * This is to implement memory allocation in the array. Like malloc().
+ */
+ public int alloc(int size) {
+ int index = n;
+ int len = array.length;
+ if (n + size >= len) {
+ byte[] aux = new byte[len + blockSize];
+ System.arraycopy(array, 0, aux, 0, len);
+ array = aux;
+ }
+ n += size;
+ return index;
+ }
+
+ public void trimToSize() {
+ if (n < array.length) {
+ byte[] aux = new byte[n];
+ System.arraycopy(array, 0, aux, 0, n);
+ array = aux;
+ }
+ }
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/CharVector.java b/src/main/java/com/lowagie/text/pdf/hyphenation/CharVector.java
new file mode 100644
index 0000000..777cc7a
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/CharVector.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.io.Serializable;
+
+/**
+ * This class implements a simple char vector with access to the
+ * underlying array.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public class CharVector implements Cloneable, Serializable {
+
+ /**
+ * Capacity increment size
+ */
+ private static final int DEFAULT_BLOCK_SIZE = 2048;
+ private int blockSize;
+
+ /**
+ * The encapsulated array
+ */
+ private char[] array;
+
+ /**
+ * Points to next free item
+ */
+ private int n;
+
+ public CharVector() {
+ this(DEFAULT_BLOCK_SIZE);
+ }
+
+ public CharVector(int capacity) {
+ if (capacity > 0) {
+ blockSize = capacity;
+ } else {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = new char[blockSize];
+ n = 0;
+ }
+
+ public CharVector(char[] a) {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ array = a;
+ n = a.length;
+ }
+
+ public CharVector(char[] a, int capacity) {
+ if (capacity > 0) {
+ blockSize = capacity;
+ } else {
+ blockSize = DEFAULT_BLOCK_SIZE;
+ }
+ array = a;
+ n = a.length;
+ }
+
+ /**
+ * Reset Vector but don't resize or clear elements
+ */
+ public void clear() {
+ n = 0;
+ }
+
+ public Object clone() {
+ CharVector cv = new CharVector((char[])array.clone(), blockSize);
+ cv.n = this.n;
+ return cv;
+ }
+
+ public char[] getArray() {
+ return array;
+ }
+
+ /**
+ * return number of items in array
+ */
+ public int length() {
+ return n;
+ }
+
+ /**
+ * returns current capacity of array
+ */
+ public int capacity() {
+ return array.length;
+ }
+
+ public void put(int index, char val) {
+ array[index] = val;
+ }
+
+ public char get(int index) {
+ return array[index];
+ }
+
+ public int alloc(int size) {
+ int index = n;
+ int len = array.length;
+ if (n + size >= len) {
+ char[] aux = new char[len + blockSize];
+ System.arraycopy(array, 0, aux, 0, len);
+ array = aux;
+ }
+ n += size;
+ return index;
+ }
+
+ public void trimToSize() {
+ if (n < array.length) {
+ char[] aux = new char[n];
+ System.arraycopy(array, 0, aux, 0, n);
+ array = aux;
+ }
+ }
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphen.java b/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphen.java
new file mode 100644
index 0000000..7e2522c
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphen.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.io.Serializable;
+
+/**
+ * This class represents a hyphen. A 'full' hyphen is made of 3 parts:
+ * the pre-break text, post-break text and no-break. If no line-break
+ * is generated at this position, the no-break text is used, otherwise,
+ * pre-break and post-break are used. Typically, pre-break is equal to
+ * the hyphen character and the others are empty. However, this general
+ * scheme allows support for cases in some languages where words change
+ * spelling if they're split across lines, like german's 'backen' which
+ * hyphenates 'bak-ken'. BTW, this comes from TeX.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+
+public class Hyphen implements Serializable {
+ public String preBreak;
+ public String noBreak;
+ public String postBreak;
+
+ Hyphen(String pre, String no, String post) {
+ preBreak = pre;
+ noBreak = no;
+ postBreak = post;
+ }
+
+ Hyphen(String pre) {
+ preBreak = pre;
+ noBreak = null;
+ postBreak = null;
+ }
+
+ public String toString() {
+ if (noBreak == null
+ && postBreak == null
+ && preBreak != null
+ && preBreak.equals("-")) {
+ return "-";
+ }
+ StringBuffer res = new StringBuffer("{");
+ res.append(preBreak);
+ res.append("}{");
+ res.append(postBreak);
+ res.append("}{");
+ res.append(noBreak);
+ res.append('}');
+ return res.toString();
+ }
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenation.java b/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenation.java
new file mode 100644
index 0000000..c86ab87
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenation.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+/**
+ * This class represents a hyphenated word.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public class Hyphenation {
+
+ private int[] hyphenPoints;
+ private String word;
+
+ /**
+ * number of hyphenation points in word
+ */
+ private int len;
+
+ /**
+ * rawWord as made of alternating strings and {@link Hyphen Hyphen}
+ * instances
+ */
+ Hyphenation(String word, int[] points) {
+ this.word = word;
+ hyphenPoints = points;
+ len = points.length;
+ }
+
+ /**
+ * @return the number of hyphenation points in the word
+ */
+ public int length() {
+ return len;
+ }
+
+ /**
+ * @return the pre-break text, not including the hyphen character
+ */
+ public String getPreHyphenText(int index) {
+ return word.substring(0, hyphenPoints[index]);
+ }
+
+ /**
+ * @return the post-break text
+ */
+ public String getPostHyphenText(int index) {
+ return word.substring(hyphenPoints[index]);
+ }
+
+ /**
+ * @return the hyphenation points
+ */
+ public int[] getHyphenationPoints() {
+ return hyphenPoints;
+ }
+
+ public String toString() {
+ StringBuffer str = new StringBuffer();
+ int start = 0;
+ for (int i = 0; i < len; i++) {
+ str.append(word.substring(start, hyphenPoints[i]) + "-");
+ start = hyphenPoints[i];
+ }
+ str.append(word.substring(start));
+ return str.toString();
+ }
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationException.java b/src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationException.java
new file mode 100644
index 0000000..575e18d
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationException.java
@@ -0,0 +1,28 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+/**
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public class HyphenationException extends Exception {
+
+ public HyphenationException(String msg) {
+ super(msg);
+ }
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationTree.java b/src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationTree.java
new file mode 100644
index 0000000..b0d4c0b
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/HyphenationTree.java
@@ -0,0 +1,454 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id: HyphenationTree.java,v 1.40 2005/07/16 16:49:34 blowagie Exp $ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * This tree structure stores the hyphenation patterns in an efficient
+ * way for fast lookup. It provides the provides the method to
+ * hyphenate a word.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public class HyphenationTree extends TernaryTree
+ implements PatternConsumer, Serializable {
+
+ /**
+ * value space: stores the inteletter values
+ */
+ protected ByteVector vspace;
+
+ /**
+ * This map stores hyphenation exceptions
+ */
+ protected HashMap stoplist;
+
+ /**
+ * This map stores the character classes
+ */
+ protected TernaryTree classmap;
+
+ /**
+ * Temporary map to store interletter values on pattern loading.
+ */
+ private transient TernaryTree ivalues;
+
+ public HyphenationTree() {
+ stoplist = new HashMap(23); // usually a small table
+ classmap = new TernaryTree();
+ vspace = new ByteVector();
+ vspace.alloc(1); // this reserves index 0, which we don't use
+ }
+
+ /**
+ * Packs the values by storing them in 4 bits, two values into a byte
+ * Values range is from 0 to 9. We use zero as terminator,
+ * so we'll add 1 to the value.
+ * @param values a string of digits from '0' to '9' representing the
+ * interletter values.
+ * @return the index into the vspace array where the packed values
+ * are stored.
+ */
+ protected int packValues(String values) {
+ int i, n = values.length();
+ int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
+ int offset = vspace.alloc(m);
+ byte[] va = vspace.getArray();
+ for (i = 0; i < n; i++) {
+ int j = i >> 1;
+ byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
+ if ((i & 1) == 1) {
+ va[j + offset] = (byte)(va[j + offset] | v);
+ } else {
+ va[j + offset] = (byte)(v << 4); // big endian
+ }
+ }
+ va[m - 1 + offset] = 0; // terminator
+ return offset;
+ }
+
+ protected String unpackValues(int k) {
+ StringBuffer buf = new StringBuffer();
+ byte v = vspace.get(k++);
+ while (v != 0) {
+ char c = (char)((v >>> 4) - 1 + '0');
+ buf.append(c);
+ c = (char)(v & 0x0f);
+ if (c == 0) {
+ break;
+ }
+ c = (char)(c - 1 + '0');
+ buf.append(c);
+ v = vspace.get(k++);
+ }
+ return buf.toString();
+ }
+
+ public void loadSimplePatterns(InputStream stream) throws HyphenationException {
+ SimplePatternParser pp = new SimplePatternParser();
+ ivalues = new TernaryTree();
+
+ pp.parse(stream, this);
+
+ // patterns/values should be now in the tree
+ // let's optimize a bit
+ trimToSize();
+ vspace.trimToSize();
+ classmap.trimToSize();
+
+ // get rid of the auxiliary map
+ ivalues = null;
+ }
+
+
+ public String findPattern(String pat) {
+ int k = super.find(pat);
+ if (k >= 0) {
+ return unpackValues(k);
+ }
+ return "";
+ }
+
+ /**
+ * String compare, returns 0 if equal or
+ * t is a substring of s
+ */
+ protected int hstrcmp(char[] s, int si, char[] t, int ti) {
+ for (; s[si] == t[ti]; si++, ti++) {
+ if (s[si] == 0) {
+ return 0;
+ }
+ }
+ if (t[ti] == 0) {
+ return 0;
+ }
+ return s[si] - t[ti];
+ }
+
+ protected byte[] getValues(int k) {
+ StringBuffer buf = new StringBuffer();
+ byte v = vspace.get(k++);
+ while (v != 0) {
+ char c = (char)((v >>> 4) - 1);
+ buf.append(c);
+ c = (char)(v & 0x0f);
+ if (c == 0) {
+ break;
+ }
+ c = (char)(c - 1);
+ buf.append(c);
+ v = vspace.get(k++);
+ }
+ byte[] res = new byte[buf.length()];
+ for (int i = 0; i < res.length; i++) {
+ res[i] = (byte)buf.charAt(i);
+ }
+ return res;
+ }
+
+ /**
+ * <p>Search for all possible partial matches of word starting
+ * at index an update interletter values. In other words, it
+ * does something like:</p>
+ * <code>
+ * for(i=0; i<patterns.length; i++) {
+ * if ( word.substring(index).startsWidth(patterns[i]) )
+ * update_interletter_values(patterns[i]);
+ * }
+ * </code>
+ * <p>But it is done in an efficient way since the patterns are
+ * stored in a ternary tree. In fact, this is the whole purpose
+ * of having the tree: doing this search without having to test
+ * every single pattern. The number of patterns for languages
+ * such as English range from 4000 to 10000. Thus, doing thousands
+ * of string comparisons for each word to hyphenate would be
+ * really slow without the tree. The tradeoff is memory, but
+ * using a ternary tree instead of a trie, almost halves the
+ * the memory used by Lout or TeX. It's also faster than using
+ * a hash table</p>
+ * @param word null terminated word to match
+ * @param index start index from word
+ * @param il interletter values array to update
+ */
+ protected void searchPatterns(char[] word, int index, byte[] il) {
+ byte[] values;
+ int i = index;
+ char p, q;
+ char sp = word[i];
+ p = root;
+
+ while (p > 0 && p < sc.length) {
+ if (sc[p] == 0xFFFF) {
+ if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
+ values = getValues(eq[p]); // data pointer is in eq[]
+ int j = index;
+ for (int k = 0; k < values.length; k++) {
+ if (j < il.length && values[k] > il[j]) {
+ il[j] = values[k];
+ }
+ j++;
+ }
+ }
+ return;
+ }
+ int d = sp - sc[p];
+ if (d == 0) {
+ if (sp == 0) {
+ break;
+ }
+ sp = word[++i];
+ p = eq[p];
+ q = p;
+
+ // look for a pattern ending at this position by searching for
+ // the null char ( splitchar == 0 )
+ while (q > 0 && q < sc.length) {
+ if (sc[q] == 0xFFFF) { // stop at compressed branch
+ break;
+ }
+ if (sc[q] == 0) {
+ values = getValues(eq[q]);
+ int j = index;
+ for (int k = 0; k < values.length; k++) {
+ if (j < il.length && values[k] > il[j]) {
+ il[j] = values[k];
+ }
+ j++;
+ }
+ break;
+ } else {
+ q = lo[q];
+
+ /**
+ * actually the code should be:
+ * q = sc[q] < 0 ? hi[q] : lo[q];
+ * but java chars are unsigned
+ */
+ }
+ }
+ } else {
+ p = d < 0 ? lo[p] : hi[p];
+ }
+ }
+ }
+
+ /**
+ * Hyphenate word and return a Hyphenation object.
+ * @param word the word to be hyphenated
+ * @param remainCharCount Minimum number of characters allowed
+ * before the hyphenation point.
+ * @param pushCharCount Minimum number of characters allowed after
+ * the hyphenation point.
+ * @return a {@link Hyphenation Hyphenation} object representing
+ * the hyphenated word or null if word is not hyphenated.
+ */
+ public Hyphenation hyphenate(String word, int remainCharCount,
+ int pushCharCount) {
+ char[] w = word.toCharArray();
+ return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
+ }
+
+ /**
+ * w = "****nnllllllnnn*****",
+ * where n is a non-letter, l is a letter,
+ * all n may be absent, the first n is at offset,
+ * the first l is at offset + iIgnoreAtBeginning;
+ * word = ".llllll.'\0'***",
+ * where all l in w are copied into word.
+ * In the first part of the routine len = w.length,
+ * in the second part of the routine len = word.length.
+ * Three indices are used:
+ * index(w), the index in w,
+ * index(word), the index in word,
+ * letterindex(word), the index in the letter part of word.
+ * The following relations exist:
+ * index(w) = offset + i - 1
+ * index(word) = i - iIgnoreAtBeginning
+ * letterindex(word) = index(word) - 1
+ * (see first loop).
+ * It follows that:
+ * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
+ * index(w) = letterindex(word) + offset + iIgnoreAtBeginning
+ */
+
+ /**
+ * Hyphenate word and return an array of hyphenation points.
+ * @param w char array that contains the word
+ * @param offset Offset to first character in word
+ * @param len Length of word
+ * @param remainCharCount Minimum number of characters allowed
+ * before the hyphenation point.
+ * @param pushCharCount Minimum number of characters allowed after
+ * the hyphenation point.
+ * @return a {@link Hyphenation Hyphenation} object representing
+ * the hyphenated word or null if word is not hyphenated.
+ */
+ public Hyphenation hyphenate(char[] w, int offset, int len,
+ int remainCharCount, int pushCharCount) {
+ int i;
+ char[] word = new char[len + 3];
+
+ // normalize word
+ char[] c = new char[2];
+ int iIgnoreAtBeginning = 0;
+ int iLength = len;
+ boolean bEndOfLetters = false;
+ for (i = 1; i <= len; i++) {
+ c[0] = w[offset + i - 1];
+ int nc = classmap.find(c, 0);
+ if (nc < 0) { // found a non-letter character ...
+ if (i == (1 + iIgnoreAtBeginning)) {
+ // ... before any letter character
+ iIgnoreAtBeginning ++;
+ } else {
+ // ... after a letter character
+ bEndOfLetters = true;
+ }
+ iLength --;
+ } else {
+ if (!bEndOfLetters) {
+ word[i - iIgnoreAtBeginning] = (char)nc;
+ } else {
+ return null;
+ }
+ }
+ }
+ len = iLength;
+ if (len < (remainCharCount + pushCharCount)) {
+ // word is too short to be hyphenated
+ return null;
+ }
+ int[] result = new int[len + 1];
+ int k = 0;
+
+ // check exception list first
+ String sw = new String(word, 1, len);
+ if (stoplist.containsKey(sw)) {
+ // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
+ ArrayList hw = (ArrayList)stoplist.get(sw);
+ int j = 0;
+ for (i = 0; i < hw.size(); i++) {
+ Object o = hw.get(i);
+ // j = index(sw) = letterindex(word)?
+ // result[k] = corresponding index(w)
+ if (o instanceof String) {
+ j += ((String)o).length();
+ if (j >= remainCharCount && j < (len - pushCharCount)) {
+ result[k++] = j + iIgnoreAtBeginning;
+ }
+ }
+ }
+ } else {
+ // use algorithm to get hyphenation points
+ word[0] = '.'; // word start marker
+ word[len + 1] = '.'; // word end marker
+ word[len + 2] = 0; // null terminated
+ byte[] il = new byte[len + 3]; // initialized to zero
+ for (i = 0; i < len + 1; i++) {
+ searchPatterns(word, i, il);
+ }
+
+ // hyphenation points are located where interletter value is odd
+ // i is letterindex(word),
+ // i + 1 is index(word),
+ // result[k] = corresponding index(w)
+ for (i = 0; i < len; i++) {
+ if (((il[i + 1] & 1) == 1) && i >= remainCharCount
+ && i <= (len - pushCharCount)) {
+ result[k++] = i + iIgnoreAtBeginning;
+ }
+ }
+ }
+
+
+ if (k > 0) {
+ // trim result array
+ int[] res = new int[k];
+ System.arraycopy(result, 0, res, 0, k);
+ return new Hyphenation(new String(w, offset, len), res);
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Add a character class to the tree. It is used by
+ * {@link SimplePatternParser SimplePatternParser} as callback to
+ * add character classes. Character classes define the
+ * valid word characters for hyphenation. If a word contains
+ * a character not defined in any of the classes, it is not hyphenated.
+ * It also defines a way to normalize the characters in order
+ * to compare them with the stored patterns. Usually pattern
+ * files use only lower case characters, in this case a class
+ * for letter 'a', for example, should be defined as "aA", the first
+ * character being the normalization char.
+ */
+ public void addClass(String chargroup) {
+ if (chargroup.length() > 0) {
+ char equivChar = chargroup.charAt(0);
+ char[] key = new char[2];
+ key[1] = 0;
+ for (int i = 0; i < chargroup.length(); i++) {
+ key[0] = chargroup.charAt(i);
+ classmap.insert(key, 0, equivChar);
+ }
+ }
+ }
+
+ /**
+ * Add an exception to the tree. It is used by
+ * {@link SimplePatternParser SimplePatternParser} class as callback to
+ * store the hyphenation exceptions.
+ * @param word normalized word
+ * @param hyphenatedword a vector of alternating strings and
+ * {@link Hyphen hyphen} objects.
+ */
+ public void addException(String word, ArrayList hyphenatedword) {
+ stoplist.put(word, hyphenatedword);
+ }
+
+ /**
+ * Add a pattern to the tree. Mainly, to be used by
+ * {@link SimplePatternParser SimplePatternParser} class as callback to
+ * add a pattern to the tree.
+ * @param pattern the hyphenation pattern
+ * @param ivalue interletter weight values indicating the
+ * desirability and priority of hyphenating at a given point
+ * within the pattern. It should contain only digit characters.
+ * (i.e. '0' to '9').
+ */
+ public void addPattern(String pattern, String ivalue) {
+ int k = ivalues.find(ivalue);
+ if (k <= 0) {
+ k = packValues(ivalue);
+ ivalues.insert(ivalue, (char)k);
+ }
+ insert(pattern, (char)k);
+ }
+
+ public void printStats() {
+ System.out.println("Value space size = "
+ + Integer.toString(vspace.length()));
+ super.printStats();
+ }
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenator.java b/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenator.java
new file mode 100644
index 0000000..bdd34b9
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/Hyphenator.java
@@ -0,0 +1,240 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.Hashtable;
+
+import com.lowagie.text.pdf.BaseFont;
+
+/**
+ * This class is the main entry point to the hyphenation package.
+ * You can use only the static methods or create an instance.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public class Hyphenator {
+
+ /** TODO: Don't use statics */
+ private static Hashtable hyphenTrees = new Hashtable();
+
+ private HyphenationTree hyphenTree = null;
+ private int remainCharCount = 2;
+ private int pushCharCount = 2;
+ private static boolean errorDump = false;
+ private static final String defaultHyphLocation = "com/lowagie/text/pdf/hyphenation/hyph/";
+
+ /** Holds value of property hyphenDir. */
+ private static String hyphenDir = "";
+
+ /**
+ * @param lang
+ * @param country
+ * @param leftMin
+ * @param rightMin
+ */
+ public Hyphenator(String lang, String country, int leftMin,
+ int rightMin) {
+ hyphenTree = getHyphenationTree(lang, country);
+ remainCharCount = leftMin;
+ pushCharCount = rightMin;
+ }
+
+ /**
+ * @param lang
+ * @param country
+ * @return the hyphenation tree
+ */
+ public static HyphenationTree getHyphenationTree(String lang,
+ String country) {
+ String key = lang;
+ // check whether the country code has been used
+ if (country != null && !country.equals("none")) {
+ key += "_" + country;
+ }
+ // first try to find it in the cache
+ if (hyphenTrees.containsKey(key)) {
+ return (HyphenationTree)hyphenTrees.get(key);
+ }
+ if (hyphenTrees.containsKey(lang)) {
+ return (HyphenationTree)hyphenTrees.get(lang);
+ }
+
+ HyphenationTree hTree = getResourceHyphenationTree(key);
+ if (hTree == null)
+ hTree = getFileHyphenationTree(key);
+ // put it into the pattern cache
+ if (hTree != null) {
+ hyphenTrees.put(key, hTree);
+ }
+ return hTree;
+ }
+
+ /**
+ * @param key
+ * @return a hyphenation tree
+ */
+ public static HyphenationTree getResourceHyphenationTree(String key) {
+ try {
+ InputStream stream = BaseFont.getResourceStream(defaultHyphLocation + key + ".xml");
+ if (stream == null && key.length() > 2)
+ stream = BaseFont.getResourceStream(defaultHyphLocation + key.substring(0, 2) + ".xml");
+ if (stream == null)
+ return null;
+ HyphenationTree hTree = new HyphenationTree();
+ hTree.loadSimplePatterns(stream);
+ return hTree;
+ }
+ catch (Exception e) {
+ return null;
+ }
+ }
+
+ /**
+ * @param key
+ * @return a hyphenation tree
+ */
+ public static HyphenationTree getFileHyphenationTree(String key) {
+ try {
+ if (hyphenDir == null)
+ return null;
+ InputStream stream = null;
+ File hyphenFile = new File(hyphenDir, key + ".xml");
+ if (hyphenFile.canRead())
+ stream = new FileInputStream(hyphenFile);
+ if (stream == null && key.length() > 2) {
+ hyphenFile = new File(hyphenDir, key.substring(0, 2) + ".xml");
+ if (hyphenFile.canRead())
+ stream = new FileInputStream(hyphenFile);
+ }
+ if (stream == null)
+ return null;
+ HyphenationTree hTree = new HyphenationTree();
+ hTree.loadSimplePatterns(stream);
+ return hTree;
+ }
+ catch (Exception e) {
+ return null;
+ }
+ }
+
+ /**
+ * @param lang
+ * @param country
+ * @param word
+ * @param leftMin
+ * @param rightMin
+ * @return a hyphenation object
+ */
+ public static Hyphenation hyphenate(String lang, String country,
+ String word, int leftMin,
+ int rightMin) {
+ HyphenationTree hTree = getHyphenationTree(lang, country);
+ if (hTree == null) {
+ //log.error("Error building hyphenation tree for language "
+ // + lang);
+ return null;
+ }
+ return hTree.hyphenate(word, leftMin, rightMin);
+ }
+
+ /**
+ * @param lang
+ * @param country
+ * @param word
+ * @param offset
+ * @param len
+ * @param leftMin
+ * @param rightMin
+ * @return a hyphenation object
+ */
+ public static Hyphenation hyphenate(String lang, String country,
+ char[] word, int offset, int len,
+ int leftMin, int rightMin) {
+ HyphenationTree hTree = getHyphenationTree(lang, country);
+ if (hTree == null) {
+ //log.error("Error building hyphenation tree for language "
+ // + lang);
+ return null;
+ }
+ return hTree.hyphenate(word, offset, len, leftMin, rightMin);
+ }
+
+ /**
+ * @param min
+ */
+ public void setMinRemainCharCount(int min) {
+ remainCharCount = min;
+ }
+
+ /**
+ * @param min
+ */
+ public void setMinPushCharCount(int min) {
+ pushCharCount = min;
+ }
+
+ /**
+ * @param lang
+ * @param country
+ */
+ public void setLanguage(String lang, String country) {
+ hyphenTree = getHyphenationTree(lang, country);
+ }
+
+ /**
+ * @param word
+ * @param offset
+ * @param len
+ * @return a hyphenation object
+ */
+ public Hyphenation hyphenate(char[] word, int offset, int len) {
+ if (hyphenTree == null) {
+ return null;
+ }
+ return hyphenTree.hyphenate(word, offset, len, remainCharCount,
+ pushCharCount);
+ }
+
+ /**
+ * @param word
+ * @return a hyphenation object
+ */
+ public Hyphenation hyphenate(String word) {
+ if (hyphenTree == null) {
+ return null;
+ }
+ return hyphenTree.hyphenate(word, remainCharCount, pushCharCount);
+ }
+
+ /** Getter for property hyphenDir.
+ * @return Value of property hyphenDir.
+ */
+ public static String getHyphenDir() {
+ return hyphenDir;
+ }
+
+ /** Setter for property hyphenDir.
+ * @param _hyphenDir New value of property hyphenDir.
+ */
+ public static void setHyphenDir(String _hyphenDir) {
+ hyphenDir = _hyphenDir;
+ }
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/PatternConsumer.java b/src/main/java/com/lowagie/text/pdf/hyphenation/PatternConsumer.java
new file mode 100644
index 0000000..d7c6b63
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/PatternConsumer.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.util.ArrayList;
+
+/**
+ * This interface is used to connect the XML pattern file parser to
+ * the hyphenation tree.
+ *
+ * @author Carlos Villegas <cav@uniscope.co.jp>
+ */
+public interface PatternConsumer {
+
+ /**
+ * Add a character class.
+ * A character class defines characters that are considered
+ * equivalent for the purpose of hyphenation (e.g. "aA"). It
+ * usually means to ignore case.
+ * @param chargroup character group
+ */
+ void addClass(String chargroup);
+
+ /**
+ * Add a hyphenation exception. An exception replaces the
+ * result obtained by the algorithm for cases for which this
+ * fails or the user wants to provide his own hyphenation.
+ * A hyphenatedword is a vector of alternating String's and
+ * {@link Hyphen Hyphen} instances
+ */
+ void addException(String word, ArrayList hyphenatedword);
+
+ /**
+ * Add hyphenation patterns.
+ * @param pattern the pattern
+ * @param values interletter values expressed as a string of
+ * digit characters.
+ */
+ void addPattern(String pattern, String values);
+
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/SimplePatternParser.java b/src/main/java/com/lowagie/text/pdf/hyphenation/SimplePatternParser.java
new file mode 100644
index 0000000..ade8885
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/SimplePatternParser.java
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2005 by Paulo Soares.
+ *
+ * The contents of this file are subject to the Mozilla Public License Version 1.1
+ * (the "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the License.
+ *
+ * The Original Code is 'iText, a free JAVA-PDF library'.
+ *
+ * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
+ * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
+ * All Rights Reserved.
+ * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
+ * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
+ *
+ * Contributor(s): all the names of the contributors are added in the source code
+ * where applicable.
+ *
+ * Alternatively, the contents of this file may be used under the terms of the
+ * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
+ * provisions of LGPL are applicable instead of those above. If you wish to
+ * allow use of your version of this file only under the terms of the LGPL
+ * License and not to allow others to use your version of this file under
+ * the MPL, indicate your decision by deleting the provisions above and
+ * replace them with the notice and other provisions required by the LGPL.
+ * If you do not delete the provisions above, a recipient may use your version
+ * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the MPL as stated above or under the terms of the GNU
+ * Library General Public License as published by the Free Software Foundation;
+ * either version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
+ * details.
+ *
+ * If you didn't download this code from the following link, you should check if
+ * you aren't using an obsolete version:
+ * http://www.lowagie.com/iText/
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import com.lowagie.text.pdf.*;
+import com.lowagie.text.ExceptionConverter;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.FileInputStream;
+
+/** Parses the xml hyphenation pattern.
+ *
+ * @author Paulo Soares (psoares@consiste.pt)
+ */
+public class SimplePatternParser implements SimpleXMLDocHandler, PatternConsumer {
+ int currElement;
+ PatternConsumer consumer;
+ StringBuffer token;
+ ArrayList exception;
+ char hyphenChar;
+ SimpleXMLParser parser;
+
+ static final int ELEM_CLASSES = 1;
+ static final int ELEM_EXCEPTIONS = 2;
+ static final int ELEM_PATTERNS = 3;
+ static final int ELEM_HYPHEN = 4;
+
+ /** Creates a new instance of PatternParser2 */
+ public SimplePatternParser() {
+ token = new StringBuffer();
+ hyphenChar = '-'; // default
+ }
+
+ public void parse(InputStream stream, PatternConsumer consumer) {
+ this.consumer = consumer;
+ try {
+ SimpleXMLParser.parse(this, stream);
+ }
+ catch (IOException e) {
+ throw new ExceptionConverter(e);
+ }
+ finally {
+ try{stream.close();}catch(Exception e){}
+ }
+ }
+
+ protected static String getPattern(String word) {
+ StringBuffer pat = new StringBuffer();
+ int len = word.length();
+ for (int i = 0; i < len; i++) {
+ if (!Character.isDigit(word.charAt(i))) {
+ pat.append(word.charAt(i));
+ }
+ }
+ return pat.toString();
+ }
+
+ protected ArrayList normalizeException(ArrayList ex) {
+ ArrayList res = new ArrayList();
+ for (int i = 0; i < ex.size(); i++) {
+ Object item = ex.get(i);
+ if (item instanceof String) {
+ String str = (String)item;
+ StringBuffer buf = new StringBuffer();
+ for (int j = 0; j < str.length(); j++) {
+ char c = str.charAt(j);
+ if (c != hyphenChar) {
+ buf.append(c);
+ } else {
+ res.add(buf.toString());
+ buf.setLength(0);
+ char[] h = new char[1];
+ h[0] = hyphenChar;
+ // we use here hyphenChar which is not necessarily
+ // the one to be printed
+ res.add(new Hyphen(new String(h), null, null));
+ }
+ }
+ if (buf.length() > 0) {
+ res.add(buf.toString());
+ }
+ } else {
+ res.add(item);
+ }
+ }
+ return res;
+ }
+
+ protected String getExceptionWord(ArrayList ex) {
+ StringBuffer res = new StringBuffer();
+ for (int i = 0; i < ex.size(); i++) {
+ Object item = ex.get(i);
+ if (item instanceof String) {
+ res.append((String)item);
+ } else {
+ if (((Hyphen)item).noBreak != null) {
+ res.append(((Hyphen)item).noBreak);
+ }
+ }
+ }
+ return res.toString();
+ }
+
+ protected static String getInterletterValues(String pat) {
+ StringBuffer il = new StringBuffer();
+ String word = pat + "a"; // add dummy letter to serve as sentinel
+ int len = word.length();
+ for (int i = 0; i < len; i++) {
+ char c = word.charAt(i);
+ if (Character.isDigit(c)) {
+ il.append(c);
+ i++;
+ } else {
+ il.append('0');
+ }
+ }
+ return il.toString();
+ }
+
+ public void endDocument() {
+ }
+
+ public void endElement(String tag) {
+ if (token.length() > 0) {
+ String word = token.toString();
+ switch (currElement) {
+ case ELEM_CLASSES:
+ consumer.addClass(word);
+ break;
+ case ELEM_EXCEPTIONS:
+ exception.add(word);
+ exception = normalizeException(exception);
+ consumer.addException(getExceptionWord(exception),
+ (ArrayList)exception.clone());
+ break;
+ case ELEM_PATTERNS:
+ consumer.addPattern(getPattern(word),
+ getInterletterValues(word));
+ break;
+ case ELEM_HYPHEN:
+ // nothing to do
+ break;
+ }
+ if (currElement != ELEM_HYPHEN) {
+ token.setLength(0);
+ }
+ }
+ if (currElement == ELEM_HYPHEN) {
+ currElement = ELEM_EXCEPTIONS;
+ } else {
+ currElement = 0;
+ }
+ }
+
+ public void startDocument() {
+ }
+
+ public void startElement(String tag, java.util.HashMap h) {
+ if (tag.equals("hyphen-char")) {
+ String hh = (String)h.get("value");
+ if (hh != null && hh.length() == 1) {
+ hyphenChar = hh.charAt(0);
+ }
+ } else if (tag.equals("classes")) {
+ currElement = ELEM_CLASSES;
+ } else if (tag.equals("patterns")) {
+ currElement = ELEM_PATTERNS;
+ } else if (tag.equals("exceptions")) {
+ currElement = ELEM_EXCEPTIONS;
+ exception = new ArrayList();
+ } else if (tag.equals("hyphen")) {
+ if (token.length() > 0) {
+ exception.add(token.toString());
+ }
+ exception.add(new Hyphen((String)h.get("pre"),
+ (String)h.get("no"),
+ (String)h.get("post")));
+ currElement = ELEM_HYPHEN;
+ }
+ token.setLength(0);
+ }
+
+ public void text(String str) {
+ StringTokenizer tk = new StringTokenizer(str);
+ while (tk.hasMoreTokens()) {
+ String word = tk.nextToken();
+ // System.out.println("\"" + word + "\"");
+ switch (currElement) {
+ case ELEM_CLASSES:
+ consumer.addClass(word);
+ break;
+ case ELEM_EXCEPTIONS:
+ exception.add(word);
+ exception = normalizeException(exception);
+ consumer.addException(getExceptionWord(exception),
+ (ArrayList)exception.clone());
+ exception.clear();
+ break;
+ case ELEM_PATTERNS:
+ consumer.addPattern(getPattern(word),
+ getInterletterValues(word));
+ break;
+ }
+ }
+ }
+
+ // PatternConsumer implementation for testing purposes
+ public void addClass(String c) {
+ System.out.println("class: " + c);
+ }
+
+ public void addException(String w, ArrayList e) {
+ System.out.println("exception: " + w + " : " + e.toString());
+ }
+
+ public void addPattern(String p, String v) {
+ System.out.println("pattern: " + p + " : " + v);
+ }
+
+ public static void main(String[] args) throws Exception {
+ try {
+ if (args.length > 0) {
+ SimplePatternParser pp = new SimplePatternParser();
+ pp.parse(new FileInputStream(args[0]), pp);
+ }
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/src/main/java/com/lowagie/text/pdf/hyphenation/TernaryTree.java b/src/main/java/com/lowagie/text/pdf/hyphenation/TernaryTree.java
new file mode 100644
index 0000000..0dc16e5
--- /dev/null
+++ b/src/main/java/com/lowagie/text/pdf/hyphenation/TernaryTree.java
@@ -0,0 +1,667 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.lowagie.text.pdf.hyphenation;
+
+import java.util.Enumeration;
+import java.util.Stack;
+import java.io.Serializable;
+
+/**
+ * <h2>Ternary Search Tree.</h2>
+ *
+ * <p>A ternary search tree is a hibrid between a binary tree and
+ * a digital search tree (trie). Keys are limited to strings.
+ * A data value of type char is stored in each leaf node.
+ * It can be used as an index (or pointer) to the data.
+ * Branches that only contain one key are compressed to one node
+ * by storing a pointer to the trailer substring of the key.
+ * This class is intended to serve as base class or helper class
+ * to implement Dictionary collections or the like. Ternary trees
+ * have some nice properties as the following: the tree can be
+ * traversed in sorted order, partial matches (wildcard) can be
+ * implemented, retrieval of all keys within a given distance
+ * from the target, etc. The storage requirements are higher than
+ * a binary tree but a lot less than a trie. Performance is
+ * comparable with a hash table, sometimes it outperforms a hash
+ * function (most of the time can determine a miss faster than a hash).</p>
+ *
+ * <p>The main purpose of this java port is to serve as a base for
+ * implementing TeX's hyphenation algorithm (see The TeXBook,
+ * appendix H). Each language requires from 5000 to 15000 hyphenation
+ * patterns which will be keys in this tree. The strings patterns
+ * are usually small (from 2 to 5 characters), but each char in the
+ * tree is stored in a node. Thus memory usage is the main concern.
+ * We will sacrify 'elegance' to keep memory requirenments to the
+ * minimum. Using java's char type as pointer (yes, I know pointer
+ * it is a forbidden word in java) we can keep the size of the node
+ * to be just 8 bytes (3 pointers and the data char). This gives
+ * room for about 65000 nodes. In my tests the english patterns
+ * took 7694 nodes and the german patterns 10055 nodes,
+ * so I think we are safe.</p>
+ *
+ * <p>All said, this is a map with strings as keys and char as value.
+ * Pretty limited!. It can be extended to a general map by
+ * using the string representation of an object and using the
+ * char value as an index to an array that contains the object
+ * values.</p>
+ *
+ * @author cav@uniscope.co.jp
+ */
+
+public class TernaryTree implements Cloneable, Serializable {
+
+ /**
+ * We use 4 arrays to represent a node. I guess I should have created
+ * a proper node class, but somehow Knuth's pascal code made me forget
+ * we now have a portable language with virtual memory management and
+ * automatic garbage collection! And now is kind of late, furthermore,
+ * if it ain't broken, don't fix it.
+ */
+
+ /**
+ * Pointer to low branch and to rest of the key when it is
+ * stored directly in this node, we don't have unions in java!
+ */
+ protected char[] lo;
+
+ /**
+ * Pointer to high branch.
+ */
+ protected char[] hi;
+
+ /**
+ * Pointer to equal branch and to data when this node is a string terminator.
+ */
+ protected char[] eq;
+
+ /**
+ * <P>The character stored in this node: splitchar.
+ * Two special values are reserved:</P>
+ * <ul><li>0x0000 as string terminator</li>
+ * <li>0xFFFF to indicate that the branch starting at
+ * this node is compressed</li></ul>
+ * <p>This shouldn't be a problem if we give the usual semantics to
+ * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
+ */
+ protected char[] sc;
+
+ /**
+ * This vector holds the trailing of the keys when the branch is compressed.
+ */
+ protected CharVector kv;
+
+ protected char root;
+ protected char freenode;
+ protected int length; // number of items in tree
+
+ protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
+
+ TernaryTree() {
+ init();
+ }
+
+ protected void init() {
+ root = 0;
+ freenode = 1;
+ length = 0;
+ lo = new char[BLOCK_SIZE];
+ hi = new char[BLOCK_SIZE];
+ eq = new char[BLOCK_SIZE];
+ sc = new char[BLOCK_SIZE];
+ kv = new CharVector();
+ }
+
+ /**
+ * Branches are initially compressed, needing
+ * one node per key plus the size of the string
+ * key. They are decompressed as needed when
+ * another key with same prefix
+ * is inserted. This saves a lot of space,
+ * specially for long keys.
+ */
+ public void insert(String key, char val) {
+ // make sure we have enough room in the arrays
+ int len = key.length()
+ + 1; // maximum number of nodes that may be generated
+ if (freenode + len > eq.length) {
+ redimNodeArrays(eq.length + BLOCK_SIZE);
+ }
+ char strkey[] = new char[len--];
+ key.getChars(0, len, strkey, 0);
+ strkey[len] = 0;
+ root = insert(root, strkey, 0, val);
+ }
+
+ public void insert(char[] key, int start, char val) {
+ int len = strlen(key) + 1;
+ if (freenode + len > eq.length) {
+ redimNodeArrays(eq.length + BLOCK_SIZE);
+ }
+ root = insert(root, key, start, val);
+ }
+
+ /**
+ * The actual insertion function, recursive version.
+ */
+ private char insert(char p, char[] key, int start, char val) {
+ int len = strlen(key, start);
+ if (p == 0) {
+ // this means there is no branch, this node will start a new branch.
+ // Instead of doing that, we store the key somewhere else and create
+ // only one node with a pointer to the key
+ p = freenode++;
+ eq[p] = val; // holds data
+ length++;
+ hi[p] = 0;
+ if (len > 0) {
+ sc[p] = 0xFFFF; // indicates branch is compressed
+ lo[p] = (char)kv.alloc(len
+ + 1); // use 'lo' to hold pointer to key
+ strcpy(kv.getArray(), lo[p], key, start);
+ } else {
+ sc[p] = 0;
+ lo[p] = 0;
+ }
+ return p;
+ }
+
+ if (sc[p] == 0xFFFF) {
+ // branch is compressed: need to decompress
+ // this will generate garbage in the external key array
+ // but we can do some garbage collection later
+ char pp = freenode++;
+ lo[pp] = lo[p]; // previous pointer to key
+ eq[pp] = eq[p]; // previous pointer to data
+ lo[p] = 0;
+ if (len > 0) {
+ sc[p] = kv.get(lo[pp]);
+ eq[p] = pp;
+ lo[pp]++;
+ if (kv.get(lo[pp]) == 0) {
+ // key completly decompressed leaving garbage in key array
+ lo[pp] = 0;
+ sc[pp] = 0;
+ hi[pp] = 0;
+ } else {
+ // we only got first char of key, rest is still there
+ sc[pp] = 0xFFFF;
+ }
+ } else {
+ // In this case we can save a node by swapping the new node
+ // with the compressed node
+ sc[pp] = 0xFFFF;
+ hi[p] = pp;
+ sc[p] = 0;
+ eq[p] = val;
+ length++;
+ return p;
+ }
+ }
+ char s = key[start];
+ if (s < sc[p]) {
+ lo[p] = insert(lo[p], key, start, val);
+ } else if (s == sc[p]) {
+ if (s != 0) {
+ eq[p] = insert(eq[p], key, start + 1, val);
+ } else {
+ // key already in tree, overwrite data
+ eq[p] = val;
+ }
+ } else {
+ hi[p] = insert(hi[p], key, start, val);
+ }
+ return p;
+ }
+
+ /**
+ * Compares 2 null terminated char arrays
+ */
+ public static int strcmp(char[] a, int startA, char[] b, int startB) {
+ for (; a[startA] == b[startB]; startA++, startB++) {
+ if (a[startA] == 0) {
+ return 0;
+ }
+ }
+ return a[startA] - b[startB];
+ }
+
+ /**
+ * Compares a string with null terminated char array
+ */
+ public static int strcmp(String str, char[] a, int start) {
+ int i, d, len = str.length();
+ for (i = 0; i < len; i++) {
+ d = (int)str.charAt(i) - a[start + i];
+ if (d != 0) {
+ return d;
+ }
+ if (a[start + i] == 0) {
+ return d;
+ }
+ }
+ if (a[start + i] != 0) {
+ return (int)-a[start + i];
+ }
+ return 0;
+
+ }
+
+ public static void strcpy(char[] dst, int di, char[] src, int si) {
+ while (src[si] != 0) {
+ dst[di++] = src[si++];
+ }
+ dst[di] = 0;
+ }
+
+ public static int strlen(char[] a, int start) {
+ int len = 0;
+ for (int i = start; i < a.length && a[i] != 0; i++) {
+ len++;
+ }
+ return len;
+ }
+
+ public static int strlen(char[] a) {
+ return strlen(a, 0);
+ }
+
+ public int find(String key) {
+ int len = key.length();
+ char strkey[] = new char[len + 1];
+ key.getChars(0, len, strkey, 0);
+ strkey[len] = 0;
+
+ return find(strkey, 0);
+ }
+
+ public int find(char[] key, int start) {
+ int d;
+ char p = root;
+ int i = start;
+ char c;
+
+ while (p != 0) {
+ if (sc[p] == 0xFFFF) {
+ if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
+ return eq[p];
+ } else {
+ return -1;
+ }
+ }
+ c = key[i];
+ d = c - sc[p];
+ if (d == 0) {
+ if (c == 0) {
+ return eq[p];
+ }
+ i++;
+ p = eq[p];
+ } else if (d < 0) {
+ p = lo[p];
+ } else {
+ p = hi[p];
+ }
+ }
+ return -1;
+ }
+
+ public boolean knows(String key) {
+ return (find(key) >= 0);
+ }
+
+ // redimension the arrays
+ private void redimNodeArrays(int newsize) {
+ int len = newsize < lo.length ? newsize : lo.length;
+ char[] na = new char[newsize];
+ System.arraycopy(lo, 0, na, 0, len);
+ lo = na;
+ na = new char[newsize];
+ System.arraycopy(hi, 0, na, 0, len);
+ hi = na;
+ na = new char[newsize];
+ System.arraycopy(eq, 0, na, 0, len);
+ eq = na;
+ na = new char[newsize];
+ System.arraycopy(sc, 0, na, 0, len);
+ sc = na;
+ }
+
+ public int size() {
+ return length;
+ }
+
+ public Object clone() {
+ TernaryTree t = new TernaryTree();
+ t.lo = (char[])this.lo.clone();
+ t.hi = (char[])this.hi.clone();
+ t.eq = (char[])this.eq.clone();
+ t.sc = (char[])this.sc.clone();
+ t.kv = (CharVector)this.kv.clone();
+ t.root = this.root;
+ t.freenode = this.freenode;
+ t.length = this.length;
+
+ return t;
+ }
+
+ /**
+ * Recursively insert the median first and then the median of the
+ * lower and upper halves, and so on in order to get a balanced
+ * tree. The array of keys is assumed to be sorted in ascending
+ * order.
+ */
+ protected void insertBalanced(String[] k, char[] v, int offset, int n) {
+ int m;
+ if (n < 1) {
+ return;
+ }
+ m = n >> 1;
+
+ insert(k[m + offset], v[m + offset]);
+ insertBalanced(k, v, offset, m);
+
+ insertBalanced(k, v, offset + m + 1, n - m - 1);
+ }
+
+
+ /**
+ * Balance the tree for best search performance
+ */
+ public void balance() {
+ // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
+
+ int i = 0, n = length;
+ String[] k = new String[n];
+ char[] v = new char[n];
+ Iterator iter = new Iterator();
+ while (iter.hasMoreElements()) {
+ v[i] = iter.getValue();
+ k[i++] = (String)iter.nextElement();
+ }
+ init();
+ insertBalanced(k, v, 0, n);
+
+ // With uniform letter distribution sc[root] should be around 'm'
+ // System.out.print("After root splitchar = "); System.out.println(sc[root]);
+ }
+
+ /**
+ * Each node stores a character (splitchar) which is part of
+ * some key(s). In a compressed branch (one that only contain
+ * a single string key) the trailer of the key which is not
+ * already in nodes is stored externally in the kv array.
+ * As items are inserted, key substrings decrease.
+ * Some substrings may completely disappear when the whole
+ * branch is totally decompressed.
+ * The tree is traversed to find the key substrings actually
+ * used. In addition, duplicate substrings are removed using
+ * a map (implemented with a TernaryTree!).
+ *
+ */
+ public void trimToSize() {
+ // first balance the tree for best performance
+ balance();
+
+ // redimension the node arrays
+ redimNodeArrays(freenode);
+
+ // ok, compact kv array
+ CharVector kx = new CharVector();
+ kx.alloc(1);
+ TernaryTree map = new TernaryTree();
+ compact(kx, map, root);
+ kv = kx;
+ kv.trimToSize();
+ }
+
+ private void compact(CharVector kx, TernaryTree map, char p) {
+ int k;
+ if (p == 0) {
+ return;
+ }
+ if (sc[p] == 0xFFFF) {
+ k = map.find(kv.getArray(), lo[p]);
+ if (k < 0) {
+ k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
+ strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
+ map.insert(kx.getArray(), k, (char)k);
+ }
+ lo[p] = (char)k;
+ } else {
+ compact(kx, map, lo[p]);
+ if (sc[p] != 0) {
+ compact(kx, map, eq[p]);
+ }
+ compact(kx, map, hi[p]);
+ }
+ }
+
+
+ public Enumeration keys() {
+ return new Iterator();
+ }
+
+ public class Iterator implements Enumeration {
+
+ /**
+ * current node index
+ */
+ int cur;
+
+ /**
+ * current key
+ */
+ String curkey;
+
+ private class Item implements Cloneable {
+ char parent;
+ char child;
+
+ public Item() {
+ parent = 0;
+ child = 0;
+ }
+
+ public Item(char p, char c) {
+ parent = p;
+ child = c;
+ }
+
+ public Object clone() {
+ return new Item(parent, child);
+ }
+
+ }
+
+ /**
+ * Node stack
+ */
+ Stack ns;
+
+ /**
+ * key stack implemented with a StringBuffer
+ */
+ StringBuffer ks;
+
+ public Iterator() {
+ cur = -1;
+ ns = new Stack();
+ ks = new StringBuffer();
+ rewind();
+ }
+
+ public void rewind() {
+ ns.removeAllElements();
+ ks.setLength(0);
+ cur = root;
+ run();
+ }
+
+ public Object nextElement() {
+ String res = new String(curkey);
+ cur = up();
+ run();
+ return res;
+ }
+
+ public char getValue() {
+ if (cur >= 0) {
+ return eq[cur];
+ }
+ return 0;
+ }
+
+ public boolean hasMoreElements() {
+ return (cur != -1);
+ }
+
+ /**
+ * traverse upwards
+ */
+ private int up() {
+ Item i = new Item();
+ int res = 0;
+
+ if (ns.empty()) {
+ return -1;
+ }
+
+ if (cur != 0 && sc[cur] == 0) {
+ return lo[cur];
+ }
+
+ boolean climb = true;
+
+ while (climb) {
+ i = (Item)ns.pop();
+ i.child++;
+ switch (i.child) {
+ case 1:
+ if (sc[i.parent] != 0) {
+ res = eq[i.parent];
+ ns.push(i.clone());
+ ks.append(sc[i.parent]);
+ } else {
+ i.child++;
+ ns.push(i.clone());
+ res = hi[i.parent];
+ }
+ climb = false;
+ break;
+
+ case 2:
+ res = hi[i.parent];
+ ns.push(i.clone());
+ if (ks.length() > 0) {
+ ks.setLength(ks.length() - 1); // pop
+ }
+ climb = false;
+ break;
+
+ default:
+ if (ns.empty()) {
+ return -1;
+ }
+ climb = true;
+ break;
+ }
+ }
+ return res;
+ }
+
+ /**
+ * traverse the tree to find next key
+ */
+ private int run() {
+ if (cur == -1) {
+ return -1;
+ }
+
+ boolean leaf = false;
+ while (true) {
+ // first go down on low branch until leaf or compressed branch
+ while (cur != 0) {
+ if (sc[cur] == 0xFFFF) {
+ leaf = true;
+ break;
+ }
+ ns.push(new Item((char)cur, '\u0000'));
+ if (sc[cur] == 0) {
+ leaf = true;
+ break;
+ }
+ cur = lo[cur];
+ }
+ if (leaf) {
+ break;
+ }
+ // nothing found, go up one node and try again
+ cur = up();
+ if (cur == -1) {
+ return -1;
+ }
+ }
+ // The current node should be a data node and
+ // the key should be in the key stack (at least partially)
+ StringBuffer buf = new StringBuffer(ks.toString());
+ if (sc[cur] == 0xFFFF) {
+ int p = lo[cur];
+ while (kv.get(p) != 0) {
+ buf.append(kv.get(p++));
+ }
+ }
+ curkey = buf.toString();
+ return 0;
+ }
+
+ }
+
+ public void printStats() {
+ System.out.println("Number of keys = " + Integer.toString(length));
+ System.out.println("Node count = " + Integer.toString(freenode));
+ // System.out.println("Array length = " + Integer.toString(eq.length));
+ System.out.println("Key Array length = "
+ + Integer.toString(kv.length()));
+
+ /*
+ * for(int i=0; i<kv.length(); i++)
+ * if ( kv.get(i) != 0 )
+ * System.out.print(kv.get(i));
+ * else
+ * System.out.println("");
+ * System.out.println("Keys:");
+ * for(Enumeration enum = keys(); enum.hasMoreElements(); )
+ * System.out.println(enum.nextElement());
+ */
+
+ }
+
+/* public static void main(String[] args) throws Exception {
+ TernaryTree tt = new TernaryTree();
+ tt.insert("Carlos", 'C');
+ tt.insert("Car", 'r');
+ tt.insert("palos", 'l');
+ tt.insert("pa", 'p');
+ tt.trimToSize();
+ System.out.println((char)tt.find("Car"));
+ System.out.println((char)tt.find("Carlos"));
+ System.out.println((char)tt.find("alto"));
+ tt.printStats();
+ }*/
+
+}
+