aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java')
-rw-r--r-- pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java 956
1 files changed, 956 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
new file mode 100644
index 0000000..fd59d34
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
@@ -0,0 +1,956 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: AbsoluteTextSignature.java,v 1.1 2006/10/31 08:08:33 wprinz Exp $
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+
+import org.apache.log4j.Logger;
+
+import at.knowcenter.wag.egov.egiz.PdfAS;
+import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
+import at.knowcenter.wag.egov.egiz.exceptions.SignatureException;
+import at.knowcenter.wag.egov.egiz.exceptions.SignatureTypesException;
+import at.knowcenter.wag.egov.egiz.framework.FoundBlock;
+import at.knowcenter.wag.egov.egiz.framework.FoundKey;
+import at.knowcenter.wag.egov.egiz.sig.SignatureObject;
+import at.knowcenter.wag.egov.egiz.sig.SignatureTypeDefinition;
+import at.knowcenter.wag.egov.egiz.sig.SignatureTypes;
+
+/**
+ * Contains methods and helpers that implement the absolute text signature.
+ *
+ * @author wprinz
+ */
+public class AbsoluteTextSignature
+{
+
+ /**
+  * The class-wide log4j logger, obtained through {@link ConfigLogger}. All
+  * diagnostic output of the static helpers in this class goes through it.
+  */
+ private static final Logger logger = ConfigLogger.getLogger(AbsoluteTextSignature.class);
+
+ /**
+  * Collects the signature profiles that may be used for text analysis.
+  *
+  * <p>
+  * Every SignatureTypeDefinition known to {@link SignatureTypes} is inspected
+  * and only those reporting {@code isTextExtractable()} are kept. Each
+  * rejected profile is mentioned in the debug log.
+  * </p>
+  *
+  * @return Returns a newly created List of the text extractable
+  *         SignatureTypeDefinitions, in the order in which SignatureTypes
+  *         delivered them. The List is empty if no profile qualifies.
+  * @throws SignatureTypesException
+  *           Forwarded from the SignatureTypes access.
+  */
+ public static List getSignatureTypesForTextAnalysis() throws SignatureTypesException
+ {
+   List all_definitions = SignatureTypes.getInstance().getSignatureTypeDefinitions();
+   List extractable_definitions = new ArrayList(all_definitions.size());
+
+   for (Iterator iter = all_definitions.iterator(); iter.hasNext();)
+   {
+     SignatureTypeDefinition definition = (SignatureTypeDefinition) iter.next();
+     if (definition.isTextExtractable())
+     {
+       extractable_definitions.add(definition);
+     }
+     else
+     {
+       logger.debug("The profile " + definition.getType() + " is not text extractable and is thereby not used for text analysis.");
+     }
+   }
+
+   return extractable_definitions;
+ }
+
+ /**
+  * Peels all signature holders off the given text, newest first.
+  *
+  * <p>
+  * The latest signature holder is extracted from the text. Its signed text
+  * (the text without that signature block) is then searched for the next
+  * latest holder, and so on until {@link #extractLatestBlock(String)} reports
+  * that nothing more can be found.
+  * </p>
+  *
+  * @param text
+  *          The text to be analyzed.
+  * @return Returns the List of extracted TextualSignatureHolder objects. Each
+  *         newly found holder is inserted at the front, so the holder found
+  *         first (the latest one) ends up last. The List is empty if no
+  *         signature holder was found.
+  * @throws SignatureException
+  *           Forwarded from the block extraction.
+  * @throws SignatureTypesException
+  *           Forwarded from the block extraction.
+  */
+ public static List extractSignatureHoldersFromText(String text) throws SignatureException, SignatureTypesException
+ {
+   List extracted = new ArrayList();
+
+   String remaining_text = text;
+   TextualSignatureHolder holder = extractLatestBlock(remaining_text);
+   while (holder != null)
+   {
+     // Prepend: holders are discovered newest first but returned oldest first.
+     extracted.add(0, holder);
+
+     remaining_text = holder.getSignedText();
+     holder = extractLatestBlock(remaining_text);
+   }
+
+   return extracted;
+ }
+
+ /**
+  * Extracts the latest signature block from the given text and wraps it into
+  * a TextualSignatureHolder.
+  *
+  * <p>
+  * The holder is built from the text with the block cut out (see
+  * {@link #cutOutBlock(String, FoundBlock)}) and the SignatureObject created
+  * from the block's values (see
+  * {@link #createSignatureObjectFromFoundBlock(String, FoundBlock)}).
+  * </p>
+  *
+  * @param text
+  *          The text to be analyzed.
+  * @return Returns the TextualSignatureHolder for the latest block, or null
+  *         if {@link #findLatestBlock(String)} found no block.
+  * @throws SignatureException
+  *           Forwarded from the block search or the object creation.
+  * @throws SignatureTypesException
+  *           Forwarded from the block search or the object creation.
+  */
+ public static TextualSignatureHolder extractLatestBlock(String text) throws SignatureException, SignatureTypesException
+ {
+   FoundBlock block = findLatestBlock(text);
+   if (block != null)
+   {
+     // Arguments are evaluated left to right: first the text is cut, then the
+     // signature object is created from the uncut text.
+     return new TextualSignatureHolder(cutOutBlock(text, block), createSignatureObjectFromFoundBlock(text, block));
+   }
+   return null;
+ }
+
+ /**
+  * Finds the latest signature block for a given text.
+  *
+  * <p>
+  * The latest block is the one with the highest, most recent date. Usually
+  * this block will be extracted (cut out) of the text which will result in the
+  * originally signed text of this signature to be verified using the cut out
+  * data.
+  * </p>
+  * <p>
+  * The search works in these steps:
+  * </p>
+  * <ol>
+  * <li>collect the potential blocks of every text extractable profile,</li>
+  * <li>drop every candidate whose date cannot be read/parsed or whose
+  * integrity check fails (the failure is only logged),</li>
+  * <li>sort the remaining candidates by date and keep those sharing the
+  * latest date,</li>
+  * <li>let {@link #chooseMostPossibleBlock(List)} pick the best of them. This
+  * choice replaced the former semantic equality check of the latest blocks.</li>
+  * </ol>
+  *
+  * @param text
+  *          The text to be analyzed.
+  * @return Returns the latest found block or null, if there was none. Null is
+  *         also returned when candidates were found but none of them passed
+  *         the integrity check.
+  * @throws SignatureException
+  *           Thrown if no unambiguous best block can be chosen among the
+  *           latest blocks.
+  * @throws SignatureTypesException
+  *           Forwarded from the signature type lookup.
+  */
+ public static FoundBlock findLatestBlock(String text) throws SignatureException, SignatureTypesException
+ {
+   List signature_types = getSignatureTypesForTextAnalysis();
+
+   List found_potential_candidates = new ArrayList();
+   for (int i = 0; i < signature_types.size(); i++)
+   {
+     SignatureTypeDefinition block_type = (SignatureTypeDefinition) signature_types.get(i);
+     found_potential_candidates.addAll(findPotentialSignaturesForProfile(text, block_type));
+   }
+
+   if (found_potential_candidates.isEmpty())
+   {
+     logger.debug("no candidates found at all");
+     return null;
+   }
+
+   List found_candidates = new ArrayList();
+   logger.debug("checking block integrity");
+   for (int i = 0; i < found_potential_candidates.size(); i++)
+   {
+     FoundBlock found_block = (FoundBlock) found_potential_candidates.get(i);
+     try
+     {
+       // Reading the date is part of the validation: a block without a
+       // readable date must be ignored like any other broken block instead
+       // of aborting the whole search.
+       String date_value = getDateValue(text, found_block);
+       EGIZDate date = EGIZDate.parseFromString(date_value);
+
+       logger.debug("found_block = " + date + " - " + found_block);
+
+       checkBlockIntegrity(text, found_block);
+       found_candidates.add(found_block);
+     }
+     catch (Exception e)
+     {
+       // Deliberately broad: any failure only disqualifies this candidate.
+       logger.debug("Exception while checking the integrity of the found block " + found_block + ". Ignoring this block.", e);
+     }
+   }
+
+   if (found_candidates.isEmpty())
+   {
+     // Without this guard the date filtering below would access index -1.
+     logger.debug("no candidate passed the integrity check");
+     return null;
+   }
+
+   sortFoundBlocksByDate(text, found_candidates);
+   logBlocksWithDates("sorted blocks:", text, found_candidates);
+
+   List latest_blocks = filterLastDateEqualBlocks(text, found_candidates);
+   logBlocksWithDates("latest blocks:", text, latest_blocks);
+
+   FoundBlock latest_block = chooseMostPossibleBlock(latest_blocks);
+
+   logger.debug("latest block = " + latest_block);
+   return latest_block;
+ }
+
+ /**
+  * Writes the headline and one line per block (position, parsed date and
+  * block) to the debug log. Does nothing if debug logging is disabled.
+  *
+  * @param headline
+  *          The line logged before the blocks.
+  * @param text
+  *          The text the block dates are read from.
+  * @param blocks
+  *          The List of FoundBlock objects to be logged.
+  */
+ private static void logBlocksWithDates(String headline, String text, List blocks)
+ {
+   if (!logger.isDebugEnabled())
+   {
+     return;
+   }
+
+   logger.debug(headline);
+   for (int i = 0; i < blocks.size(); i++)
+   {
+     FoundBlock found_block = (FoundBlock) blocks.get(i);
+     EGIZDate date = getDateFromFoundBlock(text, found_block);
+
+     logger.debug(" #" + i + ": " + date + " - " + found_block);
+   }
+ }
+
+ /**
+  * Finds the List of potential blocks within the given text for the given
+  * profile.
+  *
+  * <p>
+  * The search is anchored at the last caption of the profile (the first entry
+  * of the reverse sorted captions). If the last key is
+  * {@code SignatureTypes.SIG_ID}, which may be absent in a block, the
+  * occurrences of the previous to last caption are used as additional
+  * anchors. For every anchor the end of the block is determined, the block's
+  * keys are resolved via {@code PdfAS.findBlockInText} on the text up to that
+  * end, and the result is kept only if the reverse check confirms the key
+  * positions.
+  * </p>
+  * <p>
+  * The end of the block is the end of the anchor's line. If the anchor caption
+  * is directly followed by the line break, the value is taken to be on the
+  * following line and the block is extended by that line. This test uses the
+  * length of the caption that actually produced the anchor.
+  * </p>
+  *
+  * @param text
+  *          The text, in which potential block are to be sought.
+  * @param block_type
+  *          The profile for which the text is to be sought.
+  * @return Returns the List of potential FoundBlocks or an empty List if none
+  *         could be found.
+  */
+ public static List findPotentialSignaturesForProfile(String text,
+     SignatureTypeDefinition block_type)
+ {
+   logger.debug("find potential signatures for " + block_type.getType());
+   List found_blocks = new ArrayList();
+
+   final boolean old_style = false;
+
+   Vector keys = block_type.getRevertSortedKeys();
+   Vector captions = block_type.getRevertSortedCaptions();
+
+   String last_key = (String) keys.get(0);
+   logger.debug("last_key = " + last_key);
+   String last_caption = (String) captions.get(0);
+   logger.debug("last_caption = " + last_caption);
+
+   List found_last_captions = findIndicesWithStartingNL(text, last_caption);
+   // The first last_caption_hits entries were found via last_caption, all
+   // entries appended afterwards via the previous to last caption.
+   final int last_caption_hits = found_last_captions.size();
+   String prevlast_caption = null;
+   if (last_key.equals(SignatureTypes.SIG_ID))
+   {
+     logger.debug("Last key is SIG_ID, so it may not be present. Searching for the previous to last key.");
+     prevlast_caption = (String) captions.get(1);
+     found_last_captions.addAll(findIndicesWithStartingNL(text, prevlast_caption));
+   }
+   if (logger.isDebugEnabled())
+   {
+     logger.debug("found " + found_last_captions.size() + " last captions.");
+     for (int i = 0; i < found_last_captions.size(); i++)
+     {
+       logger.debug(" found last caption at index " + found_last_captions.get(i));
+     }
+   }
+
+   for (int lci = 0; lci < found_last_captions.size(); lci++)
+   {
+     int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue();
+     // Use the caption that really stands at this index. Using the previous
+     // to last caption for every anchor yields a block end that disagrees
+     // with checkBlockIntegrity, which measures with the real last caption.
+     String anchor_caption = (lci < last_caption_hits) ? last_caption : prevlast_caption;
+
+     logger.debug("resolving signature block from last caption index " + last_caption_index);
+     int potential_block_end = findEndOfValue(text, last_caption_index);
+     if (potential_block_end == (last_caption_index + anchor_caption.length() + 1))
+     {
+       // The caption is immediately followed by '\n': the value is on the
+       // next line, so that line belongs to the block as well.
+       potential_block_end = findEndOfValue(text, potential_block_end);
+     }
+
+     // FIXME: a last value spanning more than these lines is not captured
+     // (the original source carried an unfinished hotfix for this here).
+
+     logger.debug("potential_block_end = " + potential_block_end);
+     List found_keys = PdfAS.findBlockInText(text.substring(0, potential_block_end), block_type, old_style);
+
+     if (found_keys == null)
+     {
+       logger.debug("Not all other captions could be found for the last_caption_index " + last_caption_index + " ==> discarding this index.");
+
+       continue;
+     }
+
+     // sort found keys ascendingly
+     PdfAS.sortFoundKeysAscendingly(found_keys);
+
+     boolean reverse_check_ok = reverseCheckFoundKeys(text, found_keys);
+     if (!reverse_check_ok)
+     {
+       logger.debug("The reverse check ruled this list of found keys out ==> they are discarded.");
+
+       continue;
+     }
+
+     logger.debug("The reverse check proved this list of found keys out ==> adding them as potential candidates.");
+
+     FoundBlock found_block = new FoundBlock();
+     found_block.std = block_type;
+     found_block.found_keys = found_keys;
+     found_block.end_index = potential_block_end;
+     found_blocks.add(found_block);
+   }
+
+   logger.debug("found " + found_blocks.size() + " potential signatures for " + block_type.getType());
+   return found_blocks;
+ }
+
+ /**
+  * Finds all occurrences of the given subtext that directly follow a line
+  * break within the given text.
+  *
+  * <p>
+  * This is usually used to find the indices of the last captions. The subtext
+  * is matched literally via {@code String.indexOf}, not as a pattern. An
+  * occurrence at the very beginning of the text is not reported because it is
+  * not preceded by a '\n'. The search continues behind each match, so matches
+  * do not overlap.
+  * </p>
+  *
+  * @param text
+  *          The text to be searched.
+  * @param subtext
+  *          The subtext to be sought.
+  * @return Returns the List of found indices as Integer objects in ascending
+  *         order. Each index points at the first character of the subtext
+  *         (the character after the '\n').
+  */
+ public static List findIndicesWithStartingNL(String text, String subtext)
+ {
+   final String needle = "\n" + subtext;
+   final List hits = new ArrayList();
+
+   int newline_position = text.indexOf(needle);
+   while (newline_position >= 0)
+   {
+     // skip the leading "\n" so that the index addresses the subtext itself
+     int subtext_position = newline_position + 1;
+     hits.add(new Integer(subtext_position));
+
+     newline_position = text.indexOf(needle, subtext_position + subtext.length());
+   }
+
+   return hits;
+ }
+
+ /**
+  * Resolves the remaining keys/captions of a block by searching upwards from
+  * the position of the last caption.
+  *
+  * <p>
+  * The first entries of keys and captions describe the last caption, whose
+  * index is given. Every further caption is sought with
+  * {@code lastIndexOf} in the text before the previously found caption, so
+  * the captions must appear in the text in the reverse order of the given
+  * lists.
+  * </p>
+  *
+  * @param text
+  *          The text.
+  * @param keys
+  *          The list of keys (Strings), the last key first.
+  * @param captions
+  *          The list of captions (Strings), parallel to the keys.
+  * @param last_caption_index
+  *          The index of the last caption within the text.
+  * @return Returns the List of FoundKey objects ordered by their position in
+  *         the text (the last caption last), or null if one of the captions
+  *         could not be found.
+  */
+ public static List findRestKeys(String text, List keys, List captions,
+     int last_caption_index)
+ {
+   // Collected bottom up (last caption first) and reversed before returning.
+   List bottom_up = new ArrayList();
+   bottom_up.add(new FoundKey((String) keys.get(0), (String) captions.get(0), last_caption_index));
+
+   String unsearched = text.substring(0, last_caption_index);
+   for (int i = 1; i < captions.size(); i++)
+   {
+     String caption = (String) captions.get(i);
+     int caption_index = unsearched.lastIndexOf(caption);
+     if (caption_index < 0)
+     {
+       return null;
+     }
+
+     bottom_up.add(new FoundKey((String) keys.get(i), (String) captions.get(i), caption_index));
+
+     // The next caption has to stand before the one just found.
+     unsearched = unsearched.substring(0, caption_index);
+   }
+
+   Collections.reverse(bottom_up);
+   return bottom_up;
+ }
+
+ /**
+  * Verifies the found keys by searching their captions again from top to
+  * bottom.
+  *
+  * <p>
+  * Starting at the start index of the first found key, every caption is sought
+  * forward from the end of the previous caption. The check succeeds only if
+  * each caption is found exactly at the start index recorded in its FoundKey.
+  * A list of found keys that fails this check should be discarded.
+  * </p>
+  *
+  * @param text
+  *          The text.
+  * @param found_keys
+  *          The found keys to be reversely checked, ordered by their position
+  *          in the text. The List must not be empty.
+  * @return Returns true, if all captions were found at their recorded
+  *         indices, false as soon as one caption is found elsewhere.
+  * @throws RuntimeException
+  *           Thrown if a caption cannot be found at all from the current
+  *           search position.
+  */
+ public static boolean reverseCheckFoundKeys(String text, List found_keys)
+ {
+   int search_position = ((FoundKey) found_keys.get(0)).start_index;
+
+   for (Iterator iter = found_keys.iterator(); iter.hasNext();)
+   {
+     FoundKey key = (FoundKey) iter.next();
+     int forward_index = text.indexOf(key.caption, search_position);
+
+     // NOTE: the original source carried the unresolved marker
+     // "dferbas fix #331 ??" at this point.
+
+     if (forward_index < 0)
+     {
+       throw new RuntimeException("The caption " + key.caption + " wasn't found in the text during reverse checking - there is something wrong.");
+     }
+
+     if (forward_index != key.start_index)
+     {
+       logger.debug("The index for caption " + key.caption + " wasn't proved during reverse checking.");
+       return false;
+     }
+
+     search_position = key.start_index + key.caption.length();
+   }
+
+   return true;
+ }
+
+ /**
+  * Finds the end of the line that contains (or follows) the given start
+  * index.
+  *
+  * <p>
+  * The text is scanned for the first '\n' at or after the start index. The
+  * line up to and including that '\n' is considered to be the value.
+  * </p>
+  * <p>
+  * Note that a value spanning multiple lines is NOT found completely by this
+  * method. It is therefore mainly suited for the last value of a block; the
+  * values in between are delimited by the start of the next caption.
+  * </p>
+  *
+  * @param text
+  *          The text.
+  * @param start_index
+  *          The start index from where the end of the value is sought.
+  * @return Returns the index of the character after the '\n', or the length
+  *         of the text if there is no '\n' at or after the start index.
+  */
+ public static int findEndOfValue(String text, int start_index)
+ {
+   final int newline_index = text.indexOf('\n', start_index);
+   return (newline_index >= 0) ? newline_index + 1 : text.length();
+ }
+
+ /**
+  * Asserts that the end index stored in a found block agrees with the end of
+  * its last value.
+  *
+  * <p>
+  * For all keys but the last one it is only logged when the value's line does
+  * not end where the next key starts (a multi line value). For the last key
+  * the end of the block is recomputed: the end of the caption's line, extended
+  * by one more line if the caption is directly followed by the line break.
+  * This recomputed end must equal {@code found_block.end_index}.
+  * </p>
+  *
+  * @param text
+  *          The text.
+  * @param found_block
+  *          The found block. Its list of found keys must not be empty.
+  * @throws RuntimeException
+  *           Thrown if the recomputed end differs from the block's end index.
+  */
+ public static void checkBlockIntegrity(String text, FoundBlock found_block)
+ {
+   final List keys = found_block.found_keys;
+   final int last_position = keys.size() - 1;
+
+   for (int i = 0; i < last_position; i++)
+   {
+     FoundKey this_key = (FoundKey) keys.get(i);
+     FoundKey next_key = (FoundKey) keys.get(i + 1);
+
+     if (findEndOfValue(text, this_key.start_index) != next_key.start_index)
+     {
+       // Tolerated: the value simply occupies more than one line.
+       logger.debug("multi line value: " + this_key);
+     }
+   }
+
+   FoundKey last_key = (FoundKey) keys.get(last_position);
+   int computed_end = findEndOfValue(text, last_key.start_index);
+   boolean value_on_next_line = computed_end == (last_key.start_index + last_key.caption.length() + 1);
+   if (value_on_next_line)
+   {
+     computed_end = findEndOfValue(text, computed_end);
+   }
+
+   if (computed_end != found_block.end_index)
+   {
+     throw new RuntimeException("The end index of last key " + last_key + " doesn't match the end index of the block " + found_block);
+   }
+ }
+
+ /**
+  * Removes the given found block from the text.
+  *
+  * <p>
+  * The block reaches from the start index of its first found key to its end
+  * index. Three cases are distinguished:
+  * </p>
+  * <ul>
+  * <li>The block is the whole text: the rest text is empty. This may happen
+  * if a (no-text) empty document contains a binary signature.</li>
+  * <li>The block ends the text: the character before the block (the "\n"
+  * separating it from the preceding text) is removed as well.</li>
+  * <li>Otherwise the text before and the text after the block are joined.</li>
+  * </ul>
+  *
+  * @param text
+  *          The text.
+  * @param block
+  *          The found block.
+  * @return Returns the rest text without the block.
+  */
+ public static String cutOutBlock(String text, FoundBlock block)
+ {
+   final int start = ((FoundKey) block.found_keys.get(0)).getStartIndex();
+   final int end = block.end_index;
+
+   if (end == text.length())
+   {
+     if (start == 0)
+     {
+       return "";
+     }
+     // also drop the line break in front of the trailing block
+     return text.substring(0, start - 1);
+   }
+
+   return text.substring(0, start) + text.substring(end);
+ }
+
+ /**
+  * Reads the value of the block's date field from the text.
+  *
+  * <p>
+  * The value starts right behind the date caption and ends with the line. If
+  * that yields a single character only (the caption is directly followed by
+  * the line break), the following line is included as well. The result is
+  * trimmed and written to the debug log.
+  * </p>
+  *
+  * @param text
+  *          The text.
+  * @param block
+  *          The found block providing the date key.
+  * @return Returns the trimmed date value.
+  */
+ public static String getDateValue(String text, FoundBlock block)
+ {
+   FoundKey date_key = block.getDateFoundKey();
+
+   final int value_start = date_key.start_index + date_key.caption.length();
+   int value_end = findEndOfValue(text, value_start);
+   if (value_start + 1 == value_end)
+   {
+     // nothing but the line break behind the caption: take the next line
+     value_end = findEndOfValue(text, value_end);
+   }
+
+   String date_value = text.substring(value_start, value_end).trim();
+   logger.debug("DateString="+date_value);
+   return date_value;
+ }
+
+ /**
+  * Builds a SignatureObject of the block's profile and fills it with the
+  * values found in the text.
+  *
+  * <p>
+  * The found keys are processed from the last to the first. The value of a
+  * key reaches from the end of its caption to the start of the key processed
+  * before it (for the last key: to the end index of the block). The values
+  * are handed over untrimmed.
+  * </p>
+  *
+  * @param text
+  *          The text.
+  * @param found_block
+  *          The found block.
+  * @return Returns the created SignatureObject.
+  * @throws SignatureTypesException
+  *           Forwarded from the SignatureObject.
+  * @throws SignatureException
+  *           Forwarded from the SignatureObject.
+  */
+ public static SignatureObject createSignatureObjectFromFoundBlock(
+     String text, FoundBlock found_block) throws SignatureTypesException, SignatureException
+ {
+   SignatureObject signature = new SignatureObject();
+   signature.setSigType(found_block.std.getType());
+   signature.initByType();
+
+   int value_end = found_block.end_index;
+   int position = found_block.found_keys.size();
+   while (--position >= 0)
+   {
+     FoundKey key = (FoundKey) found_block.found_keys.get(position);
+     int value_start = key.getStartIndex() + key.caption.length();
+
+     signature.setSigValueCaption(key.getKey(), text.substring(value_start, value_end), key.caption);
+
+     // the preceding value ends where this key's caption begins
+     value_end = key.getStartIndex();
+   }
+
+   return signature;
+ }
+
+ /**
+  * Reads the date value of a found block from the text and parses it.
+  *
+  * @param text
+  *          The text.
+  * @param found_block
+  *          The found block.
+  * @return Returns the EGIZDate parsed from the block's date value (see
+  *         {@link #getDateValue(String, FoundBlock)}).
+  */
+ public static EGIZDate getDateFromFoundBlock(String text,
+     FoundBlock found_block)
+ {
+   return EGIZDate.parseFromString(getDateValue(text, found_block));
+ }
+
+ /**
+  * Sorts the List of found blocks in place by the natural order of their
+  * dates.
+  *
+  * <p>
+  * The dates are read from the text anew for every comparison via
+  * {@link #getDateFromFoundBlock(String, FoundBlock)}.
+  * </p>
+  *
+  * @param text
+  *          The text the dates are read from.
+  * @param found_blocks
+  *          The List of FoundBlock objects to be sorted.
+  */
+ public static void sortFoundBlocksByDate(final String text, List found_blocks)
+ {
+   Comparator by_date = new Comparator()
+   {
+     public int compare(Object left, Object right)
+     {
+       FoundBlock left_block = (FoundBlock) left;
+       FoundBlock right_block = (FoundBlock) right;
+
+       EGIZDate left_date = getDateFromFoundBlock(text, left_block);
+       EGIZDate right_date = getDateFromFoundBlock(text, right_block);
+       return left_date.compareTo(right_date);
+     }
+   };
+
+   Collections.sort(found_blocks, by_date);
+ }
+
+ /**
+  * Given a List of FoundBlock objects, this method returns the last blocks of
+  * this list that have the same date.
+  *
+  * <p>
+  * Usually a date sorted list (earliest first, latest last) will be provided
+  * to this method. Then the last date equal blocks are returned, which are the
+  * latest blocks. The list is walked from its end towards its beginning and
+  * the walk stops at the first block whose date differs from the date of its
+  * successor.
+  * </p>
+  *
+  * @param text
+  *          The text to retrieve the values of the fields from.
+  * @param found_blocks
+  *          The List of FoundBlock objects.
+  * @return Returns the List of the last date equal blocks in their original
+  *         order. An empty List is returned for an empty input List.
+  */
+ public static List filterLastDateEqualBlocks(String text, List found_blocks)
+ {
+   List latest_blocks = new ArrayList();
+   if (found_blocks.isEmpty())
+   {
+     // nothing to filter; avoids accessing index -1 below
+     return latest_blocks;
+   }
+
+   latest_blocks.add(found_blocks.get(found_blocks.size() - 1));
+
+   // Date of the successor block. It is parsed lazily so that a single block
+   // is returned without its date being parsed at all, and afterwards reused
+   // from the previous iteration instead of being parsed again.
+   EGIZDate succ_date = null;
+   for (int i = found_blocks.size() - 2; i >= 0; i--)
+   {
+     FoundBlock this_block = (FoundBlock) found_blocks.get(i);
+     EGIZDate this_date = getDateFromFoundBlock(text, this_block);
+     if (succ_date == null)
+     {
+       succ_date = getDateFromFoundBlock(text, (FoundBlock) found_blocks.get(i + 1));
+     }
+
+     if (!this_date.equals(succ_date))
+     {
+       break;
+     }
+     latest_blocks.add(0, this_block);
+     succ_date = this_date;
+   }
+
+   return latest_blocks;
+ }
+ /**
+ * Chooses the most possible (best choice) block of the list of blocks.
+ *
+ * <p>
+ * The strategy to find the most possible block is to choose the very one
+ * block with the maximum number of captions. This block has extracted most
+ * information from the text.
+ * </p>
+ * <p>
+ * If there are still multiple blocks with the same number of cations, the
+ * blocks are compared caption-wise. The block with all captions being longer
+ * or equal to all other blocks' captions wins.
+ * </p>
+ *
+ * @param found_blocks
+ * The List of semantically equal blocks.
+ * @return Returns the best choice FoundBlock.
+ * @throws SignatureException
+ */
+ public static FoundBlock chooseMostPossibleBlock(List found_blocks) throws SignatureException
+ {
+ // int largest_block_index = 0;
+ // FoundBlock largest_block = (FoundBlock) found_blocks.get(0);
+ //
+ // for (int i = 1; i < found_blocks.size(); i++)
+ // {
+ // FoundBlock current_block = (FoundBlock) found_blocks.get(i);
+ //
+ // if (current_block.found_keys.size() > largest_block.found_keys.size())
+ // {
+ // largest_block = current_block;
+ // largest_block_index = i;
+ // }
+ // }
+
+ List vertically_largest = filterVerticallyLargestBlocks(found_blocks);
+ if (logger.isDebugEnabled())
+ {
+ logger.debug("vertically largest blocks:");
+ for (int i = 0; i < vertically_largest.size(); i++)
+ {
+ FoundBlock found_block = (FoundBlock) vertically_largest.get(i);
+ logger.debug(" #" + i + ": " + found_block);
+ }
+ }
+
+ List horizontally_largest = filterHorizontallyLargestBlocks(vertically_largest);
+ if (logger.isDebugEnabled())
+ {
+ logger.debug("horizontally largest blocks:");
+ for (int i = 0; i < horizontally_largest.size(); i++)
+ {
+ FoundBlock found_block = (FoundBlock) horizontally_largest.get(i);
+ logger.debug(" #" + i + ": " + found_block);
+ }
+ }
+ FoundBlock largest_block = (FoundBlock) horizontally_largest.get(0);
+ logger.debug("Chose largest block: " + largest_block);
+ return largest_block;
+ }
+
+ /**
+  * Keeps only the vertically largest blocks of the given List.
+  *
+  * <p>
+  * A vertically largest block is one with the maximum number of found keys
+  * among all given blocks.
+  * </p>
+  *
+  * @param found_blocks
+  *          The List of FoundBlock objects to be filtered.
+  * @return Returns a new List of the vertically largest FoundBlock objects in
+  *         their original order. It is empty if the given List is empty.
+  */
+ public static List filterVerticallyLargestBlocks(List found_blocks)
+ {
+   // first pass: the maximum number of found keys
+   int max_keys = Integer.MIN_VALUE;
+   for (Iterator iter = found_blocks.iterator(); iter.hasNext();)
+   {
+     FoundBlock block = (FoundBlock) iter.next();
+     max_keys = Math.max(max_keys, block.found_keys.size());
+   }
+
+   // second pass: every block reaching that maximum
+   List tallest_blocks = new ArrayList();
+   for (Iterator iter = found_blocks.iterator(); iter.hasNext();)
+   {
+     FoundBlock block = (FoundBlock) iter.next();
+     if (block.found_keys.size() >= max_keys)
+     {
+       tallest_blocks.add(block);
+     }
+   }
+   return tallest_blocks;
+ }
+
+ /**
+  * Keeps only the horizontally largest blocks of the given List.
+  *
+  * <p>
+  * Blocks are compared caption-wise by the lengths of their captions. A block
+  * is horizontally larger than another one if none of its captions is shorter
+  * and at least one is longer. The first block is the initial favourite; a
+  * block equal to the favourite joins the result, a larger block replaces the
+  * result, a smaller block is dropped.
+  * </p>
+  *
+  * @param found_blocks
+  *          The List of FoundBlock objects to be filtered. All of these
+  *          FoundBlock objects must have the same number of found keys. The
+  *          List must not be empty.
+  * @return Returns the List of the horizontally largest FoundBlock objects.
+  * @throws SignatureException
+  *           Thrown (code 315) if a block is neither equal to, larger than,
+  *           nor smaller than the current favourite.
+  */
+ public static List filterHorizontallyLargestBlocks(List found_blocks) throws SignatureException
+ {
+   FoundBlock largest_block = (FoundBlock) found_blocks.get(0);
+   List widest_blocks = new ArrayList();
+   widest_blocks.add(largest_block);
+
+   for (int i = 1; i < found_blocks.size(); i++)
+   {
+     FoundBlock fb = (FoundBlock) found_blocks.get(i);
+
+     if (isHorizontallyEqual(fb, largest_block))
+     {
+       widest_blocks.add(fb);
+     }
+     else if (isHorizontallyLarger(fb, largest_block))
+     {
+       // a new favourite supersedes everything collected so far
+       largest_block = fb;
+       widest_blocks.clear();
+       widest_blocks.add(largest_block);
+     }
+     else if (!isHorizontallyLarger(largest_block, fb))
+     {
+       // The block is neither equal nor larger nor lower.
+       // We cannot exactly determine which one to use.
+       throw new SignatureException(315, "The blocks are neither larger nor lower nor equal. Cannot decide which one to pick. fb = " + fb + ", largest_block = " + largest_block);
+     }
+   }
+
+   return widest_blocks;
+ }
+
+ /**
+  * Tells whether two blocks are horizontally equal, i.e. whether their
+  * captions have pairwise the same length.
+  *
+  * @param fb0
+  *          The first block.
+  * @param fb1
+  *          The second block.
+  * @return Returns true if every caption of fb0 is as long as the caption of
+  *         fb1 at the same position, false otherwise.
+  * @throws IllegalArgumentException
+  *           Thrown if the blocks differ in their number of found keys.
+  */
+ protected static boolean isHorizontallyEqual(FoundBlock fb0, FoundBlock fb1)
+ {
+   final int num_keys = fb0.found_keys.size();
+   if (num_keys != fb1.found_keys.size())
+   {
+     throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size());
+   }
+
+   int i = 0;
+   while (i < num_keys)
+   {
+     int length0 = ((FoundKey) fb0.found_keys.get(i)).caption.length();
+     int length1 = ((FoundKey) fb1.found_keys.get(i)).caption.length();
+     if (length0 != length1)
+     {
+       return false;
+     }
+     i++;
+   }
+
+   return true;
+ }
+
+ /**
+  * Tells whether the first block is horizontally larger than the second one.
+  *
+  * @param fb0
+  *          The block that is tested for being larger.
+  * @param fb1
+  *          The block to compare with.
+  * @return Returns true if no caption of fb0 is shorter than the caption of
+  *         fb1 at the same position and at least one is longer. Returns false
+  *         if any caption of fb0 is shorter or if all lengths are equal.
+  * @throws IllegalArgumentException
+  *           Thrown if the blocks differ in their number of found keys.
+  */
+ protected static boolean isHorizontallyLarger(FoundBlock fb0, FoundBlock fb1)
+ {
+   final int num_keys = fb0.found_keys.size();
+   if (num_keys != fb1.found_keys.size())
+   {
+     throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size());
+   }
+
+   boolean found_longer_caption = false;
+   for (int i = 0; i < num_keys; i++)
+   {
+     int length0 = ((FoundKey) fb0.found_keys.get(i)).caption.length();
+     int length1 = ((FoundKey) fb1.found_keys.get(i)).caption.length();
+
+     if (length0 < length1)
+     {
+       // a single shorter caption rules fb0 out
+       return false;
+     }
+     if (length0 > length1)
+     {
+       found_longer_caption = true;
+     }
+   }
+
+   return found_longer_caption;
+ }
+
+}