From a0de2a3b2a5f4a99f280f5caebbca0d183ae109a Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 7 Feb 2007 10:08:21 +0000 Subject: Bugfix: Querformat, BKU 2.7.x, ... git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@35 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/AbsoluteTextSignature.java | 286 +++++++++++++++++---- 1 file changed, 236 insertions(+), 50 deletions(-) (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java') diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java index b3c2e24..85673b5 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java @@ -151,69 +151,89 @@ public class AbsoluteTextSignature SignatureTypes sig_types = SignatureTypes.getInstance(); List signatureTypes_ = sig_types.getSignatureTypeDefinitions(); - List found_candidates = new ArrayList(); + List found_potential_candidates = new ArrayList(); for (int i = 0; i < signatureTypes_.size(); i++) { SignatureTypeDefinition block_type = (SignatureTypeDefinition) signatureTypes_.get(i); List found_candidates_for_type = findPotentialSignaturesForProfile(text, block_type); - found_candidates.addAll(found_candidates_for_type); + found_potential_candidates.addAll(found_candidates_for_type); } - if (found_candidates.isEmpty()) + if (found_potential_candidates.isEmpty()) { logger.debug("no candidates found at all"); return null; } + List found_candidates = new ArrayList(); logger.debug("checking block integrity"); - for (int i = 0; i < found_candidates.size(); i++) + for (int i = 0; i < found_potential_candidates.size(); i++) { - FoundBlock found_block = (FoundBlock) found_candidates.get(i); + FoundBlock found_block = (FoundBlock) found_potential_candidates.get(i); String date_value = getDateValue(text, found_block); 
logger.debug("date_value = " + date_value); - EGIZDate date = EGIZDate.parseFromString(date_value); + try + { + EGIZDate date = EGIZDate.parseFromString(date_value); + + logger.debug("found_block = " + date + " - " + found_block); - logger.debug("found_block = " + date + " - " + found_block); + checkBlockIntegrity(text, found_block); - checkBlockIntegrity(text, found_block); + found_candidates.add(found_block); + } + catch (Exception e) + { + logger.debug("Exception while checking the integrity of the found block " + found_block + ". Ignoring this block.", e); + } } sortFoundBlocksByDate(text, found_candidates); - - logger.debug("sorted blocks:"); - for (int i = 0; i < found_candidates.size(); i++) + if (logger.isDebugEnabled()) { - FoundBlock found_block = (FoundBlock) found_candidates.get(i); + logger.debug("sorted blocks:"); + for (int i = 0; i < found_candidates.size(); i++) + { + FoundBlock found_block = (FoundBlock) found_candidates.get(i); - String date_value = getDateValue(text, found_block); - EGIZDate date = EGIZDate.parseFromString(date_value); + String date_value = getDateValue(text, found_block); + EGIZDate date = EGIZDate.parseFromString(date_value); - logger.debug(" #" + i + ": " + date + " - " + found_block); + logger.debug(" #" + i + ": " + date + " - " + found_block); + } } List latest_blocks = filterLastDateEqualBlocks(text, found_candidates); - logger.debug("latest blocks:"); - for (int i = 0; i < latest_blocks.size(); i++) + if (logger.isDebugEnabled()) { - FoundBlock found_block = (FoundBlock) latest_blocks.get(i); + logger.debug("latest blocks:"); + for (int i = 0; i < latest_blocks.size(); i++) + { + FoundBlock found_block = (FoundBlock) latest_blocks.get(i); - String date_value = getDateValue(text, found_block); - EGIZDate date = EGIZDate.parseFromString(date_value); + String date_value = getDateValue(text, found_block); + EGIZDate date = EGIZDate.parseFromString(date_value); - logger.debug(" #" + i + ": " + date + " - " + found_block); + 
logger.debug(" #" + i + ": " + date + " - " + found_block); + } } - boolean semantic_equality = PdfAS.checkForSemanticEquality(latest_blocks); - logger.debug("semantic_equality = " + semantic_equality); - if (!semantic_equality) - { - throw new SignatureException(314, "The latest blocks weren't semantically equal."); - } + // The semantic equality check has been outdated by the + // advanced choosing algorithm. + // boolean semantic_equality = + // PdfAS.checkForSemanticEquality(latest_blocks); + // logger.debug("semantic_equality = " + semantic_equality); + // if (!semantic_equality) + // { + // throw new SignatureException(314, "The latest blocks weren't semantically + // equal."); + // } + + FoundBlock latest_block = chooseMostPossibleBlock(latest_blocks); - FoundBlock latest_block = chooseMostPossibleSemanticallyEqualBlock(latest_blocks); logger.debug("latest block = " + latest_block); return latest_block; } @@ -269,7 +289,6 @@ public class AbsoluteTextSignature } } - for (int lci = 0; lci < found_last_captions.size(); lci++) { int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue(); @@ -470,8 +489,6 @@ public class AbsoluteTextSignature */ public static int findEndOfValue(String text, int start_index) { - // FIXME[tknall]: this method does not work properly for landscape documents because always starts with "\n". Look for errors in PdfAS.java, method findBlockInText(...) to set the start_index accordingly. - // Hint: Captions and values of landscape documents are separated with " \n" and not only with " ". int newline_index = text.indexOf('\n', start_index); if (newline_index < 0) { @@ -685,44 +702,213 @@ public class AbsoluteTextSignature } /** - * Chooses the most possible (best choice) block of the list of semantically - * equal blocks. + * Chooses the most possible (best choice) block of the list of blocks. * *

- * Thus blocks are considered semantically equal if their required keys are - * semantically equal, semantically equal blocks may still differ in the - * number of their non required fields. This may lead to multiple found blocks - * of the same size in characters, but where some blocks' elements swallow - * elements found by other blocks. + * The strategy to find the most possible block is to choose the very one + * block with the maximum number of captions. This block has extracted most + * information from the text. *

*

- The strategy to avoid this is to choose the very one block with the maximum - number of captions. This block has extracted most information from the - text. + If there are still multiple blocks with the same number of captions, the + blocks are compared caption-wise. The block with all captions being longer + or equal to all other blocks' captions wins. *

* * @param found_blocks * The List of FoundBlock objects to choose from. * @return Returns the best choice FoundBlock. + * @throws SignatureException forwarded from filterHorizontallyLargestBlocks(List) + */ + public static FoundBlock chooseMostPossibleBlock(List found_blocks) throws SignatureException + { + // int largest_block_index = 0; + // FoundBlock largest_block = (FoundBlock) found_blocks.get(0); + // + // for (int i = 1; i < found_blocks.size(); i++) + // { + // FoundBlock current_block = (FoundBlock) found_blocks.get(i); + // + // if (current_block.found_keys.size() > largest_block.found_keys.size()) + // { + // largest_block = current_block; + // largest_block_index = i; + // } + // } + + List vertically_largest = filterVerticallyLargestBlocks(found_blocks); + if (logger.isDebugEnabled()) + { + logger.debug("vertically largest blocks:"); + for (int i = 0; i < vertically_largest.size(); i++) + { + FoundBlock found_block = (FoundBlock) vertically_largest.get(i); + + logger.debug(" #" + i + ": " + found_block); + } + } + + List horizontally_largest = filterHorizontallyLargestBlocks(vertically_largest); + if (logger.isDebugEnabled()) + { + logger.debug("horizontally largest blocks:"); + for (int i = 0; i < horizontally_largest.size(); i++) + { + FoundBlock found_block = (FoundBlock) horizontally_largest.get(i); + + logger.debug(" #" + i + ": " + found_block); + } + } + + FoundBlock largest_block = (FoundBlock) horizontally_largest.get(0); // every remaining block is horizontally equal to this one, so the first is taken + + logger.debug("Chose largest block: " + largest_block); + return largest_block; + } + + /** + * Filters out all blocks but the vertically largest ones. + * + *

+ * A vertically largest block has the most found keys. + *

+ * + * @param found_blocks + * The List of FoundBlock objects to be filtered. + * @return Returns the List of the vertically largest FoundBlock objects. */ - public static FoundBlock chooseMostPossibleSemanticallyEqualBlock( - List found_blocks) + public static List filterVerticallyLargestBlocks(List found_blocks) { - int largest_block_index = 0; + // determine the size of the largest block(s); the scan starts at index 0 so that the first block counts, too + int largest_size = Integer.MIN_VALUE; + for (int i = 0; i < found_blocks.size(); i++) + { + FoundBlock fb = (FoundBlock) found_blocks.get(i); + final int current_size = fb.found_keys.size(); + if (current_size > largest_size) + { + largest_size = current_size; + } + } + + // keep all blocks that have the largest_size + List largest_blocks = new ArrayList(); + for (int i = 0; i < found_blocks.size(); i++) + { + FoundBlock fb = (FoundBlock) found_blocks.get(i); + if (fb.found_keys.size() < largest_size) + { + continue; + } + largest_blocks.add(fb); + } + + return largest_blocks; + } + + /** + * Filters out all blocks but the horizontally largest ones. + * + *

+ * A horizontally largest block is one whose captions are, key by key, at least as long as the corresponding captions of the other blocks. + *

+ * + * @param found_blocks + * The List of FoundBlock objects to be filtered. All of these + * FoundBlock objects must have the same number of found keys. + * @return Returns the List of the horizontally largest FoundBlock objects. + * @throws SignatureException (code 315) if two blocks are neither horizontally equal nor is one of them horizontally larger than the other + */ + public static List filterHorizontallyLargestBlocks(List found_blocks) throws SignatureException + { + List horizontally_largest = new ArrayList(); FoundBlock largest_block = (FoundBlock) found_blocks.get(0); + horizontally_largest.add(largest_block); for (int i = 1; i < found_blocks.size(); i++) { - FoundBlock current_block = (FoundBlock) found_blocks.get(i); + FoundBlock fb = (FoundBlock) found_blocks.get(i); + + if (isHorizontallyEqual(fb, largest_block)) + { + horizontally_largest.add(fb); + continue; + } - if (current_block.found_keys.size() > largest_block.found_keys.size()) + if (isHorizontallyLarger(fb, largest_block)) + { + horizontally_largest = new ArrayList(); /* a strictly larger block supersedes everything collected so far */ + largest_block = fb; + horizontally_largest.add(largest_block); + } + else { - largest_block = current_block; - largest_block_index = i; + if (!isHorizontallyLarger(largest_block, fb)) + { + // The block is neither equal nor larger nor lower. + // We cannot exactly determine which one to use. + throw new SignatureException(315, "The blocks are neither larger nor lower nor equal. Cannot decide which one to pick. fb = " + fb + ", largest_block = " + largest_block); + } } + } - logger.debug("Chose largest block with index #" + largest_block_index + ": " + largest_block); - return largest_block; + return horizontally_largest; } + + /** Tells whether the captions of both blocks have pairwise equal lengths; throws IllegalArgumentException if the blocks differ in their number of found keys. */ protected static boolean isHorizontallyEqual(FoundBlock fb0, FoundBlock fb1) + { + final int num_keys = fb0.found_keys.size(); + if (num_keys != fb1.found_keys.size()) + { + throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. 
" + fb1.found_keys.size()); + } + + for (int i = 0; i < num_keys; i++) + { + FoundKey fk0 = (FoundKey) fb0.found_keys.get(i); + FoundKey fk1 = (FoundKey) fb1.found_keys.get(i); + + if (fk0.caption.length() != fk1.caption.length()) + { + return false; + } + } + + return true; + } + + /** Tells whether every caption of fb0 is at least as long as the corresponding caption of fb1 and at least one is strictly longer; throws IllegalArgumentException if the blocks differ in their number of found keys. */ protected static boolean isHorizontallyLarger(FoundBlock fb0, FoundBlock fb1) + { + final int num_keys = fb0.found_keys.size(); + if (num_keys != fb1.found_keys.size()) + { + throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size()); + } + + boolean larger = false; + + for (int i = 0; i < num_keys; i++) + { + FoundKey fk0 = (FoundKey) fb0.found_keys.get(i); + FoundKey fk1 = (FoundKey) fb1.found_keys.get(i); + + if (fk0.caption.length() == fk1.caption.length()) + { + continue; + } + + if (fk0.caption.length() > fk1.caption.length()) + { + larger = true; + continue; + } + + // if (fk0.caption.length() < fk1.caption.length()) + return false; + } + + return larger; + } + } -- cgit v1.2.3