From 548c8770e5ec6cb9bf73b7c341673d4077099a75 Mon Sep 17 00:00:00 2001 From: tknall Date: Thu, 7 Dec 2006 20:13:50 +0000 Subject: knowcenter adjustments from 2006-12-01 merged git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@13 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/egov/egiz/pdf/AbsoluteTextSignature.java | 102 +++++++++++++++++---- 1 file changed, 86 insertions(+), 16 deletions(-) (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java') diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java index 5523041..658f7dd 100644 --- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java +++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java @@ -14,7 +14,8 @@ * DERIVATIVES. * * $Id: AbsoluteTextSignature.java,v 1.1 2006/10/31 08:08:33 wprinz Exp $ - */package at.knowcenter.wag.egov.egiz.pdf; + */ +package at.knowcenter.wag.egov.egiz.pdf; import java.util.ArrayList; import java.util.Collections; @@ -36,6 +37,7 @@ import at.knowcenter.wag.egov.egiz.sig.SignatureTypes; /** * Contains methods and helpers that implement the absolute text signature. + * * @author wprinz */ public class AbsoluteTextSignature @@ -46,7 +48,6 @@ public class AbsoluteTextSignature */ private static final Logger logger = ConfigLogger.getLogger(AbsoluteTextSignature.class); - /** * Extracts all signature holders from a given text. 
* @@ -138,14 +139,14 @@ public class AbsoluteTextSignature */ public static FoundBlock findLatestBlock(String text) throws SignatureException, SignatureTypesException { -// try -// { -// writeTextToFile(text, new File("C:\\wprinz\\text.utf8.txt")); -// } -// catch (IOException e) -// { -// e.printStackTrace(); -// } + // try + // { + // writeTextToFile(text, new File("C:\\wprinz\\text.utf8.txt")); + // } + // catch (IOException e) + // { + // e.printStackTrace(); + // } SignatureTypes sig_types = SignatureTypes.getInstance(); List signatureTypes_ = sig_types.getSignatureTypeDefinitions(); @@ -212,7 +213,7 @@ public class AbsoluteTextSignature throw new SignatureException(314, "The latest blocks weren't semantically equal."); } - FoundBlock latest_block = (FoundBlock) latest_blocks.get(0); + FoundBlock latest_block = chooseMostPossibleSemanticallyEqualBlock(latest_blocks); logger.debug("latest block = " + latest_block); return latest_block; } @@ -245,7 +246,20 @@ public class AbsoluteTextSignature String last_caption = (String) captions.get(0); logger.debug("last_caption = " + last_caption); - List found_last_captions = findIndices(text, last_caption); + List found_last_captions = findIndicesWithStartingNL(text, last_caption); + if (last_key.equals(SignatureTypes.SIG_ID)) + { + logger.debug("Last key is SIG_ID, so it may not be present. 
Searching for the previous to last key."); + String prevlast_key = (String) keys.get(1); + logger.debug("last_key = " + prevlast_key); + String prevlast_caption = (String) captions.get(1); + logger.debug("prevlast_caption = " + last_caption); + List found_prevlast_captions = findIndicesWithStartingNL(text, prevlast_caption); + if (!found_prevlast_captions.isEmpty()) + { + found_last_captions.addAll(found_prevlast_captions); + } + } if (logger.isDebugEnabled()) { logger.debug("found " + found_last_captions.size() + " last captions."); @@ -255,6 +269,7 @@ public class AbsoluteTextSignature } } + for (int lci = 0; lci < found_last_captions.size(); lci++) { int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue(); @@ -300,7 +315,8 @@ public class AbsoluteTextSignature } /** - * Finds all indices of the given subtext within a given text. + * Finds all indices of the given subtext (starting at a new line) within a + * given text. * *
<p>
* This is usually used to find the indices of the last captions. @@ -312,17 +328,30 @@ public class AbsoluteTextSignature * The subtext to be sought. * @return Returns the List of found indices. */ - public static List findIndices(String text, String subtext) + public static List findIndicesWithStartingNL(String text, String subtext) { List found_indices = new ArrayList(); + + // // for some reason "^" + subtext doesn't work as a pattern + // String pattern = "\n" + subtext; + // Pattern p = Pattern.compile(pattern); + // Matcher m = p.matcher(text); + // + // while (m.find()) + // { + // int found_index = m.start() + 1; // +1 removes the newline + // found_indices.add(new Integer(found_index)); + // } + int search_from_index = 0; for (;;) { - int found_index = text.indexOf(subtext, search_from_index); + int found_index = text.indexOf("\n" + subtext, search_from_index); if (found_index < 0) { break; } + found_index += 1; // The +1 compensates the "\n" found_indices.add(new Integer(found_index)); search_from_index = found_index + subtext.length(); } @@ -471,7 +500,7 @@ public class AbsoluteTextSignature int this_end_index = findEndOfValue(text, this_key.start_index); if (this_end_index != next_key.start_index) { - logger.warn("multi line value: " + this_key); + logger.debug("multi line value: " + this_key); // throw new RuntimeException("The end index of found key " + this_key + // " doesn't match the start index of found key " + next_key); } @@ -653,4 +682,45 @@ public class AbsoluteTextSignature return latest_blocks; } + /** + * Chooses the most possible (best choice) block of the list of semantically + * equal blocks. + * + *
<p>
+ * Thus blocks are considered semantically equal if their required keys are + * semantically equal, semantically equal blocks may still differ in the + * number of their non required fields. This may lead to multiple found blocks + * of the same size in characters, but where some blocks' elements swallow + * elements found by other blocks. + *
</p>
+ *
<p>
+ * The strategy to avoid this is to choose the very one block with the maximum + * number of captions. This block has extracted most information from the + * text. + *
</p>
+ * + * @param found_blocks + * The List of semantically equal blocks. + * @return Returns the best choice FoundBlock. + */ + public static FoundBlock chooseMostPossibleSemanticallyEqualBlock( + List found_blocks) + { + int largest_block_index = 0; + FoundBlock largest_block = (FoundBlock) found_blocks.get(0); + + for (int i = 1; i < found_blocks.size(); i++) + { + FoundBlock current_block = (FoundBlock) found_blocks.get(i); + + if (current_block.found_keys.size() > largest_block.found_keys.size()) + { + largest_block = current_block; + largest_block_index = i; + } + } + + logger.debug("Chose largest block with index #" + largest_block_index + ": " + largest_block); + return largest_block; + } } -- cgit v1.2.3