aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
diff options
context:
space:
mode:
authortknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2007-02-07 10:08:21 +0000
committertknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2007-02-07 10:08:21 +0000
commita0de2a3b2a5f4a99f280f5caebbca0d183ae109a (patch)
tree7479e30c10c3994cba18c6bf8784f61748bb6cd3 /src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
parent8f7cebd9c2c5c0f6e33863ba57ad1c215f35605c (diff)
downloadpdf-as-3-a0de2a3b2a5f4a99f280f5caebbca0d183ae109a.tar.gz
pdf-as-3-a0de2a3b2a5f4a99f280f5caebbca0d183ae109a.tar.bz2
pdf-as-3-a0de2a3b2a5f4a99f280f5caebbca0d183ae109a.zip
Bugfix: Querformat, BKU 2.7.x, ...
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@35 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java286
1 files changed, 236 insertions, 50 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
index b3c2e24..85673b5 100644
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
@@ -151,69 +151,89 @@ public class AbsoluteTextSignature
SignatureTypes sig_types = SignatureTypes.getInstance();
List signatureTypes_ = sig_types.getSignatureTypeDefinitions();
- List found_candidates = new ArrayList();
+ List found_potential_candidates = new ArrayList();
for (int i = 0; i < signatureTypes_.size(); i++)
{
SignatureTypeDefinition block_type = (SignatureTypeDefinition) signatureTypes_.get(i);
List found_candidates_for_type = findPotentialSignaturesForProfile(text, block_type);
- found_candidates.addAll(found_candidates_for_type);
+ found_potential_candidates.addAll(found_candidates_for_type);
}
- if (found_candidates.isEmpty())
+ if (found_potential_candidates.isEmpty())
{
logger.debug("no candidates found at all");
return null;
}
+ List found_candidates = new ArrayList();
logger.debug("checking block integrity");
- for (int i = 0; i < found_candidates.size(); i++)
+ for (int i = 0; i < found_potential_candidates.size(); i++)
{
- FoundBlock found_block = (FoundBlock) found_candidates.get(i);
+ FoundBlock found_block = (FoundBlock) found_potential_candidates.get(i);
String date_value = getDateValue(text, found_block);
logger.debug("date_value = " + date_value);
- EGIZDate date = EGIZDate.parseFromString(date_value);
+ try
+ {
+ EGIZDate date = EGIZDate.parseFromString(date_value);
+
+ logger.debug("found_block = " + date + " - " + found_block);
- logger.debug("found_block = " + date + " - " + found_block);
+ checkBlockIntegrity(text, found_block);
- checkBlockIntegrity(text, found_block);
+ found_candidates.add(found_block);
+ }
+ catch (Exception e)
+ {
+ logger.debug("Exception while checking the integrity of the found block " + found_block + ". Ignoring this block.", e);
+ }
}
sortFoundBlocksByDate(text, found_candidates);
-
- logger.debug("sorted blocks:");
- for (int i = 0; i < found_candidates.size(); i++)
+ if (logger.isDebugEnabled())
{
- FoundBlock found_block = (FoundBlock) found_candidates.get(i);
+ logger.debug("sorted blocks:");
+ for (int i = 0; i < found_candidates.size(); i++)
+ {
+ FoundBlock found_block = (FoundBlock) found_candidates.get(i);
- String date_value = getDateValue(text, found_block);
- EGIZDate date = EGIZDate.parseFromString(date_value);
+ String date_value = getDateValue(text, found_block);
+ EGIZDate date = EGIZDate.parseFromString(date_value);
- logger.debug(" #" + i + ": " + date + " - " + found_block);
+ logger.debug(" #" + i + ": " + date + " - " + found_block);
+ }
}
List latest_blocks = filterLastDateEqualBlocks(text, found_candidates);
- logger.debug("latest blocks:");
- for (int i = 0; i < latest_blocks.size(); i++)
+ if (logger.isDebugEnabled())
{
- FoundBlock found_block = (FoundBlock) latest_blocks.get(i);
+ logger.debug("latest blocks:");
+ for (int i = 0; i < latest_blocks.size(); i++)
+ {
+ FoundBlock found_block = (FoundBlock) latest_blocks.get(i);
- String date_value = getDateValue(text, found_block);
- EGIZDate date = EGIZDate.parseFromString(date_value);
+ String date_value = getDateValue(text, found_block);
+ EGIZDate date = EGIZDate.parseFromString(date_value);
- logger.debug(" #" + i + ": " + date + " - " + found_block);
+ logger.debug(" #" + i + ": " + date + " - " + found_block);
+ }
}
- boolean semantic_equality = PdfAS.checkForSemanticEquality(latest_blocks);
- logger.debug("semantic_equality = " + semantic_equality);
- if (!semantic_equality)
- {
- throw new SignatureException(314, "The latest blocks weren't semantically equal.");
- }
+ // The semantic equality check has been outdated by the
+ // advanced choosing algorithm.
+ // boolean semantic_equality =
+ // PdfAS.checkForSemanticEquality(latest_blocks);
+ // logger.debug("semantic_equality = " + semantic_equality);
+ // if (!semantic_equality)
+ // {
+ // throw new SignatureException(314, "The latest blocks weren't semantically
+ // equal.");
+ // }
+
+ FoundBlock latest_block = chooseMostPossibleBlock(latest_blocks);
- FoundBlock latest_block = chooseMostPossibleSemanticallyEqualBlock(latest_blocks);
logger.debug("latest block = " + latest_block);
return latest_block;
}
@@ -269,7 +289,6 @@ public class AbsoluteTextSignature
}
}
-
for (int lci = 0; lci < found_last_captions.size(); lci++)
{
int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue();
@@ -470,8 +489,6 @@ public class AbsoluteTextSignature
*/
public static int findEndOfValue(String text, int start_index)
{
- // FIXME[tknall]: this method does not work properly for landscape documents because <text> always starts with "\n". Look for errors in PdfAS.java, method findBlockInText(...) to set the start_index accordingly.
- // Hint: Captions and values of landscape documents are separated with " \n" and not only with " ".
int newline_index = text.indexOf('\n', start_index);
if (newline_index < 0)
{
@@ -685,44 +702,213 @@ public class AbsoluteTextSignature
}
/**
- * Chooses the most possible (best choice) block of the list of semantically
- * equal blocks.
+ * Chooses the most possible (best choice) block of the list of blocks.
*
* <p>
- * Thus blocks are considered semantically equal if their required keys are
- * semantically equal, semantically equal blocks may still differ in the
- * number of their non required fields. This may lead to multiple found blocks
- * of the same size in characters, but where some blocks' elements swallow
- * elements found by other blocks.
+ * The strategy to find the most possible block is to choose the very one
+ * block with the maximum number of captions. This block has extracted most
+ * information from the text.
* </p>
* <p>
- * The strategy to avoid this is to choose the very one block with the maximum
- * number of captions. This block has extracted most information from the
- * text.
+ * If there are still multiple blocks with the same number of captions, the
+ * blocks are compared caption-wise. The block with all captions being longer
+ * or equal to all other blocks' captions wins.
* </p>
*
* @param found_blocks
* The List of semantically equal blocks.
* @return Returns the best choice FoundBlock.
+ * @throws SignatureException
+ */
+ public static FoundBlock chooseMostPossibleBlock(List found_blocks) throws SignatureException
+ {
+ // First stage: keep only the blocks with the highest number of found keys.
+ final List most_keys = filterVerticallyLargestBlocks(found_blocks);
+ debugLogBlocks("vertically largest blocks:", most_keys);
+
+ // Second stage: of those, keep only the blocks with the longest captions.
+ final List longest_captions = filterHorizontallyLargestBlocks(most_keys);
+ debugLogBlocks("horizontally largest blocks:", longest_captions);
+
+ // All remaining blocks have equal caption lengths; take the first one.
+ final FoundBlock chosen = (FoundBlock) longest_captions.get(0);
+
+ logger.debug("Chose largest block: " + chosen);
+ return chosen;
+ }
+
+ /**
+ * Writes a headline followed by one numbered line per block to the debug
+ * log. Does nothing if debug logging is disabled.
+ *
+ * @param headline
+ * The line to be logged before the blocks.
+ * @param blocks
+ * The List of FoundBlock objects to be logged.
+ */
+ private static void debugLogBlocks(String headline, List blocks)
+ {
+ if (!logger.isDebugEnabled())
+ {
+ return;
+ }
+
+ logger.debug(headline);
+ for (int idx = 0; idx < blocks.size(); idx++)
+ {
+ FoundBlock block = (FoundBlock) blocks.get(idx);
+
+ logger.debug(" #" + idx + ": " + block);
+ }
+ }
+
+ /**
+ * Filters out all blocks but the vertically largest ones.
+ *
+ * <p>
+ * A vertically largest block has the most found keys.
+ * </p>
+ *
+ * @param found_blocks
+ * The List of FoundBlock objects to be filtered.
+ * @return Returns the List of the vertically largest FoundBlock objects.
*/
- public static FoundBlock chooseMostPossibleSemanticallyEqualBlock(
- List found_blocks)
+ public static List filterVerticallyLargestBlocks(List found_blocks)
{
- int largest_block_index = 0;
+ // Determine the key count of the largest block(s). The scan has to start
+ // at index 0: skipping the first block would let smaller blocks pass the
+ // filter whenever the first block is the one with the most keys.
+ int largest_size = Integer.MIN_VALUE;
+ for (int i = 0; i < found_blocks.size(); i++)
+ {
+ FoundBlock fb = (FoundBlock) found_blocks.get(i);
+ largest_size = Math.max(largest_size, fb.found_keys.size());
+ }
+
+ // Keep all blocks that reach largest_size, preserving their order.
+ // An empty input yields an empty result.
+ List largest_blocks = new ArrayList();
+ for (int i = 0; i < found_blocks.size(); i++)
+ {
+ FoundBlock fb = (FoundBlock) found_blocks.get(i);
+ if (fb.found_keys.size() >= largest_size)
+ {
+ largest_blocks.add(fb);
+ }
+ }
+
+ return largest_blocks;
+ }
+
+ /**
+ * Filters out all blocks but the horizontally largest ones.
+ *
+ * <p>
+ * A horizontally largest block is one whose captions are, key by key, at
+ * least as long as the corresponding captions of every other block.
+ * </p>
+ *
+ * @param found_blocks
+ * The List of FoundBlock objects to be filtered. All of these
+ * FoundBlock objects must have the same number of found keys.
+ * @return Returns the List of the horizontally largest FoundBlock objects.
+ * @throws SignatureException
+ */
+ public static List filterHorizontallyLargestBlocks(List found_blocks) throws SignatureException
+ {
+ // Seed the result with the first block; it is the current best until a
+ // horizontally larger block is seen.
+ // NOTE(review): get(0) fails on an empty list - presumably callers never
+ // pass one; confirm.
+ List horizontally_largest = new ArrayList();
FoundBlock largest_block = (FoundBlock) found_blocks.get(0);
+ horizontally_largest.add(largest_block);
for (int i = 1; i < found_blocks.size(); i++)
{
- FoundBlock current_block = (FoundBlock) found_blocks.get(i);
+ FoundBlock fb = (FoundBlock) found_blocks.get(i);
+
+ // Same caption lengths as the current best: keep both.
+ if (isHorizontallyEqual(fb, largest_block))
+ {
+ horizontally_largest.add(fb);
+ continue;
+ }
- if (current_block.found_keys.size() > largest_block.found_keys.size())
+ // fb beats the current best: discard everything collected so far and
+ // restart the result list from fb.
+ if (isHorizontallyLarger(fb, largest_block))
+ {
+ horizontally_largest = new ArrayList();
+ largest_block = fb;
+ horizontally_largest.add(largest_block);
+ }
+ else
{
- largest_block = current_block;
- largest_block_index = i;
+ // fb is neither equal nor larger. If the current best is not larger
+ // than fb either, each of the two has at least one caption that is
+ // longer than the other's, so they cannot be ordered.
+ if (!isHorizontallyLarger(largest_block, fb))
+ {
+ // The block is neither equal nor larger nor lower.
+ // We cannot exactly determine which one to use.
+ throw new SignatureException(315, "The blocks are neither larger nor lower nor equal. Cannot decide which one to pick. fb = " + fb + ", largest_block = " + largest_block);
+ }
}
+
}
- logger.debug("Chose largest block with index #" + largest_block_index + ": " + largest_block);
- return largest_block;
+ return horizontally_largest;
}
+
+ /**
+ * Tells whether the captions of two blocks have pairwise identical lengths.
+ *
+ * @param fb0
+ * The first FoundBlock.
+ * @param fb1
+ * The second FoundBlock. It must have as many found keys as fb0.
+ * @return Returns true if, for every key index, the captions of both blocks
+ * have the same length, false otherwise.
+ * @throws IllegalArgumentException
+ * Thrown if the blocks differ in their number of found keys.
+ */
+ protected static boolean isHorizontallyEqual(FoundBlock fb0, FoundBlock fb1)
+ {
+ final int key_count = fb0.found_keys.size();
+ if (fb1.found_keys.size() != key_count)
+ {
+ throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size());
+ }
+
+ // Walk both key lists in lockstep; the first length mismatch decides.
+ int idx = 0;
+ while (idx < key_count)
+ {
+ final FoundKey key0 = (FoundKey) fb0.found_keys.get(idx);
+ final FoundKey key1 = (FoundKey) fb1.found_keys.get(idx);
+
+ if (key0.caption.length() != key1.caption.length())
+ {
+ return false;
+ }
+ idx++;
+ }
+
+ return true;
+ }
+
+ /**
+ * Tells whether fb0 is horizontally larger than fb1.
+ *
+ * <p>
+ * fb0 is horizontally larger if none of its captions is shorter than the
+ * corresponding caption of fb1 and at least one of them is longer.
+ * </p>
+ *
+ * @param fb0
+ * The FoundBlock to be tested for being larger.
+ * @param fb1
+ * The FoundBlock to compare with. It must have as many found keys
+ * as fb0.
+ * @return Returns true if fb0 is horizontally larger than fb1, false
+ * otherwise (including the case that all caption lengths are equal).
+ * @throws IllegalArgumentException
+ * Thrown if the blocks differ in their number of found keys.
+ */
+ protected static boolean isHorizontallyLarger(FoundBlock fb0, FoundBlock fb1)
+ {
+ final int key_count = fb0.found_keys.size();
+ if (fb1.found_keys.size() != key_count)
+ {
+ throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size());
+ }
+
+ boolean found_longer_caption = false;
+
+ for (int idx = 0; idx < key_count; idx++)
+ {
+ final FoundKey key0 = (FoundKey) fb0.found_keys.get(idx);
+ final FoundKey key1 = (FoundKey) fb1.found_keys.get(idx);
+
+ // A single shorter caption rules fb0 out immediately.
+ if (key0.caption.length() < key1.caption.length())
+ {
+ return false;
+ }
+
+ // Remember that fb0 is strictly longer somewhere; equal lengths
+ // change nothing.
+ if (key0.caption.length() > key1.caption.length())
+ {
+ found_longer_caption = true;
+ }
+ }
+
+ return found_longer_caption;
+ }
+
}