aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java102
1 files changed, 86 insertions, 16 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
index 5523041..658f7dd 100644
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
+++ b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
@@ -14,7 +14,8 @@
* DERIVATIVES.
*
* $Id: AbsoluteTextSignature.java,v 1.1 2006/10/31 08:08:33 wprinz Exp $
- */package at.knowcenter.wag.egov.egiz.pdf;
+ */
+package at.knowcenter.wag.egov.egiz.pdf;
import java.util.ArrayList;
import java.util.Collections;
@@ -36,6 +37,7 @@ import at.knowcenter.wag.egov.egiz.sig.SignatureTypes;
/**
* Contains methods and helpers that implement the absolute text signature.
+ *
* @author wprinz
*/
public class AbsoluteTextSignature
@@ -46,7 +48,6 @@ public class AbsoluteTextSignature
*/
private static final Logger logger = ConfigLogger.getLogger(AbsoluteTextSignature.class);
-
/**
* Extracts all signature holders from a given text.
*
@@ -138,14 +139,14 @@ public class AbsoluteTextSignature
*/
public static FoundBlock findLatestBlock(String text) throws SignatureException, SignatureTypesException
{
-// try
-// {
-// writeTextToFile(text, new File("C:\\wprinz\\text.utf8.txt"));
-// }
-// catch (IOException e)
-// {
-// e.printStackTrace();
-// }
+ // try
+ // {
+ // writeTextToFile(text, new File("C:\\wprinz\\text.utf8.txt"));
+ // }
+ // catch (IOException e)
+ // {
+ // e.printStackTrace();
+ // }
SignatureTypes sig_types = SignatureTypes.getInstance();
List signatureTypes_ = sig_types.getSignatureTypeDefinitions();
@@ -212,7 +213,7 @@ public class AbsoluteTextSignature
throw new SignatureException(314, "The latest blocks weren't semantically equal.");
}
- FoundBlock latest_block = (FoundBlock) latest_blocks.get(0);
+ FoundBlock latest_block = chooseMostPossibleSemanticallyEqualBlock(latest_blocks);
logger.debug("latest block = " + latest_block);
return latest_block;
}
@@ -245,7 +246,20 @@ public class AbsoluteTextSignature
String last_caption = (String) captions.get(0);
logger.debug("last_caption = " + last_caption);
- List found_last_captions = findIndices(text, last_caption);
+ List found_last_captions = findIndicesWithStartingNL(text, last_caption);
+ if (last_key.equals(SignatureTypes.SIG_ID))
+ {
+ logger.debug("Last key is SIG_ID, so it may not be present. Searching for the previous to last key.");
+ String prevlast_key = (String) keys.get(1);
+ logger.debug("last_key = " + prevlast_key);
+ String prevlast_caption = (String) captions.get(1);
+ logger.debug("prevlast_caption = " + last_caption);
+ List found_prevlast_captions = findIndicesWithStartingNL(text, prevlast_caption);
+ if (!found_prevlast_captions.isEmpty())
+ {
+ found_last_captions.addAll(found_prevlast_captions);
+ }
+ }
if (logger.isDebugEnabled())
{
logger.debug("found " + found_last_captions.size() + " last captions.");
@@ -255,6 +269,7 @@ public class AbsoluteTextSignature
}
}
+
for (int lci = 0; lci < found_last_captions.size(); lci++)
{
int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue();
@@ -300,7 +315,8 @@ public class AbsoluteTextSignature
}
/**
- * Finds all indices of the given subtext within a given text.
+ * Finds all indices of the given subtext (starting at a new line) within a
+ * given text.
*
* <p>
* This is usually used to find the indices of the last captions.
@@ -312,17 +328,30 @@ public class AbsoluteTextSignature
* The subtext to be sought.
* @return Returns the List of found indices.
*/
- public static List findIndices(String text, String subtext)
+ public static List findIndicesWithStartingNL(String text, String subtext)
{
List found_indices = new ArrayList();
+
+ // // for some reason "^" + subtext doesn't work as a pattern
+ // String pattern = "\n" + subtext;
+ // Pattern p = Pattern.compile(pattern);
+ // Matcher m = p.matcher(text);
+ //
+ // while (m.find())
+ // {
+ // int found_index = m.start() + 1; // +1 removes the newline
+ // found_indices.add(new Integer(found_index));
+ // }
+
int search_from_index = 0;
for (;;)
{
- int found_index = text.indexOf(subtext, search_from_index);
+ int found_index = text.indexOf("\n" + subtext, search_from_index);
if (found_index < 0)
{
break;
}
+ found_index += 1; // The +1 compensates the "\n"
found_indices.add(new Integer(found_index));
search_from_index = found_index + subtext.length();
}
@@ -471,7 +500,7 @@ public class AbsoluteTextSignature
int this_end_index = findEndOfValue(text, this_key.start_index);
if (this_end_index != next_key.start_index)
{
- logger.warn("multi line value: " + this_key);
+ logger.debug("multi line value: " + this_key);
// throw new RuntimeException("The end index of found key " + this_key +
// " doesn't match the start index of found key " + next_key);
}
@@ -653,4 +682,45 @@ public class AbsoluteTextSignature
return latest_blocks;
}
+ /**
+ * Chooses the most probable (best choice) block from a list of semantically
+ * equal blocks.
+ *
+ * <p>
+ * Because blocks are considered semantically equal when their required keys
+ * are semantically equal, such blocks may still differ in the number of
+ * their non-required fields. This may lead to multiple found blocks of the
+ * same size in characters, where some blocks' elements swallow elements
+ * found by other blocks.
+ * </p>
+ * <p>
+ * The strategy is to choose the block whose found_keys list is the largest,
+ * as it has extracted the most information from the text. On a tie the
+ * earliest block in the list is kept (only a strictly larger one replaces it).
+ * </p>
+ *
+ * @param found_blocks
+ * The List of semantically equal blocks; element 0 is read unconditionally.
+ * @return Returns the FoundBlock with the most found keys.
+ */
+ public static FoundBlock chooseMostPossibleSemanticallyEqualBlock(
+ List found_blocks)
+ {
+ int largest_block_index = 0;
+ FoundBlock largest_block = (FoundBlock) found_blocks.get(0);
+
+ for (int i = 1; i < found_blocks.size(); i++)
+ {
+ FoundBlock current_block = (FoundBlock) found_blocks.get(i);
+
+ if (current_block.found_keys.size() > largest_block.found_keys.size())
+ {
+ largest_block = current_block;
+ largest_block_index = i;
+ }
+ }
+
+ logger.debug("Chose largest block with index #" + largest_block_index + ": " + largest_block);
+ return largest_block;
+ }
}