/**
 * Returns the signature type definitions that can be used for text analysis.
 * <p>
 * Only definitions whose {@code isTextExtractable()} returns true are kept.
 * Every other definition is reported at debug level and filtered out.
 * </p>
 *
 * @return Returns a new List holding only the text extractable
 *         SignatureTypeDefinitions.
 * @throws SignatureTypesException
 *           Propagated unchanged from the calls made by this method.
 */
public static List getSignatureTypesForTextAnalysis() throws SignatureTypesException
{
    SignatureTypes sig_types = SignatureTypes.getInstance();
    List allSignatureTypes = sig_types.getSignatureTypeDefinitions();
    List textSignatureTypes = new ArrayList(allSignatureTypes.size());

    Iterator it = allSignatureTypes.iterator();
    while (it.hasNext())
    {
        SignatureTypeDefinition std = (SignatureTypeDefinition) it.next();
        if (!std.isTextExtractable())
        {
            logger.debug("The profile " + std.getType() + " is not text extractable and is thereby not used for text analysis.");
            continue;
        }
        textSignatureTypes.add(std);
    }

    // Return the filtered list. Returning allSignatureTypes here would hand
    // the non text extractable profiles to the text analysis as well.
    return textSignatureTypes;
}

/**
 * Extracts all signature holders from a given text.
 * <p>
 * First the latest signature holder is extracted. Then the latest signature
 * holder in the rest text, which is the second latest one, is extracted. Then
 * the third latest signature holder is extracted and so forth until no more
 * signature holders are found.
 * </p>
* * @param text * The text. * @return Returns the List of extracted signature holders ordered by their * date ascendingly (the lowest, earliest date first, the latest, * newest date last). An empty list is returned if no signature * holders were found. * @throws SignatureException * F.e. * @throws SignatureTypesException * F.e. */ public static List extractSignatureHoldersFromText(String text) throws SignatureException, SignatureTypesException { List holders = new ArrayList(); String current_text = text; for (;;) { TextualSignatureHolder signature_holder = extractLatestBlock(current_text); if (signature_holder == null) { break; } holders.add(0, signature_holder); current_text = signature_holder.getSignedText(); } return holders; } /** * Extracts the latest signature block from the given text and creates a * SignatureHolder object that can be verified. * * @param text * The text. * @return Returns the SignatureObject extracted from the text, or null, if no * latest block was found. * @throws SignatureException * F.e. * @throws SignatureTypesException * F.e. */ public static TextualSignatureHolder extractLatestBlock(String text) throws SignatureException, SignatureTypesException { FoundBlock latest_block = findLatestBlock(text); if (latest_block == null) { return null; } String reconstructed_text = cutOutBlock(text, latest_block); SignatureObject so = createSignatureObjectFromFoundBlock(text, latest_block); TextualSignatureHolder tsh = new TextualSignatureHolder(reconstructed_text, so); return tsh; } /** * Finds the latest signature block for a given text. * ** The latest block is the one with the highest, most recent date. Usually * this block will be extracted (cut out) of the text which will result in the * originally signed text of this signature to be verified using the cut out * data. *
* * @param text * The text to be analyzed. * @return Returns the latest found block or null, if there was none. * @throws SignatureException * F.e. * @throws SignatureTypesException * F.e. */ public static FoundBlock findLatestBlock(String text) throws SignatureException, SignatureTypesException { List signatureTypes_ = getSignatureTypesForTextAnalysis(); List found_potential_candidates = new ArrayList(); for (int i = 0; i < signatureTypes_.size(); i++) { SignatureTypeDefinition block_type = (SignatureTypeDefinition) signatureTypes_.get(i); List found_candidates_for_type = findPotentialSignaturesForProfile(text, block_type); found_potential_candidates.addAll(found_candidates_for_type); } if (found_potential_candidates.isEmpty()) { logger.debug("no candidates found at all"); return null; } List found_candidates = new ArrayList(); logger.debug("checking block integrity"); for (int i = 0; i < found_potential_candidates.size(); i++) { FoundBlock found_block = (FoundBlock) found_potential_candidates.get(i); String date_value = getDateValue(text, found_block); try { EGIZDate date = EGIZDate.parseFromString(date_value); logger.debug("found_block = " + date + " - " + found_block); checkBlockIntegrity(text, found_block); found_candidates.add(found_block); } catch (Exception e) { logger.debug("Exception while checking the integrity of the found block " + found_block + ". 
Ignoring this block.", e); } } sortFoundBlocksByDate(text, found_candidates); if (logger.isDebugEnabled()) { logger.debug("sorted blocks:"); for (int i = 0; i < found_candidates.size(); i++) { FoundBlock found_block = (FoundBlock) found_candidates.get(i); String date_value = getDateValue(text, found_block); EGIZDate date = EGIZDate.parseFromString(date_value); logger.debug(" #" + i + ": " + date + " - " + found_block); } } List latest_blocks = filterLastDateEqualBlocks(text, found_candidates); if (logger.isDebugEnabled()) { logger.debug("latest blocks:"); for (int i = 0; i < latest_blocks.size(); i++) { FoundBlock found_block = (FoundBlock) latest_blocks.get(i); String date_value = getDateValue(text, found_block); EGIZDate date = EGIZDate.parseFromString(date_value); logger.debug(" #" + i + ": " + date + " - " + found_block); } } // The semantic equality check has been outdated by the // advanced choosing algorithm. // boolean semantic_equality = // PdfAS.checkForSemanticEquality(latest_blocks); // logger.debug("semantic_equality = " + semantic_equality); // if (!semantic_equality) // { // throw new SignatureException(314, "The latest blocks weren't semantically // equal."); // } FoundBlock latest_block = chooseMostPossibleBlock(latest_blocks); logger.debug("latest block = " + latest_block); return latest_block; } /** * Finds the List of potential blocks within the given text for the given * profile. * * @param text * The text, in which potential block are to be sought. * @param block_type * The profile for which the text is to be sought. * @return Returns the List of potential FoundBlocks or an empty List if none * could be found. 
*/ public static List findPotentialSignaturesForProfile(String text, SignatureTypeDefinition block_type) { logger.debug("find potential signatures for " + block_type.getType()); List found_blocks = new ArrayList(); final boolean old_style = false; Vector keys = block_type.getRevertSortedKeys(); Vector captions = block_type.getRevertSortedCaptions(); String last_key = (String) keys.get(0); logger.debug("last_key = " + last_key); String last_caption = (String) captions.get(0); logger.debug("last_caption = " + last_caption); String current_last_caption= last_caption; List found_last_captions = findIndicesWithStartingNL(text, last_caption); if (last_key.equals(SignatureTypes.SIG_ID)) { logger.debug("Last key is SIG_ID, so it may not be present. Searching for the previous to last key."); String prevlast_key = (String) keys.get(1); String prevlast_caption = (String) captions.get(1); current_last_caption = prevlast_caption; List found_prevlast_captions = findIndicesWithStartingNL(text, prevlast_caption); if (!found_prevlast_captions.isEmpty()) { found_last_captions.addAll(found_prevlast_captions); } } if (logger.isDebugEnabled()) { logger.debug("found " + found_last_captions.size() + " last captions."); for (int i = 0; i < found_last_captions.size(); i++) { logger.debug(" found last caption at index " + found_last_captions.get(i)); } } for (int lci = 0; lci < found_last_captions.size(); lci++) { int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue(); logger.debug("resolving signature block from last caption index " + last_caption_index); int potential_block_end = findEndOfValue(text, last_caption_index); if (potential_block_end == (last_caption_index + current_last_caption.length()+1)) { potential_block_end = findEndOfValue(text, potential_block_end); } logger.debug("potential_block_end = " + potential_block_end); List found_keys = PdfAS.findBlockInText(text.substring(0, potential_block_end), block_type, old_style); // findRestKeys(text, // keys, // 
captions, // last_caption_index); if (found_keys == null) { logger.debug("Not all other captions could be found for the last_caption_index " + last_caption_index + " ==> discarding this index."); continue; } // sort found keys ascendingly PdfAS.sortFoundKeysAscendingly(found_keys); boolean reverse_check_ok = reverseCheckFoundKeys(text, found_keys); if (!reverse_check_ok) { logger.debug("The reverse check ruled this list of found keys out ==> they are discarded."); continue; } logger.debug("The reverse check proved this list of found keys out ==> adding them as potential candidates."); FoundBlock found_block = new FoundBlock(); found_block.std = block_type; found_block.found_keys = found_keys; found_block.end_index = potential_block_end;//findEndOfValue(text, last_caption_index); found_blocks.add(found_block); } logger.debug("found " + found_blocks.size() + " potential signatures for " + block_type.getType()); return found_blocks; } /** * Finds all indices of the given subtext (starting at a new line) within a * given text. * ** This is usually used to find the indices of the last captions. *
* * @param text * The text to be searched. * @param subtext * The subtext to be sought. * @return Returns the List of found indices. */ public static List findIndicesWithStartingNL(String text, String subtext) { List found_indices = new ArrayList(); // // for some reason "^" + subtext doesn't work as a pattern // String pattern = "\n" + subtext; // Pattern p = Pattern.compile(pattern); // Matcher m = p.matcher(text); // // while (m.find()) // { // int found_index = m.start() + 1; // +1 removes the newline // found_indices.add(new Integer(found_index)); // } int search_from_index = 0; for (;;) { int found_index = text.indexOf("\n" + subtext, search_from_index); if (found_index < 0) { break; } found_index += 1; // The +1 compensates the "\n" found_indices.add(new Integer(found_index)); search_from_index = found_index + subtext.length(); } return found_indices; } /** * Finds the other keys/captions according to their order starting from the * last_caption index upwards. * * @param text * The text. * @param keys * The list of keys. * @param captions * The list of captions. * @param last_caption_index * The index of the last caption. * @return Returns the List of found keys, if all keys could be found, or null * if not all keys could be found. 
*/
public static List findRestKeys(String text, List keys, List captions, int last_caption_index)
{
    List result = new ArrayList();

    // The last caption is already located; it is entry 0 of keys and captions.
    result.add(new FoundKey((String) keys.get(0), (String) captions.get(0), last_caption_index));

    // Search backwards: each caption must lie before the one found previously.
    String search_area = text.substring(0, last_caption_index);
    int pos = 1;
    while (pos < captions.size())
    {
        String caption = (String) captions.get(pos);
        int caption_index = search_area.lastIndexOf(caption);
        if (caption_index < 0)
        {
            return null;
        }

        // Prepend, so that the result is ordered top to bottom.
        result.add(0, new FoundKey((String) keys.get(pos), (String) captions.get(pos), caption_index));

        search_area = search_area.substring(0, caption_index);
        pos++;
    }

    return result;
}

/**
 * Performs a reverse (top to bottom) search for the found keys and checks
 * that these indices are the same as those that were found during the regular
 * (bottom up) search.
 * <p>
 * If a reverse check proves that the found keys are not at the same positions
 * as during regular search, this list of found keys should be discarded.
 * </p>
* * @param text * The text. * @param found_keys * The found keys to be reversely checked. * @return Returns true, if all (also the non required) captions could be * found at the same indices as during regular search, false * otherwise. */ public static boolean reverseCheckFoundKeys(String text, List found_keys) { int search_from_index = ((FoundKey) found_keys.get(0)).start_index; for (int i = 0; i < found_keys.size(); i++) { FoundKey found_key = (FoundKey) found_keys.get(i); int reverse_found_index = text.indexOf(found_key.caption, search_from_index); if (reverse_found_index < 0) { throw new RuntimeException("The caption " + found_key.caption + " wasn't found in the text during reverse checking - there is something wrong."); } if (reverse_found_index != found_key.start_index) { logger.debug("The index for caption " + found_key.caption + " wasn't proved during reverse checking."); return false; } search_from_index = found_key.start_index + found_key.caption.length(); } return true; } /** * Finds the end of the value in the text. * ** This simply scans for a '\n' from a given start index. The line up to and * inclusive the '\n' is considered to be the value. *
* <p>
 * Note that this method does NOT find the accurate value if the value spans
 * multiple lines! This may be a serious problem. Usually this method is only
 * used to find the end of the last value in a found block, because mid-values
 * are exactly determined by their start index and the start of the next
 * caption. Nevertheless, if the last value spans multiple lines, this method
 * will not retrieve it completely.
 * </p>
 *
*
 * @param text
 *          The text.
 * @param start_index
 *          The start index from where the end of the value is sought.
 * @return Returns the end index of the value, which is the index of the first
 *         character not belonging to the value anymore (the character after
 *         the '\n'), or the length of the text if no '\n' follows.
 */
public static int findEndOfValue(String text, int start_index)
{
    final int newline_pos = text.indexOf('\n', start_index);

    // No further newline: the value runs up to the end of the text.
    return (newline_pos >= 0) ? (newline_pos + 1) : text.length();
}

/**
 * Checks the integrity of a found block.
 * <p>
 * This is an assertive function.
 * </p>
* * @param text * The text. * @param found_block * The found block. */ public static void checkBlockIntegrity(String text, FoundBlock found_block) { for (int i = 0; i < found_block.found_keys.size() - 1; i++) { FoundKey this_key = (FoundKey) found_block.found_keys.get(i); FoundKey next_key = (FoundKey) found_block.found_keys.get(i + 1); int this_end_index = findEndOfValue(text, this_key.start_index); if (this_end_index != next_key.start_index) { logger.debug("multi line value: " + this_key); // throw new RuntimeException("The end index of found key " + this_key + // " doesn't match the start index of found key " + next_key); } } FoundKey last_key = (FoundKey) found_block.found_keys.get(found_block.found_keys.size() - 1); int end_of_block = findEndOfValue(text, last_key.start_index); if (end_of_block == (last_key.start_index+last_key.caption.length()+1)) { end_of_block = findEndOfValue(text,end_of_block); } if (end_of_block != found_block.end_index) { throw new RuntimeException("The end index of last key " + last_key + " doesn't match the end index of the block " + found_block); } } /** * Cuts out the given found block from the text. * * @param text * The text. * @param block * The found block. * @return Returns the rest text without the block. */ public static String cutOutBlock(String text, FoundBlock block) { int block_start_index = ((FoundKey) block.found_keys.get(0)).getStartIndex(); int block_end_index = block.end_index; if (block_start_index == 0 && block_end_index == text.length()) { // the block is the whole text - the rest text is empty. // This may happen if a (no-text) empty document contains a binary signature. // Then the "signed text" of the binary signature is empty. return ""; } if (block_end_index == text.length()) { // if the block is at the end of the text, remove the "\n" before the // block as well. 
String pre = text.substring(0, block_start_index - 1); return pre; } String pre = text.substring(0, block_start_index); String post = text.substring(block_end_index); String rest_text = pre + post; return rest_text; } /** * Returns the value of the date field as String. * * @param text * The text. * @param block * The found block. * @return Returns the date value. */ public static String getDateValue(String text, FoundBlock block) { FoundKey date_key = block.getDateFoundKey(); int date_value_start_index = date_key.start_index + date_key.caption.length(); int date_value_end_index = findEndOfValue(text, date_value_start_index); if (date_value_end_index == (date_value_start_index+1)) { date_value_end_index = findEndOfValue(text, date_value_end_index); } String date_value = text.substring(date_value_start_index, date_value_end_index).trim(); logger.debug("DateString="+date_value); return date_value; } /** * Creates a SignatureObject from a found block by extracting the * corresponding values. * * @param text * The text. * @param found_block * The found block. * @return Returns the created SignatureObject. * @throws SignatureTypesException * F.e. * @throws SignatureException * F.e. */ public static SignatureObject createSignatureObjectFromFoundBlock( String text, FoundBlock found_block) throws SignatureTypesException, SignatureException { SignatureObject signatureObject = new SignatureObject(); signatureObject.setSigType(found_block.std.getType()); signatureObject.initByType(); int end_index = found_block.end_index; for (int i = found_block.found_keys.size() - 1; i >= 0; i--) { FoundKey cur_key = (FoundKey) found_block.found_keys.get(i); int start_index = cur_key.getStartIndex() + cur_key.caption.length(); String value = text.substring(start_index, end_index); signatureObject.setSigValueCaption(cur_key.getKey(), value, cur_key.caption); end_index = cur_key.getStartIndex(); } return signatureObject; } /** * Parses the EGIZDate from a found block and the given text. 
* * @param text * The text. * @param found_block * The found block. * @return Returns the parsed EGIZDate. */ public static EGIZDate getDateFromFoundBlock(String text, FoundBlock found_block) { String date_value = getDateValue(text, found_block); EGIZDate date = EGIZDate.parseFromString(date_value); return date; } /** * Sorts the List of found blocks by date. * * @param text * The text. * @param found_blocks * The List of found blocks. */ public static void sortFoundBlocksByDate(final String text, List found_blocks) { Collections.sort(found_blocks, new Comparator() { public int compare(Object arg0, Object arg1) { FoundBlock fb0 = (FoundBlock) arg0; FoundBlock fb1 = (FoundBlock) arg1; EGIZDate date0 = getDateFromFoundBlock(text, fb0); EGIZDate date1 = getDateFromFoundBlock(text, fb1); return date0.compareTo(date1); } }); } /** * Given a List of FoundBlock objects, this method returns the last blocks of * this list that have the same date. * ** Usually a date sorted list (earliest first, latest last) will be provided * to this method. Then the last date equal blocks are returned, which are the * last blocks. *
*
 * @param text
 *          The text to retrieve the values of the fields from.
 * @param found_blocks
 *          The List of FoundBlock objects.
 * @return Returns the List of the last date equal blocks, in the same order
 *         as in the given list.
 */
public static List filterLastDateEqualBlocks(String text, List found_blocks)
{
    List result = new ArrayList();

    // The very last block always belongs to the result.
    int pos = found_blocks.size() - 1;
    result.add(found_blocks.get(pos));

    // Walk upwards as long as a block has the same date as its successor.
    pos--;
    while (pos >= 0)
    {
        FoundBlock current = (FoundBlock) found_blocks.get(pos);
        FoundBlock successor = (FoundBlock) found_blocks.get(pos + 1);

        EGIZDate current_date = getDateFromFoundBlock(text, current);
        EGIZDate successor_date = getDateFromFoundBlock(text, successor);
        if (!current_date.equals(successor_date))
        {
            break;
        }

        result.add(0, current);
        pos--;
    }

    return result;
}

/**
 * Chooses the most possible (best choice) block of the list of blocks.
 * <p>
 * The strategy to find the most possible block is to choose the very one
 * block with the maximum number of captions. This block has extracted most
 * information from the text.
 * </p>
* <p>
 * If there are still multiple blocks with the same number of captions, the
 * blocks are compared caption-wise. The block with all captions being longer
 * or equal to all other blocks' captions wins.
 * </p>
 *
* * @param found_blocks * The List of semantically equal blocks. * @return Returns the best choice FoundBlock. * @throws SignatureException */ public static FoundBlock chooseMostPossibleBlock(List found_blocks) throws SignatureException { // int largest_block_index = 0; // FoundBlock largest_block = (FoundBlock) found_blocks.get(0); // // for (int i = 1; i < found_blocks.size(); i++) // { // FoundBlock current_block = (FoundBlock) found_blocks.get(i); // // if (current_block.found_keys.size() > largest_block.found_keys.size()) // { // largest_block = current_block; // largest_block_index = i; // } // } List vertically_largest = filterVerticallyLargestBlocks(found_blocks); if (logger.isDebugEnabled()) { logger.debug("vertically largest blocks:"); for (int i = 0; i < vertically_largest.size(); i++) { FoundBlock found_block = (FoundBlock) vertically_largest.get(i); logger.debug(" #" + i + ": " + found_block); } } List horizontally_largest = filterHorizontallyLargestBlocks(vertically_largest); if (logger.isDebugEnabled()) { logger.debug("horizontally largest blocks:"); for (int i = 0; i < horizontally_largest.size(); i++) { FoundBlock found_block = (FoundBlock) horizontally_largest.get(i); logger.debug(" #" + i + ": " + found_block); } } FoundBlock largest_block = (FoundBlock) horizontally_largest.get(0); logger.debug("Chose largest block: " + largest_block); return largest_block; } /** * Filters out all blocks but the vertically largest ones. * ** A vertically largest block has the most found keys. *
* * @param found_blocks * The List of FoundBlock objects to be filtered. * @return Returns the List of the vertically largest FoundBlock objects. */ public static List filterVerticallyLargestBlocks(List found_blocks) { // determine the size of the largest block(s) int largest_size = Integer.MIN_VALUE; for (int i = 1; i < found_blocks.size(); i++) { FoundBlock fb = (FoundBlock) found_blocks.get(i); final int current_size = fb.found_keys.size(); if (current_size > largest_size) { largest_size = current_size; } } // keep all blocks that have the largest_size List largest_blocks = new ArrayList(); for (int i = 0; i < found_blocks.size(); i++) { FoundBlock fb = (FoundBlock) found_blocks.get(i); if (fb.found_keys.size() < largest_size) { continue; } largest_blocks.add(fb); } return largest_blocks; } /** * Filters out all blocks but the horizonally largest ones. * ** A vertically largest block has the most found keys. *
* * @param found_blocks * The List of FoundBlock objects to be filtered. All of these * FoundBlock objects must have the same number of found keys. * @return Returns the List of the horizontally largest FoundBlock objects. * @throws SignatureException */ public static List filterHorizontallyLargestBlocks(List found_blocks) throws SignatureException { List horizontally_largest = new ArrayList(); FoundBlock largest_block = (FoundBlock) found_blocks.get(0); horizontally_largest.add(largest_block); for (int i = 1; i < found_blocks.size(); i++) { FoundBlock fb = (FoundBlock) found_blocks.get(i); if (isHorizontallyEqual(fb, largest_block)) { horizontally_largest.add(fb); continue; } if (isHorizontallyLarger(fb, largest_block)) { horizontally_largest = new ArrayList(); largest_block = fb; horizontally_largest.add(largest_block); } else { if (!isHorizontallyLarger(largest_block, fb)) { // The block is neither equal nor larger nor lower. // We cannot exactly determine which one to use. throw new SignatureException(315, "The blocks are neither larger nor lower nor equal. Cannot decide which one to pick. fb = " + fb + ", largest_block = " + largest_block); } } } return horizontally_largest; } protected static boolean isHorizontallyEqual(FoundBlock fb0, FoundBlock fb1) { final int num_keys = fb0.found_keys.size(); if (num_keys != fb1.found_keys.size()) { throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. 
" + fb1.found_keys.size()); } for (int i = 0; i < num_keys; i++) { FoundKey fk0 = (FoundKey) fb0.found_keys.get(i); FoundKey fk1 = (FoundKey) fb1.found_keys.get(i); if (fk0.caption.length() != fk1.caption.length()) { return false; } } return true; } protected static boolean isHorizontallyLarger(FoundBlock fb0, FoundBlock fb1) { final int num_keys = fb0.found_keys.size(); if (num_keys != fb1.found_keys.size()) { throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size()); } boolean larger = false; for (int i = 0; i < num_keys; i++) { FoundKey fk0 = (FoundKey) fb0.found_keys.get(i); FoundKey fk1 = (FoundKey) fb1.found_keys.get(i); if (fk0.caption.length() == fk1.caption.length()) { continue; } if (fk0.caption.length() > fk1.caption.length()) { larger = true; continue; } // if (fk0.caption.length() < fk1.caption.length()) return false; } return larger; } }