/**
 * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
 * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
 * joint initiative of the Federal Chancellery Austria and Graz University of
 * Technology.
 *
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * http://www.osor.eu/eupl/
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 *
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 *
 * $Id: AbsoluteTextSignature.java,v 1.1 2006/10/31 08:08:33 wprinz Exp $
 */
package at.knowcenter.wag.egov.egiz.pdf;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

import org.apache.log4j.Logger;

import at.knowcenter.wag.egov.egiz.PdfAS;
import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
import at.knowcenter.wag.egov.egiz.exceptions.SignatureException;
import at.knowcenter.wag.egov.egiz.exceptions.SignatureTypesException;
import at.knowcenter.wag.egov.egiz.framework.FoundBlock;
import at.knowcenter.wag.egov.egiz.framework.FoundKey;
import at.knowcenter.wag.egov.egiz.sig.SignatureObject;
import at.knowcenter.wag.egov.egiz.sig.SignatureTypeDefinition;
import at.knowcenter.wag.egov.egiz.sig.SignatureTypes;

/**
 * Contains methods and helpers that implement the absolute text signature.
 *
 * <p>
 * All methods are static. The central entry points are
 * {@link #extractSignatureHoldersFromText(String)} and
 * {@link #findLatestBlock(String)}; the remaining public methods are the
 * building blocks these two are composed of.
 * </p>
 *
 * @author wprinz
 */
public class AbsoluteTextSignature
{

  /**
   * The logger definition.
   */
  private static final Logger logger = ConfigLogger.getLogger(AbsoluteTextSignature.class);

  /**
   * Returns a List of SignatureTypeDefinitions that can be extracted from text.
   *
   * <p>
   * These SignatureTypeDefinitions are all text extractable, which means that
   * they define all required fields as visible.
   * </p>
   * <p>
   * This method filters out all SignatureTypeDefinitions that are not text
   * extractable.
   * </p>
   *
   * @return Returns a List of SignatureTypeDefinitions that can be extracted
   *         from text.
   * @throws SignatureTypesException
   *           Forwarded unchanged from the signature type lookup.
   */
  public static List getSignatureTypesForTextAnalysis() throws SignatureTypesException
  {
    SignatureTypes sig_types = SignatureTypes.getInstance();
    List allSignatureTypes = sig_types.getSignatureTypeDefinitions();

    List textSignatureTypes = new ArrayList(allSignatureTypes.size());
    Iterator it = allSignatureTypes.iterator();
    while (it.hasNext())
    {
      SignatureTypeDefinition std = (SignatureTypeDefinition) it.next();
      if (!std.isTextExtractable())
      {
        logger.debug("The profile " + std.getType() + " is not text extractable and is thereby not used for text analysis.");
        continue;
      }
      textSignatureTypes.add(std);
    }

    return textSignatureTypes;
  }

  /**
   * Extracts all signature holders from a given text.
   *
   * <p>
   * First the latest signature holder is extracted. Then the latest signature
   * holder in the rest text, which is the second latest one, is extracted. Then
   * the third latest signature holder is extracted and so forth until no more
   * signature holders are found.
   * </p>
   *
   * @param text
   *          The text.
   * @return Returns the List of extracted signature holders ordered by their
   *         date ascendingly (the lowest, earliest date first, the latest,
   *         newest date last). An empty list is returned if no signature
   *         holders were found.
   * @throws SignatureException
   *           Forwarded unchanged from the block extraction.
   * @throws SignatureTypesException
   *           Forwarded unchanged from the block extraction.
   */
  public static List extractSignatureHoldersFromText(String text) throws SignatureException, SignatureTypesException
  {
    List holders = new ArrayList();
    String current_text = text;
    for (;;)
    {
      TextualSignatureHolder signature_holder = extractLatestBlock(current_text);
      if (signature_holder == null)
      {
        break;
      }
      // Each newly found holder is older than the ones found before, so it is
      // prepended to keep the list in ascending date order.
      holders.add(0, signature_holder);
      current_text = signature_holder.getSignedText();
    }
    return holders;
  }

  /**
   * Extracts the latest signature block from the given text and creates a
   * SignatureHolder object that can be verified.
   *
   * @param text
   *          The text.
   * @return Returns the signature holder extracted from the text, or null, if
   *         no latest block was found.
   * @throws SignatureException
   *           Forwarded unchanged from the block search.
   * @throws SignatureTypesException
   *           Forwarded unchanged from the block search.
   */
  public static TextualSignatureHolder extractLatestBlock(String text) throws SignatureException, SignatureTypesException
  {
    FoundBlock latest_block = findLatestBlock(text);
    if (latest_block == null)
    {
      return null;
    }
    String reconstructed_text = cutOutBlock(text, latest_block);
    SignatureObject so = createSignatureObjectFromFoundBlock(text, latest_block);
    return new TextualSignatureHolder(reconstructed_text, so);
  }

  /**
   * Finds the latest signature block for a given text.
   *
   * <p>
   * The latest block is the one with the highest, most recent date. Usually
   * this block will be extracted (cut out) of the text which will result in the
   * originally signed text of this signature to be verified using the cut out
   * data.
   * </p>
   * <p>
   * Candidates whose date cannot be parsed or whose integrity check fails are
   * ignored. If no candidate remains, null is returned.
   * </p>
   *
   * @param text
   *          The text to be analyzed.
   * @return Returns the latest found block or null, if there was none.
   * @throws SignatureException
   *           If the best block cannot be chosen unambiguously.
   * @throws SignatureTypesException
   *           Forwarded unchanged from the signature type lookup.
   */
  public static FoundBlock findLatestBlock(String text) throws SignatureException, SignatureTypesException
  {
    List signatureTypes_ = getSignatureTypesForTextAnalysis();

    List found_potential_candidates = new ArrayList();
    for (int i = 0; i < signatureTypes_.size(); i++)
    {
      SignatureTypeDefinition block_type = (SignatureTypeDefinition) signatureTypes_.get(i);
      List found_candidates_for_type = findPotentialSignaturesForProfile(text, block_type);
      found_potential_candidates.addAll(found_candidates_for_type);
    }

    if (found_potential_candidates.isEmpty())
    {
      logger.debug("no candidates found at all");
      return null;
    }

    List found_candidates = new ArrayList();
    logger.debug("checking block integrity");
    for (int i = 0; i < found_potential_candidates.size(); i++)
    {
      FoundBlock found_block = (FoundBlock) found_potential_candidates.get(i);
      String date_value = getDateValue(text, found_block);
      try
      {
        EGIZDate date = EGIZDate.parseFromString(date_value);

        logger.debug("found_block = " + date + " - " + found_block);

        checkBlockIntegrity(text, found_block);
        found_candidates.add(found_block);
      }
      catch (Exception e)
      {
        // Best effort: a candidate that cannot be validated is dropped, the
        // remaining candidates are still considered.
        logger.debug("Exception while checking the integrity of the found block " + found_block + ". Ignoring this block.", e);
      }
    }

    if (found_candidates.isEmpty())
    {
      // Without this guard the filters below would index into an empty list.
      logger.debug("no candidate passed the integrity check");
      return null;
    }

    sortFoundBlocksByDate(text, found_candidates);
    if (logger.isDebugEnabled())
    {
      logBlocksWithDates("sorted blocks:", text, found_candidates);
    }

    List latest_blocks = filterLastDateEqualBlocks(text, found_candidates);
    if (logger.isDebugEnabled())
    {
      logBlocksWithDates("latest blocks:", text, latest_blocks);
    }

    // Note: an earlier semantic equality check of the latest blocks has been
    // superseded by the choosing algorithm below.
    FoundBlock latest_block = chooseMostPossibleBlock(latest_blocks);

    logger.debug("latest block = " + latest_block);
    return latest_block;
  }

  /**
   * Writes a headline followed by one debug line per block (index, parsed date
   * and the block itself).
   *
   * @param headline
   *          The headline to be logged first.
   * @param text
   *          The text the date values are read from.
   * @param blocks
   *          The List of FoundBlock objects to be logged.
   */
  private static void logBlocksWithDates(String headline, String text, List blocks)
  {
    logger.debug(headline);
    for (int i = 0; i < blocks.size(); i++)
    {
      FoundBlock found_block = (FoundBlock) blocks.get(i);

      String date_value = getDateValue(text, found_block);
      EGIZDate date = EGIZDate.parseFromString(date_value);

      logger.debug(" #" + i + ": " + date + " - " + found_block);
    }
  }

  /**
   * Finds the List of potential blocks within the given text for the given
   * profile.
   *
   * <p>
   * The search starts from every occurrence of the profile's last caption. If
   * the last key is {@link SignatureTypes#SIG_ID}, which may be absent, the
   * occurrences of the previous to last caption are used as additional start
   * points.
   * </p>
   *
   * @param text
   *          The text, in which potential block are to be sought.
   * @param block_type
   *          The profile for which the text is to be sought.
   * @return Returns the List of potential FoundBlocks or an empty List if none
   *         could be found.
   */
  public static List findPotentialSignaturesForProfile(String text,
      SignatureTypeDefinition block_type)
  {
    logger.debug("find potential signatures for " + block_type.getType());
    List found_blocks = new ArrayList();

    final boolean old_style = false;

    Vector keys = block_type.getRevertSortedKeys();
    Vector captions = block_type.getRevertSortedCaptions();
    if (keys.isEmpty() || captions.isEmpty())
    {
      logger.debug("The profile " + block_type.getType() + " defines no keys or captions ==> nothing to search for.");
      return found_blocks;
    }

    String last_key = (String) keys.get(0);
    logger.debug("last_key = " + last_key);
    String last_caption = (String) captions.get(0);
    logger.debug("last_caption = " + last_caption);

    List found_last_captions = findIndicesWithStartingNL(text, last_caption);
    // Parallel to found_last_captions: the caption that was found at each
    // index. Needed because the indices may stem from two different captions.
    List found_last_caption_texts = new ArrayList();
    for (int i = 0; i < found_last_captions.size(); i++)
    {
      found_last_caption_texts.add(last_caption);
    }

    if (last_key.equals(SignatureTypes.SIG_ID) && captions.size() > 1)
    {
      logger.debug("Last key is SIG_ID, so it may not be present. Searching for the previous to last key.");
      String prevlast_caption = (String) captions.get(1);
      List found_prevlast_captions = findIndicesWithStartingNL(text, prevlast_caption);
      for (int i = 0; i < found_prevlast_captions.size(); i++)
      {
        found_last_captions.add(found_prevlast_captions.get(i));
        found_last_caption_texts.add(prevlast_caption);
      }
    }
    if (logger.isDebugEnabled())
    {
      logger.debug("found " + found_last_captions.size() + " last captions.");
      for (int i = 0; i < found_last_captions.size(); i++)
      {
        logger.debug(" found last caption at index " + found_last_captions.get(i));
      }
    }

    for (int lci = 0; lci < found_last_captions.size(); lci++)
    {
      int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue();
      String caption_at_index = (String) found_last_caption_texts.get(lci);
      logger.debug("resolving signature block from last caption index " + last_caption_index);
      int potential_block_end = findEndOfValue(text, last_caption_index);
      if (potential_block_end == (last_caption_index + caption_at_index.length() + 1))
      {
        // The caption is directly followed by a newline, so the value starts
        // on the next line: extend the block by that line.
        potential_block_end = findEndOfValue(text, potential_block_end);
      }

      // FIXME: a last value spanning more than these lines is not captured
      // completely (see findEndOfValue).

      logger.debug("potential_block_end = " + potential_block_end);
      List found_keys = PdfAS.findBlockInText(text.substring(0, potential_block_end), block_type, old_style);

      if (found_keys == null)
      {
        logger.debug("Not all other captions could be found for the last_caption_index " + last_caption_index + " ==> discarding this index.");

        continue;
      }

      // sort found keys ascendingly
      PdfAS.sortFoundKeysAscendingly(found_keys);

      boolean reverse_check_ok = reverseCheckFoundKeys(text, found_keys);
      if (!reverse_check_ok)
      {
        logger.debug("The reverse check ruled this list of found keys out ==> they are discarded.");

        continue;
      }

      logger.debug("The reverse check proved this list of found keys out ==> adding them as potential candidates.");

      FoundBlock found_block = new FoundBlock();
      found_block.std = block_type;
      found_block.found_keys = found_keys;
      found_block.end_index = potential_block_end;
      found_blocks.add(found_block);
    }

    logger.debug("found " + found_blocks.size() + " potential signatures for " + block_type.getType());
    return found_blocks;
  }

  /**
   * Finds all indices of the given subtext (starting at a new line) within a
   * given text.
   *
   * <p>
   * This is usually used to find the indices of the last captions. Only
   * occurrences that are directly preceded by a '\n' are reported; an
   * occurrence at the very beginning of the text is therefore not found.
   * </p>
   *
   * @param text
   *          The text to be searched.
   * @param subtext
   *          The subtext to be sought.
   * @return Returns the List of found indices (as Integer objects), each
   *         pointing at the first character of the subtext.
   */
  public static List findIndicesWithStartingNL(String text, String subtext)
  {
    List found_indices = new ArrayList();

    // A plain substring search is used so that the subtext is matched
    // literally.
    final String needle = "\n" + subtext;

    int search_from_index = 0;
    for (;;)
    {
      int found_index = text.indexOf(needle, search_from_index);
      if (found_index < 0)
      {
        break;
      }
      found_index += 1; // The +1 compensates the "\n"
      found_indices.add(new Integer(found_index));
      search_from_index = found_index + subtext.length();
    }
    return found_indices;
  }

  /**
   * Finds the other keys/captions according to their order starting from the
   * last_caption index upwards.
   *
   * @param text
   *          The text.
   * @param keys
   *          The list of keys; the first entry belongs to the last caption.
   * @param captions
   *          The list of captions, in the same order as the keys.
   * @param last_caption_index
   *          The index of the last caption.
   * @return Returns the List of found keys, if all keys could be found, or null
   *         if not all keys could be found.
   */
  public static List findRestKeys(String text, List keys, List captions,
      int last_caption_index)
  {
    List found_keys = new ArrayList();

    FoundKey last_caption_found_key = new FoundKey((String) keys.get(0), (String) captions.get(0), last_caption_index);
    found_keys.add(last_caption_found_key);

    String rest_text = text.substring(0, last_caption_index);

    for (int i = 1; i < captions.size(); i++)
    {
      String sought_caption = (String) captions.get(i);
      int index = rest_text.lastIndexOf(sought_caption);

      if (index < 0)
      {
        return null;
      }
      FoundKey found_key = new FoundKey((String) keys.get(i), sought_caption, index);
      // The search runs bottom up, so prepending yields top down order.
      found_keys.add(0, found_key);

      rest_text = rest_text.substring(0, index);
    }

    return found_keys;
  }

  /**
   * Performs a reverse (top to bottom) search for the found keys and checks
   * that these indices are the same as those that were found during the regular
   * (bottom up) search.
   *
   * <p>
   * If a reverse check proves that the found keys are not at the same positions
   * as during regular search, this list of found keys should be discarded.
   * </p>
   *
   * @param text
   *          The text.
   * @param found_keys
   *          The found keys to be reversely checked, sorted ascendingly by
   *          their start index.
   * @return Returns true, if all (also the non required) captions could be
   *         found at the same indices as during regular search, false
   *         otherwise. An empty list yields false.
   * @throws RuntimeException
   *           If a caption cannot be found in the text at all.
   */
  public static boolean reverseCheckFoundKeys(String text, List found_keys)
  {
    if (found_keys.isEmpty())
    {
      logger.debug("No found keys to reverse check.");
      return false;
    }

    int search_from_index = ((FoundKey) found_keys.get(0)).start_index;
    for (int i = 0; i < found_keys.size(); i++)
    {
      FoundKey found_key = (FoundKey) found_keys.get(i);

      int reverse_found_index = text.indexOf(found_key.caption, search_from_index);

      if (reverse_found_index < 0)
      {
        throw new RuntimeException("The caption " + found_key.caption + " wasn't found in the text during reverse checking - there is something wrong.");
      }

      if (reverse_found_index != found_key.start_index)
      {
        logger.debug("The index for caption " + found_key.caption + " wasn't proved during reverse checking.");
        return false;
      }
      search_from_index = found_key.start_index + found_key.caption.length();
    }
    return true;
  }

  /**
   * Finds the end of the value in the text.
   *
   * <p>
   * This simply scans for a '\n' from a given start index. The line up to and
   * inclusive the '\n' is considered to be the value.
   * </p>
   * <p>
   * Note that this method does NOT find the accurate value, if the value goes
   * over multiple lines! This may bear a serious problem. Usually this method
   * is only used to finding the end of the last value in a found block, because
   * mid- values are exactly determined by their start index and the start of
   * the next caption. Nevertheless, if the last value spans over multiple
   * lines, this method will not retrieve it completely.
   * </p>
   *
   * @param text
   *          The text.
   * @param start_index
   *          The start index from where the end of the value is sought.
   * @return Returns the end index of the value, which is the index of the first
   *         character not belonging to the value anymore (the character after
   *         the '\n'), or the length of the text if there is no '\n'.
   */
  public static int findEndOfValue(String text, int start_index)
  {
    int newline_index = text.indexOf('\n', start_index);
    if (newline_index < 0)
    {
      return text.length();
    }
    return newline_index + 1;
  }

  /**
   * Checks the integrity of a found block.
   *
   * <p>
   * This is an assertive function: it returns normally if the block is
   * consistent and throws otherwise. Values of inner keys that span multiple
   * lines are only logged; the end of the last value must match the end index
   * of the block.
   * </p>
   *
   * @param text
   *          The text.
   * @param found_block
   *          The found block.
   * @throws RuntimeException
   *           If the end of the last key's value does not match the end index
   *           of the block.
   */
  public static void checkBlockIntegrity(String text, FoundBlock found_block)
  {
    for (int i = 0; i < found_block.found_keys.size() - 1; i++)
    {
      FoundKey this_key = (FoundKey) found_block.found_keys.get(i);
      FoundKey next_key = (FoundKey) found_block.found_keys.get(i + 1);

      int this_end_index = findEndOfValue(text, this_key.start_index);
      if (this_end_index != next_key.start_index)
      {
        // Tolerated: the value simply continues on further lines.
        logger.debug("multi line value: " + this_key);
      }
    }

    FoundKey last_key = (FoundKey) found_block.found_keys.get(found_block.found_keys.size() - 1);
    int end_of_block = findEndOfValue(text, last_key.start_index);
    if (end_of_block == (last_key.start_index + last_key.caption.length() + 1))
    {
      // The caption is directly followed by a newline: the value is on the
      // next line.
      end_of_block = findEndOfValue(text, end_of_block);
    }
    if (end_of_block != found_block.end_index)
    {
      throw new RuntimeException("The end index of last key " + last_key + " doesn't match the end index of the block " + found_block);
    }
  }

  /**
   * Cuts out the given found block from the text.
   *
   * @param text
   *          The text.
   * @param block
   *          The found block.
   * @return Returns the rest text without the block.
   */
  public static String cutOutBlock(String text, FoundBlock block)
  {
    int block_start_index = ((FoundKey) block.found_keys.get(0)).getStartIndex();
    int block_end_index = block.end_index;

    if (block_start_index == 0 && block_end_index == text.length())
    {
      // the block is the whole text - the rest text is empty.
      // This may happen if a (no-text) empty document contains a binary signature.
      // Then the "signed text" of the binary signature is empty.
      return "";
    }

    if (block_end_index == text.length())
    {
      // if the block is at the end of the text, remove the "\n" before the
      // block as well.
      return text.substring(0, block_start_index - 1);
    }

    String pre = text.substring(0, block_start_index);
    String post = text.substring(block_end_index);
    return pre + post;
  }

  /**
   * Returns the value of the date field as String.
   *
   * @param text
   *          The text.
   * @param block
   *          The found block.
   * @return Returns the trimmed date value.
   */
  public static String getDateValue(String text, FoundBlock block)
  {
    FoundKey date_key = block.getDateFoundKey();
    int date_value_start_index = date_key.start_index + date_key.caption.length();
    int date_value_end_index = findEndOfValue(text, date_value_start_index);
    if (date_value_end_index == (date_value_start_index + 1))
    {
      // The caption is directly followed by a newline: the value is on the
      // next line.
      date_value_end_index = findEndOfValue(text, date_value_end_index);
    }
    String date_value = text.substring(date_value_start_index, date_value_end_index).trim();
    logger.debug("DateString="+date_value);
    return date_value;
  }

  /**
   * Creates a SignatureObject from a found block by extracting the
   * corresponding values.
   *
   * @param text
   *          The text.
   * @param found_block
   *          The found block.
   * @return Returns the created SignatureObject.
   * @throws SignatureTypesException
   *           Forwarded unchanged from the SignatureObject.
   * @throws SignatureException
   *           Forwarded unchanged from the SignatureObject.
   */
  public static SignatureObject createSignatureObjectFromFoundBlock(
      String text, FoundBlock found_block) throws SignatureTypesException, SignatureException
  {
    SignatureObject signatureObject = new SignatureObject();

    signatureObject.setSigType(found_block.std.getType());
    signatureObject.initByType();

    // Walk the keys bottom up: each value reaches from the end of its caption
    // to the start of the following caption (or the end of the block).
    int end_index = found_block.end_index;
    for (int i = found_block.found_keys.size() - 1; i >= 0; i--)
    {
      FoundKey cur_key = (FoundKey) found_block.found_keys.get(i);
      int start_index = cur_key.getStartIndex() + cur_key.caption.length();

      String value = text.substring(start_index, end_index);

      signatureObject.setSigValueCaption(cur_key.getKey(), value, cur_key.caption);

      end_index = cur_key.getStartIndex();
    }

    return signatureObject;
  }

  /**
   * Parses the EGIZDate from a found block and the given text.
   *
   * @param text
   *          The text.
   * @param found_block
   *          The found block.
   * @return Returns the parsed EGIZDate.
   */
  public static EGIZDate getDateFromFoundBlock(String text,
      FoundBlock found_block)
  {
    String date_value = getDateValue(text, found_block);
    return EGIZDate.parseFromString(date_value);
  }

  /**
   * Sorts the List of found blocks by date (earliest first, latest last).
   *
   * @param text
   *          The text.
   * @param found_blocks
   *          The List of found blocks; it is sorted in place.
   */
  public static void sortFoundBlocksByDate(final String text, List found_blocks)
  {
    Collections.sort(found_blocks, new Comparator()
    {
      public int compare(Object arg0, Object arg1)
      {
        FoundBlock fb0 = (FoundBlock) arg0;
        FoundBlock fb1 = (FoundBlock) arg1;

        EGIZDate date0 = getDateFromFoundBlock(text, fb0);
        EGIZDate date1 = getDateFromFoundBlock(text, fb1);
        return date0.compareTo(date1);
      }
    });
  }

  /**
   * Given a List of FoundBlock objects, this method returns the last blocks of
   * this list that have the same date.
   *
   * <p>
   * Usually a date sorted list (earliest first, latest last) will be provided
   * to this method. Then the last date equal blocks are returned, which are the
   * last blocks.
   * </p>
   *
   * @param text
   *          The text to retrieve the values of the fields from.
   * @param found_blocks
   *          The List of FoundBlock objects.
   * @return Returns the List of the last date equal blocks in their original
   *         order. An empty List is returned for an empty input.
   */
  public static List filterLastDateEqualBlocks(String text, List found_blocks)
  {
    List latest_blocks = new ArrayList();
    if (found_blocks.isEmpty())
    {
      return latest_blocks;
    }

    latest_blocks.add(found_blocks.get(found_blocks.size() - 1));
    for (int i = found_blocks.size() - 2; i >= 0; i--)
    {
      FoundBlock this_block = (FoundBlock) found_blocks.get(i);
      FoundBlock succ_block = (FoundBlock) found_blocks.get(i + 1);

      EGIZDate this_date = getDateFromFoundBlock(text, this_block);
      EGIZDate succ_date = getDateFromFoundBlock(text, succ_block);

      if (!this_date.equals(succ_date))
      {
        break;
      }
      latest_blocks.add(0, this_block);
    }

    return latest_blocks;
  }

  /**
   * Chooses the most possible (best choice) block of the list of blocks.
   *
   * <p>
   * The strategy to find the most possible block is to choose the very one
   * block with the maximum number of captions. This block has extracted most
   * information from the text.
   * </p>
   * <p>
   * If there are still multiple blocks with the same number of captions, the
   * blocks are compared caption-wise. The block with all captions being longer
   * or equal to all other blocks' captions wins.
   * </p>
   *
   * @param found_blocks
   *          The List of semantically equal blocks.
   * @return Returns the best choice FoundBlock, or null if the given List is
   *         empty.
   * @throws SignatureException
   *           If the blocks cannot be ordered caption-wise, so that no best
   *           choice can be determined.
   */
  public static FoundBlock chooseMostPossibleBlock(List found_blocks) throws SignatureException
  {
    if (found_blocks.isEmpty())
    {
      logger.debug("No blocks to choose from.");
      return null;
    }

    List vertically_largest = filterVerticallyLargestBlocks(found_blocks);
    if (logger.isDebugEnabled())
    {
      logger.debug("vertically largest blocks:");
      for (int i = 0; i < vertically_largest.size(); i++)
      {
        FoundBlock found_block = (FoundBlock) vertically_largest.get(i);
        logger.debug(" #" + i + ": " + found_block);
      }
    }

    List horizontally_largest = filterHorizontallyLargestBlocks(vertically_largest);
    if (logger.isDebugEnabled())
    {
      logger.debug("horizontally largest blocks:");
      for (int i = 0; i < horizontally_largest.size(); i++)
      {
        FoundBlock found_block = (FoundBlock) horizontally_largest.get(i);
        logger.debug(" #" + i + ": " + found_block);
      }
    }

    FoundBlock largest_block = (FoundBlock) horizontally_largest.get(0);
    logger.debug("Chose largest block: " + largest_block);
    return largest_block;
  }

  /**
   * Filters out all blocks but the vertically largest ones.
   *
   * <p>
   * A vertically largest block has the most found keys.
   * </p>
   *
   * @param found_blocks
   *          The List of FoundBlock objects to be filtered.
   * @return Returns the List of the vertically largest FoundBlock objects.
   */
  public static List filterVerticallyLargestBlocks(List found_blocks)
  {
    // determine the size of the largest block(s)
    int largest_size = Integer.MIN_VALUE;
    for (int i = 0; i < found_blocks.size(); i++)
    {
      FoundBlock fb = (FoundBlock) found_blocks.get(i);
      final int current_size = fb.found_keys.size();
      if (current_size > largest_size)
      {
        largest_size = current_size;
      }
    }

    // keep all blocks that have the largest_size
    List largest_blocks = new ArrayList();
    for (int i = 0; i < found_blocks.size(); i++)
    {
      FoundBlock fb = (FoundBlock) found_blocks.get(i);
      if (fb.found_keys.size() < largest_size)
      {
        continue;
      }
      largest_blocks.add(fb);
    }
    return largest_blocks;
  }

  /**
   * Filters out all blocks but the horizontally largest ones.
   *
   * <p>
   * A block is horizontally larger than another one if none of its captions is
   * shorter than the corresponding caption of the other block and at least one
   * of them is longer.
   * </p>
   *
   * @param found_blocks
   *          The List of FoundBlock objects to be filtered. All of these
   *          FoundBlock objects must have the same number of found keys.
   * @return Returns the List of the horizontally largest FoundBlock objects. An
   *         empty List is returned for an empty input.
   * @throws SignatureException
   *           If two blocks are neither equal nor is one of them larger than
   *           the other, so that no decision can be made.
   */
  public static List filterHorizontallyLargestBlocks(List found_blocks) throws SignatureException
  {
    List horizontally_largest = new ArrayList();
    if (found_blocks.isEmpty())
    {
      return horizontally_largest;
    }

    FoundBlock largest_block = (FoundBlock) found_blocks.get(0);
    horizontally_largest.add(largest_block);

    for (int i = 1; i < found_blocks.size(); i++)
    {
      FoundBlock fb = (FoundBlock) found_blocks.get(i);

      if (isHorizontallyEqual(fb, largest_block))
      {
        horizontally_largest.add(fb);
        continue;
      }

      if (isHorizontallyLarger(fb, largest_block))
      {
        // A new maximum: everything collected so far is smaller.
        horizontally_largest = new ArrayList();
        largest_block = fb;
        horizontally_largest.add(largest_block);
      }
      else
      {
        if (!isHorizontallyLarger(largest_block, fb))
        {
          // The block is neither equal nor larger nor lower.
          // We cannot exactly determine which one to use.
          throw new SignatureException(315, "The blocks are neither larger nor lower nor equal. Cannot decide which one to pick. fb = " + fb + ", largest_block = " + largest_block);
        }
      }
    }

    return horizontally_largest;
  }

  /**
   * Tells if two blocks are horizontally equal, i.e. all corresponding captions
   * have the same length.
   *
   * @param fb0
   *          The first block.
   * @param fb1
   *          The second block.
   * @return Returns true if every caption of fb0 has the same length as the
   *         caption of fb1 at the same position, false otherwise.
   * @throws IllegalArgumentException
   *           If the blocks do not have the same number of found keys.
   */
  protected static boolean isHorizontallyEqual(FoundBlock fb0, FoundBlock fb1)
  {
    final int num_keys = fb0.found_keys.size();
    if (num_keys != fb1.found_keys.size())
    {
      throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size());
    }

    for (int i = 0; i < num_keys; i++)
    {
      FoundKey fk0 = (FoundKey) fb0.found_keys.get(i);
      FoundKey fk1 = (FoundKey) fb1.found_keys.get(i);

      if (fk0.caption.length() != fk1.caption.length())
      {
        return false;
      }
    }

    return true;
  }

  /**
   * Tells if the first block is horizontally larger than the second one.
   *
   * @param fb0
   *          The first block.
   * @param fb1
   *          The second block.
   * @return Returns true if no caption of fb0 is shorter than the caption of
   *         fb1 at the same position and at least one caption of fb0 is longer,
   *         false otherwise.
   * @throws IllegalArgumentException
   *           If the blocks do not have the same number of found keys.
   */
  protected static boolean isHorizontallyLarger(FoundBlock fb0, FoundBlock fb1)
  {
    final int num_keys = fb0.found_keys.size();
    if (num_keys != fb1.found_keys.size())
    {
      throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + fb0.found_keys.size() + " vs. " + fb1.found_keys.size());
    }

    boolean larger = false;

    for (int i = 0; i < num_keys; i++)
    {
      FoundKey fk0 = (FoundKey) fb0.found_keys.get(i);
      FoundKey fk1 = (FoundKey) fb1.found_keys.get(i);

      if (fk0.caption.length() == fk1.caption.length())
      {
        continue;
      }

      if (fk0.caption.length() > fk1.caption.length())
      {
        larger = true;
        continue;
      }

      // fk0's caption is shorter, so fb0 cannot be larger.
      return false;
    }

    return larger;
  }

}