aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
diff options
context:
space:
mode:
authortknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2013-01-09 15:41:29 +0000
committertknall <tknall@7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c>2013-01-09 15:41:29 +0000
commit535a04fa05f739ec16dd81666e3b0f82dfbd442d (patch)
tree0804f301c1a9ceb303a8441b7b29244fc8eb7ff0 /src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
parent1efaf6fd5619dfa95c9d7e8c71eda4c2ffba4998 (diff)
downloadpdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.gz
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.tar.bz2
pdf-as-3-535a04fa05f739ec16dd81666e3b0f82dfbd442d.zip
pdf-as-lib maven project files moved to pdf-as-lib
git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
Diffstat (limited to 'src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java')
-rw-r--r--src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java956
1 files changed, 0 insertions, 956 deletions
diff --git a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java b/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
deleted file mode 100644
index fd59d34..0000000
--- a/src/main/java/at/knowcenter/wag/egov/egiz/pdf/AbsoluteTextSignature.java
+++ /dev/null
@@ -1,956 +0,0 @@
-/**
- * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
- * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
- * joint initiative of the Federal Chancellery Austria and Graz University of
- * Technology.
- *
- * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
- * the European Commission - subsequent versions of the EUPL (the "Licence");
- * You may not use this work except in compliance with the Licence.
- * You may obtain a copy of the Licence at:
- * http://www.osor.eu/eupl/
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the Licence is distributed on an "AS IS" basis,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the Licence for the specific language governing permissions and
- * limitations under the Licence.
- *
- * This product combines work with different licenses. See the "NOTICE" text
- * file for details on the various modules and licenses.
- * The "NOTICE" text file is part of the distribution. Any derivative works
- * that you distribute must include a readable copy of the "NOTICE" text file.
- *
- * $Id: AbsoluteTextSignature.java,v 1.1 2006/10/31 08:08:33 wprinz Exp $
- */
-package at.knowcenter.wag.egov.egiz.pdf;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Vector;
-
-import org.apache.log4j.Logger;
-
-import at.knowcenter.wag.egov.egiz.PdfAS;
-import at.knowcenter.wag.egov.egiz.cfg.ConfigLogger;
-import at.knowcenter.wag.egov.egiz.exceptions.SignatureException;
-import at.knowcenter.wag.egov.egiz.exceptions.SignatureTypesException;
-import at.knowcenter.wag.egov.egiz.framework.FoundBlock;
-import at.knowcenter.wag.egov.egiz.framework.FoundKey;
-import at.knowcenter.wag.egov.egiz.sig.SignatureObject;
-import at.knowcenter.wag.egov.egiz.sig.SignatureTypeDefinition;
-import at.knowcenter.wag.egov.egiz.sig.SignatureTypes;
-
-/**
- * Contains methods and helpers that implement the absolute text signature.
- *
- * @author wprinz
- */
-public class AbsoluteTextSignature
-{
-
- /**
- * Class-wide log4j logger, obtained through ConfigLogger for this class.
- * All debug output of the static helpers below goes through it.
- */
- private static final Logger logger = ConfigLogger.getLogger(AbsoluteTextSignature.class);
-
- /**
-  * Collects the signature profiles that text analysis can work with.
-  *
-  * <p>
-  * Every SignatureTypeDefinition known to SignatureTypes is inspected. Those
-  * for which isTextExtractable() is false are skipped (a debug message is
-  * logged), all others are returned in their original order.
-  * </p>
-  *
-  * @return Returns a new List holding the text extractable
-  *         SignatureTypeDefinitions.
-  * @throws SignatureTypesException
-  *           Propagated from the SignatureTypes lookup; the exact conditions
-  *           are defined there.
-  */
- public static List getSignatureTypesForTextAnalysis() throws SignatureTypesException
- {
-   List known_types = SignatureTypes.getInstance().getSignatureTypeDefinitions();
-   List extractable_types = new ArrayList(known_types.size());
-
-   for (Iterator types = known_types.iterator(); types.hasNext();)
-   {
-     SignatureTypeDefinition candidate = (SignatureTypeDefinition) types.next();
-     if (candidate.isTextExtractable())
-     {
-       extractable_types.add(candidate);
-     }
-     else
-     {
-       logger.debug("The profile " + candidate.getType() + " is not text extractable and is thereby not used for text analysis.");
-     }
-   }
-
-   return extractable_types;
- }
-
- /**
-  * Extracts all signature holders from a given text.
-  *
-  * <p>
-  * The latest signature holder is extracted first. Its signed text (the text
-  * without that signature block) is then searched for the next latest holder,
-  * and so on until extractLatestBlock finds nothing more. Each holder is
-  * inserted at the front of the result, so the holder extracted last ends up
-  * first.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @return Returns the List of extracted signature holders ordered by their
-  *         date ascendingly (earliest first, latest last). An empty list is
-  *         returned if no signature holders were found.
-  * @throws SignatureException
-  *           Propagated from extractLatestBlock.
-  * @throws SignatureTypesException
-  *           Propagated from extractLatestBlock.
-  */
- public static List extractSignatureHoldersFromText(String text) throws SignatureException, SignatureTypesException
- {
-   List result = new ArrayList();
-
-   TextualSignatureHolder holder = extractLatestBlock(text);
-   while (holder != null)
-   {
-     result.add(0, holder);
-     // Continue with the text that this signature has signed.
-     holder = extractLatestBlock(holder.getSignedText());
-   }
-
-   return result;
- }
-
- /**
-  * Extracts the latest signature block from the given text and wraps it,
-  * together with the remaining text, into a TextualSignatureHolder that can
-  * be verified.
-  *
-  * @param text
-  *          The text.
-  * @return Returns the TextualSignatureHolder built from the text with the
-  *         block cut out and the SignatureObject created from the block, or
-  *         null, if no latest block was found.
-  * @throws SignatureException
-  *           Propagated from the block search or the SignatureObject
-  *           creation.
-  * @throws SignatureTypesException
-  *           Propagated from the block search or the SignatureObject
-  *           creation.
-  */
- public static TextualSignatureHolder extractLatestBlock(String text) throws SignatureException, SignatureTypesException
- {
-   FoundBlock newest = findLatestBlock(text);
-   if (newest != null)
-   {
-     String signed_text = cutOutBlock(text, newest);
-     SignatureObject signature = createSignatureObjectFromFoundBlock(text, newest);
-     return new TextualSignatureHolder(signed_text, signature);
-   }
-   return null;
- }
-
- /**
-  * Finds the latest signature block for a given text.
-  *
-  * <p>
-  * Potential blocks are collected for every text extractable profile. Blocks
-  * whose date cannot be parsed or whose integrity check fails are dropped.
-  * The remaining blocks are sorted by date, the blocks sharing the most
-  * recent date are selected and the best of them is picked by
-  * chooseMostPossibleBlock.
-  * </p>
-  * <p>
-  * Usually the returned block is then cut out of the text, which yields the
-  * originally signed text of this signature.
-  * </p>
-  *
-  * @param text
-  *          The text to be analyzed.
-  * @return Returns the latest found block, or null if no potential block was
-  *         found or none of the potential blocks passed the date and
-  *         integrity checks.
-  * @throws SignatureException
-  *           If no best block can be chosen among the latest blocks.
-  * @throws SignatureTypesException
-  *           Propagated from the lookup of the signature profiles.
-  */
- public static FoundBlock findLatestBlock(String text) throws SignatureException, SignatureTypesException
- {
-   List signatureTypes_ = getSignatureTypesForTextAnalysis();
-
-   List found_potential_candidates = new ArrayList();
-   for (int i = 0; i < signatureTypes_.size(); i++)
-   {
-     SignatureTypeDefinition block_type = (SignatureTypeDefinition) signatureTypes_.get(i);
-     found_potential_candidates.addAll(findPotentialSignaturesForProfile(text, block_type));
-   }
-
-   if (found_potential_candidates.isEmpty())
-   {
-     logger.debug("no candidates found at all");
-     return null;
-   }
-
-   List found_candidates = new ArrayList();
-   logger.debug("checking block integrity");
-   for (int i = 0; i < found_potential_candidates.size(); i++)
-   {
-     FoundBlock found_block = (FoundBlock) found_potential_candidates.get(i);
-     String date_value = getDateValue(text, found_block);
-     try
-     {
-       EGIZDate date = EGIZDate.parseFromString(date_value);
-
-       logger.debug("found_block = " + date + " - " + found_block);
-
-       checkBlockIntegrity(text, found_block);
-       found_candidates.add(found_block);
-     }
-     catch (Exception e)
-     {
-       // Deliberate best effort: a block with an unparsable date or a failed
-       // integrity check is not a usable candidate and is skipped.
-       logger.debug("Exception while checking the integrity of the found block " + found_block + ". Ignoring this block.", e);
-     }
-   }
-
-   if (found_candidates.isEmpty())
-   {
-     // Without this guard filterLastDateEqualBlocks would access index -1 of
-     // an empty list and fail with an IndexOutOfBoundsException.
-     logger.debug("no candidate passed the integrity check");
-     return null;
-   }
-
-   sortFoundBlocksByDate(text, found_candidates);
-   logBlocksWithDates("sorted blocks:", text, found_candidates);
-
-   List latest_blocks = filterLastDateEqualBlocks(text, found_candidates);
-   logBlocksWithDates("latest blocks:", text, latest_blocks);
-
-   // The former semantic equality check (PdfAS.checkForSemanticEquality,
-   // error 314) has been superseded by the choosing algorithm below.
-   FoundBlock latest_block = chooseMostPossibleBlock(latest_blocks);
-
-   logger.debug("latest block = " + latest_block);
-   return latest_block;
- }
-
- /**
-  * Writes a headline followed by one debug line per block (index, parsed
-  * date and block). Does nothing unless debug logging is enabled.
-  *
-  * @param headline
-  *          The line logged before the blocks.
-  * @param text
-  *          The text the date values are read from.
-  * @param blocks
-  *          The List of FoundBlock objects to be listed.
-  */
- private static void logBlocksWithDates(String headline, String text, List blocks)
- {
-   if (!logger.isDebugEnabled())
-   {
-     return;
-   }
-   logger.debug(headline);
-   for (int i = 0; i < blocks.size(); i++)
-   {
-     FoundBlock found_block = (FoundBlock) blocks.get(i);
-     EGIZDate date = EGIZDate.parseFromString(getDateValue(text, found_block));
-     logger.debug(" #" + i + ": " + date + " - " + found_block);
-   }
- }
-
- /**
- * Finds the List of potential blocks within the given text for the given
- * profile.
- *
- * <p>
- * Every occurrence of the profile's last caption directly after a newline is
- * treated as the possible end of a block. For each occurrence the text up to
- * the end of that caption's value is handed to PdfAS.findBlockInText. The
- * found keys are then sorted ascendingly and must pass
- * reverseCheckFoundKeys before a FoundBlock is created for them.
- * </p>
- *
- * @param text
- * The text, in which potential block are to be sought.
- * @param block_type
- * The profile for which the text is to be sought.
- * @return Returns the List of potential FoundBlocks or an empty List if none
- * could be found.
- */
- public static List findPotentialSignaturesForProfile(String text,
- SignatureTypeDefinition block_type)
- {
- logger.debug("find potential signatures for " + block_type.getType());
- List found_blocks = new ArrayList();
-
- // Passed unchanged to PdfAS.findBlockInText below.
- final boolean old_style = false;
-
- // Index 0 of both vectors is used as the last key/caption of the profile
- // (presumably because they are sorted in reverse order - see the getters).
- Vector keys = block_type.getRevertSortedKeys();
- Vector captions = block_type.getRevertSortedCaptions();
-
- String last_key = (String) keys.get(0);
- logger.debug("last_key = " + last_key);
- String last_caption = (String) captions.get(0);
- logger.debug("last_caption = " + last_caption);
- String current_last_caption= last_caption;
- List found_last_captions = findIndicesWithStartingNL(text, last_caption);
- if (last_key.equals(SignatureTypes.SIG_ID))
- {
- logger.debug("Last key is SIG_ID, so it may not be present. Searching for the previous to last key.");
- String prevlast_key = (String) keys.get(1);
- String prevlast_caption = (String) captions.get(1);
- // NOTE(review): from here on current_last_caption is the previous to last
- // caption, also for the indices that were found via last_caption above.
- // The length comparison in the loop below then uses this caption's length
- // for both kinds of indices - confirm that this is intended.
- current_last_caption = prevlast_caption;
- List found_prevlast_captions = findIndicesWithStartingNL(text, prevlast_caption);
- if (!found_prevlast_captions.isEmpty())
- {
- found_last_captions.addAll(found_prevlast_captions);
- }
- }
- if (logger.isDebugEnabled())
- {
- logger.debug("found " + found_last_captions.size() + " last captions.");
- for (int i = 0; i < found_last_captions.size(); i++)
- {
- logger.debug(" found last caption at index " + found_last_captions.get(i));
- }
- }
-
- for (int lci = 0; lci < found_last_captions.size(); lci++)
- {
- int last_caption_index = ((Integer) found_last_captions.get(lci)).intValue();
- logger.debug("resolving signature block from last caption index " + last_caption_index);
- int potential_block_end = findEndOfValue(text, last_caption_index);
- // If the newline follows the caption immediately, the caption line holds
- // no value; the end is then moved on to the end of the following line.
- if (potential_block_end == (last_caption_index + current_last_caption.length()+1))
- {
- potential_block_end = findEndOfValue(text, potential_block_end);
- }
-
- // FIXME: complete HOTFIX
- /*
- int extendedValueEnd = potential_block_end;
- String cv;
- do {
- extendedValueEnd = findEndOfValue(text, extendedValueEnd);
- cv = text.substring(last_caption_index + current_last_caption.length()+1, extendedValueEnd);
- } while (extendedValueEnd < text.length());
- */
-
- logger.debug("potential_block_end = " + potential_block_end);
- // Only the text up to the potential block end is searched for the keys.
- List found_keys = PdfAS.findBlockInText(text.substring(0, potential_block_end), block_type, old_style); // findRestKeys(text,
- // keys,
- // captions,
- // last_caption_index);
-
- // A null result is treated as "not all captions found" for this index.
- if (found_keys == null)
- {
- logger.debug("Not all other captions could be found for the last_caption_index " + last_caption_index + " ==> discarding this index.");
-
- continue;
- }
-
- // sort found keys ascendingly
- PdfAS.sortFoundKeysAscendingly(found_keys);
-
- // A top-down search must find every caption at the same index again.
- boolean reverse_check_ok = reverseCheckFoundKeys(text, found_keys);
- if (!reverse_check_ok)
- {
- logger.debug("The reverse check ruled this list of found keys out ==> they are discarded.");
-
- continue;
- }
-
- logger.debug("The reverse check proved this list of found keys out ==> adding them as potential candidates.");
-
- FoundBlock found_block = new FoundBlock();
- found_block.std = block_type;
- found_block.found_keys = found_keys;
- found_block.end_index = potential_block_end;//findEndOfValue(text, last_caption_index);
- found_blocks.add(found_block);
- }
-
- logger.debug("found " + found_blocks.size() + " potential signatures for " + block_type.getType());
- return found_blocks;
- }
-
- /**
-  * Finds all indices at which the given subtext occurs directly after a
-  * newline within a given text.
-  *
-  * <p>
-  * This is usually used to find the indices of the last captions. An
-  * occurrence at the very beginning of the text is not reported, because no
-  * newline precedes it. The search continues behind each reported
-  * occurrence, so reported occurrences never overlap.
-  * </p>
-  *
-  * @param text
-  *          The text to be searched.
-  * @param subtext
-  *          The subtext to be sought.
-  * @return Returns the List of found indices (Integer objects, ascending).
-  *         Each index is the position of the subtext itself, not of the
-  *         preceding newline.
-  */
- public static List findIndicesWithStartingNL(String text, String subtext)
- {
-   final String needle = "\n" + subtext;
-   List indices = new ArrayList();
-
-   int newline_pos = text.indexOf(needle);
-   while (newline_pos >= 0)
-   {
-     // Step over the "\n" to get the start of the subtext itself.
-     int subtext_start = newline_pos + 1;
-     indices.add(new Integer(subtext_start));
-     newline_pos = text.indexOf(needle, subtext_start + subtext.length());
-   }
-
-   return indices;
- }
-
- /**
-  * Finds the other keys/captions according to their order starting from the
-  * last_caption index upwards.
-  *
-  * <p>
-  * The key/caption at position 0 is recorded at last_caption_index. Each
-  * further caption is then sought as the last occurrence within the text
-  * that lies before the previously found caption.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param keys
-  *          The list of keys (position 0 is the key of the last caption).
-  * @param captions
-  *          The list of captions, parallel to keys.
-  * @param last_caption_index
-  *          The index of the last caption.
-  * @return Returns the List of FoundKey objects in the order of their
-  *         position in the text, if all captions could be found, or null if
-  *         at least one caption could not be found.
-  */
- public static List findRestKeys(String text, List keys, List captions,
-     int last_caption_index)
- {
-   List result = new ArrayList();
-   result.add(new FoundKey((String) keys.get(0), (String) captions.get(0), last_caption_index));
-
-   // Only the text in front of the most recently found caption is searched.
-   String remaining = text.substring(0, last_caption_index);
-
-   int position = 1;
-   while (position < captions.size())
-   {
-     String caption = (String) captions.get(position);
-     int caption_index = remaining.lastIndexOf(caption);
-     if (caption_index < 0)
-     {
-       return null;
-     }
-
-     result.add(0, new FoundKey((String) keys.get(position), caption, caption_index));
-     remaining = remaining.substring(0, caption_index);
-     position++;
-   }
-
-   return result;
- }
-
- /**
-  * Performs a reverse (top to bottom) search for the found keys and checks
-  * that each caption is found at exactly the index recorded during the
-  * regular (bottom up) search.
-  *
-  * <p>
-  * The search starts at the start index of the first found key and, after
-  * each confirmed caption, continues directly behind that caption. If a
-  * caption turns up at a different index, this list of found keys should be
-  * discarded.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param found_keys
-  *          The found keys to be reversely checked, in ascending order of
-  *          their start index.
-  * @return Returns true, if all captions could be found at the same indices
-  *         as during regular search, false otherwise.
-  * @throws RuntimeException
-  *           If a caption cannot be found at all during the reverse search.
-  */
- public static boolean reverseCheckFoundKeys(String text, List found_keys)
- {
-   int search_start = ((FoundKey) found_keys.get(0)).start_index;
-
-   for (Iterator keys = found_keys.iterator(); keys.hasNext();)
-   {
-     FoundKey key = (FoundKey) keys.next();
-     int rediscovered_at = text.indexOf(key.caption, search_start);
-
-     if (rediscovered_at < 0)
-     {
-       throw new RuntimeException("The caption " + key.caption + " wasn't found in the text during reverse checking - there is something wrong.");
-     }
-     if (rediscovered_at != key.start_index)
-     {
-       logger.debug("The index for caption " + key.caption + " wasn't proved during reverse checking.");
-       return false;
-     }
-
-     // Continue behind the caption that has just been confirmed.
-     search_start = key.start_index + key.caption.length();
-   }
-
-   return true;
- }
-
- /**
-  * Finds the end of the value in the text.
-  *
-  * <p>
-  * This simply scans for the first '\n' at or after the given start index.
-  * The line up to and including that '\n' is considered to be the value. If
-  * there is no further '\n', the value extends to the end of the text.
-  * </p>
-  * <p>
-  * Note that this method does NOT find the complete value if the value spans
-  * multiple lines. It is mainly used for the last value of a found block,
-  * because the other values are delimited by the start of the next caption.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param start_index
-  *          The start index from where the end of the value is sought.
-  * @return Returns the end index of the value, which is the index of the
-  *         first character after the '\n', or the length of the text if no
-  *         '\n' was found.
-  */
- public static int findEndOfValue(String text, int start_index)
- {
-   final int line_break = text.indexOf('\n', start_index);
-   return (line_break >= 0) ? line_break + 1 : text.length();
- }
-
- /**
-  * Checks the integrity of a found block.
-  *
-  * <p>
-  * This is an assertive function. For all keys but the last one it only
-  * logs when a value does not end where the next caption starts (a multi
-  * line value). For the last key the end of its value is recomputed; if the
-  * newline follows the caption immediately, the following line is included.
-  * The result must equal the end index stored in the block.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param found_block
-  *          The found block.
-  * @throws RuntimeException
-  *           If the recomputed end of the last value differs from the end
-  *           index of the block.
-  */
- public static void checkBlockIntegrity(String text, FoundBlock found_block)
- {
-   final List keys = found_block.found_keys;
-   final int last_position = keys.size() - 1;
-
-   for (int position = 0; position < last_position; position++)
-   {
-     FoundKey current = (FoundKey) keys.get(position);
-     FoundKey following = (FoundKey) keys.get(position + 1);
-
-     if (findEndOfValue(text, current.start_index) != following.start_index)
-     {
-       // Not an error: the value simply spans more than one line.
-       logger.debug("multi line value: " + current);
-     }
-   }
-
-   FoundKey final_key = (FoundKey) keys.get(last_position);
-   int block_end = findEndOfValue(text, final_key.start_index);
-   boolean caption_line_has_no_value = block_end == (final_key.start_index + final_key.caption.length() + 1);
-   if (caption_line_has_no_value)
-   {
-     block_end = findEndOfValue(text, block_end);
-   }
-
-   if (block_end != found_block.end_index)
-   {
-     throw new RuntimeException("The end index of last key " + final_key + " doesn't match the end index of the block " + found_block);
-   }
- }
-
- /**
-  * Cuts out the given found block from the text.
-  *
-  * <p>
-  * The block spans from the start index of its first found key to its end
-  * index. If the block reaches the end of the text, the character directly
-  * in front of the block (the "\n" separating it from the text) is removed
-  * as well. If the block is the whole text, the rest text is empty. This may
-  * happen if an empty (no-text) document contains a binary signature.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param block
-  *          The found block.
-  * @return Returns the rest text without the block.
-  */
- public static String cutOutBlock(String text, FoundBlock block)
- {
-   final int start = ((FoundKey) block.found_keys.get(0)).getStartIndex();
-   final int end = block.end_index;
-
-   if (end == text.length())
-   {
-     if (start == 0)
-     {
-       return "";
-     }
-     // Block at the end of the text: also drop the "\n" in front of it.
-     return text.substring(0, start - 1);
-   }
-
-   String before = text.substring(0, start);
-   String after = text.substring(end);
-   return before + after;
- }
-
- /**
-  * Returns the value of the date field as String.
-  *
-  * <p>
-  * The value starts directly behind the date caption and ends with the
-  * line. If the newline follows the caption immediately, the following line
-  * is included. The result is trimmed and logged at debug level.
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param block
-  *          The found block.
-  * @return Returns the trimmed date value.
-  */
- public static String getDateValue(String text, FoundBlock block)
- {
-   FoundKey date_key = block.getDateFoundKey();
-   final int value_start = date_key.start_index + date_key.caption.length();
-
-   int value_end = findEndOfValue(text, value_start);
-   if (value_end == (value_start + 1))
-   {
-     // Only the newline was consumed - take the next line as the value.
-     value_end = findEndOfValue(text, value_end);
-   }
-
-   String trimmed_value = text.substring(value_start, value_end).trim();
-   logger.debug("DateString="+trimmed_value);
-   return trimmed_value;
- }
-
- /**
-  * Creates a SignatureObject from a found block by extracting the
-  * corresponding values.
-  *
-  * <p>
-  * The found keys are walked from the last to the first. The value of a key
-  * is the text between the end of its caption and the start of the caption
-  * that follows it (or the end index of the block for the last key).
-  * </p>
-  *
-  * @param text
-  *          The text.
-  * @param found_block
-  *          The found block.
-  * @return Returns the created SignatureObject.
-  * @throws SignatureTypesException
-  *           Propagated from the SignatureObject calls.
-  * @throws SignatureException
-  *           Propagated from the SignatureObject calls.
-  */
- public static SignatureObject createSignatureObjectFromFoundBlock(
-     String text, FoundBlock found_block) throws SignatureTypesException, SignatureException
- {
-   SignatureObject result = new SignatureObject();
-   result.setSigType(found_block.std.getType());
-   result.initByType();
-
-   int value_end = found_block.end_index;
-   int position = found_block.found_keys.size();
-   while (position > 0)
-   {
-     position--;
-     FoundKey key = (FoundKey) found_block.found_keys.get(position);
-     int value_start = key.getStartIndex() + key.caption.length();
-
-     result.setSigValueCaption(key.getKey(), text.substring(value_start, value_end), key.caption);
-
-     // The value of the preceding key ends where this caption starts.
-     value_end = key.getStartIndex();
-   }
-
-   return result;
- }
-
- /**
-  * Reads the date value of a found block from the given text and parses it
-  * into an EGIZDate.
-  *
-  * @param text
-  *          The text.
-  * @param found_block
-  *          The found block.
-  * @return Returns the EGIZDate parsed from the block's date value.
-  */
- public static EGIZDate getDateFromFoundBlock(String text,
-     FoundBlock found_block)
- {
-   return EGIZDate.parseFromString(getDateValue(text, found_block));
- }
-
- /**
-  * Sorts the List of found blocks in place by the date parsed from each
-  * block, using the ordering defined by EGIZDate.compareTo.
-  *
-  * @param text
-  *          The text the date values are read from.
-  * @param found_blocks
-  *          The List of found blocks (modified by this method).
-  */
- public static void sortFoundBlocksByDate(final String text, List found_blocks)
- {
-   Comparator by_block_date = new Comparator()
-   {
-     public int compare(Object left, Object right)
-     {
-       FoundBlock left_block = (FoundBlock) left;
-       FoundBlock right_block = (FoundBlock) right;
-
-       EGIZDate left_date = getDateFromFoundBlock(text, left_block);
-       EGIZDate right_date = getDateFromFoundBlock(text, right_block);
-       return left_date.compareTo(right_date);
-     }
-   };
-
-   Collections.sort(found_blocks, by_block_date);
- }
-
- /**
-  * Given a List of FoundBlock objects, this method returns the last blocks of
-  * this list that have the same date.
-  *
-  * <p>
-  * Usually a date sorted list (earliest first, latest last) will be provided
-  * to this method. The last block is always part of the result. Walking
-  * backwards, each block is added as long as its date equals the date of its
-  * successor; the walk stops at the first block with a different date.
-  * </p>
-  *
-  * @param text
-  *          The text to retrieve the values of the fields from.
-  * @param found_blocks
-  *          The List of FoundBlock objects.
-  * @return Returns the List of the last date equal blocks in their original
-  *         order. An empty List is returned if found_blocks is empty.
-  */
- public static List filterLastDateEqualBlocks(String text, List found_blocks)
- {
-   List latest_blocks = new ArrayList();
-   if (found_blocks.isEmpty())
-   {
-     // There is no last block; accessing index -1 would throw an
-     // IndexOutOfBoundsException.
-     return latest_blocks;
-   }
-
-   latest_blocks.add(found_blocks.get(found_blocks.size() - 1));
-   for (int i = found_blocks.size() - 2; i >= 0; i--)
-   {
-     FoundBlock this_block = (FoundBlock) found_blocks.get(i);
-     FoundBlock succ_block = (FoundBlock) found_blocks.get(i + 1);
-
-     EGIZDate this_date = getDateFromFoundBlock(text, this_block);
-     EGIZDate succ_date = getDateFromFoundBlock(text, succ_block);
-
-     if (!this_date.equals(succ_date))
-     {
-       break;
-     }
-     // Insert at the front to keep the original order of the blocks.
-     latest_blocks.add(0, this_block);
-   }
-
-   return latest_blocks;
- }
- /**
-  * Chooses the most possible (best choice) block of the list of blocks.
-  *
-  * <p>
-  * First only the blocks with the maximum number of found keys are kept
-  * (filterVerticallyLargestBlocks). These blocks have extracted most
-  * information from the text.
-  * </p>
-  * <p>
-  * If there are still multiple blocks with the same number of captions, they
-  * are compared caption-wise (filterHorizontallyLargestBlocks). The first of
-  * the blocks remaining after that filter is returned.
-  * </p>
-  *
-  * @param found_blocks
-  *          The List of FoundBlock objects to choose from.
-  * @return Returns the best choice FoundBlock.
-  * @throws SignatureException
-  *           Propagated from filterHorizontallyLargestBlocks.
-  */
- public static FoundBlock chooseMostPossibleBlock(List found_blocks) throws SignatureException
- {
-   List most_keys = filterVerticallyLargestBlocks(found_blocks);
-   debugListBlocks("vertically largest blocks:", most_keys);
-
-   List longest_captions = filterHorizontallyLargestBlocks(most_keys);
-   debugListBlocks("horizontally largest blocks:", longest_captions);
-
-   FoundBlock winner = (FoundBlock) longest_captions.get(0);
-   logger.debug("Chose largest block: " + winner);
-   return winner;
- }
-
- /**
-  * Logs a headline followed by one numbered debug line per block. Does
-  * nothing unless debug logging is enabled.
-  *
-  * @param headline
-  *          The line logged before the blocks.
-  * @param blocks
-  *          The List of FoundBlock objects to be listed.
-  */
- private static void debugListBlocks(String headline, List blocks)
- {
-   if (!logger.isDebugEnabled())
-   {
-     return;
-   }
-   logger.debug(headline);
-   for (int i = 0; i < blocks.size(); i++)
-   {
-     FoundBlock listed_block = (FoundBlock) blocks.get(i);
-     logger.debug(" #" + i + ": " + listed_block);
-   }
- }
-
- /**
-  * Filters out all blocks but the vertically largest ones.
-  *
-  * <p>
-  * A vertically largest block has the most found keys. All blocks sharing
-  * that maximum number of found keys are kept, in their original order.
-  * </p>
-  *
-  * @param found_blocks
-  *          The List of FoundBlock objects to be filtered.
-  * @return Returns a new List of the vertically largest FoundBlock objects
-  *         (empty if found_blocks is empty).
-  */
- public static List filterVerticallyLargestBlocks(List found_blocks)
- {
-   // First pass: determine the maximum number of found keys.
-   int max_keys = Integer.MIN_VALUE;
-   for (Iterator blocks = found_blocks.iterator(); blocks.hasNext();)
-   {
-     FoundBlock block = (FoundBlock) blocks.next();
-     max_keys = Math.max(max_keys, block.found_keys.size());
-   }
-
-   // Second pass: keep every block that reaches this maximum.
-   List winners = new ArrayList();
-   for (Iterator blocks = found_blocks.iterator(); blocks.hasNext();)
-   {
-     FoundBlock block = (FoundBlock) blocks.next();
-     if (block.found_keys.size() >= max_keys)
-     {
-       winners.add(block);
-     }
-   }
-   return winners;
- }
-
- /**
-  * Filters out all blocks but the horizontally largest ones.
-  *
-  * <p>
-  * Blocks are compared by the lengths of their captions, key by key. The
-  * first block starts as the largest one. A block whose caption lengths all
-  * equal those of the current largest block is kept alongside it. A block
-  * that is horizontally larger replaces everything kept so far. A block
-  * that is horizontally smaller is dropped.
-  * </p>
-  *
-  * @param found_blocks
-  *          The List of FoundBlock objects to be filtered. All of these
-  *          FoundBlock objects must have the same number of found keys.
-  * @return Returns the List of the horizontally largest FoundBlock objects.
-  * @throws SignatureException
-  *           With code 315, if a block is neither equal to, larger than nor
-  *           smaller than the current largest block.
-  */
- public static List filterHorizontallyLargestBlocks(List found_blocks) throws SignatureException
- {
-   FoundBlock champion = (FoundBlock) found_blocks.get(0);
-   List winners = new ArrayList();
-   winners.add(champion);
-
-   for (int i = 1; i < found_blocks.size(); i++)
-   {
-     FoundBlock challenger = (FoundBlock) found_blocks.get(i);
-
-     if (isHorizontallyEqual(challenger, champion))
-     {
-       winners.add(challenger);
-     }
-     else if (isHorizontallyLarger(challenger, champion))
-     {
-       // A larger block supersedes everything collected so far.
-       champion = challenger;
-       winners = new ArrayList();
-       winners.add(champion);
-     }
-     else if (!isHorizontallyLarger(champion, challenger))
-     {
-       // The block is neither equal nor larger nor lower.
-       // We cannot exactly determine which one to use.
-       throw new SignatureException(315, "The blocks are neither larger nor lower nor equal. Cannot decide which one to pick. fb = " + challenger + ", largest_block = " + champion);
-     }
-   }
-
-   return winners;
- }
-
- /**
-  * Tells whether two blocks are horizontally equal, i.e. whether the
-  * captions of their found keys have pairwise the same length.
-  *
-  * @param fb0
-  *          The first block.
-  * @param fb1
-  *          The second block.
-  * @return Returns true if every caption of fb0 is exactly as long as the
-  *         caption at the same position in fb1, false otherwise.
-  * @throws IllegalArgumentException
-  *           If the blocks do not have the same number of found keys.
-  */
- protected static boolean isHorizontallyEqual(FoundBlock fb0, FoundBlock fb1)
- {
-   final int key_count = fb0.found_keys.size();
-   final int other_key_count = fb1.found_keys.size();
-   if (key_count != other_key_count)
-   {
-     throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + key_count + " vs. " + other_key_count);
-   }
-
-   boolean same_lengths = true;
-   for (int i = 0; same_lengths && i < key_count; i++)
-   {
-     FoundKey key0 = (FoundKey) fb0.found_keys.get(i);
-     FoundKey key1 = (FoundKey) fb1.found_keys.get(i);
-     same_lengths = (key0.caption.length() == key1.caption.length());
-   }
-
-   return same_lengths;
- }
-
- /**
-  * Tells whether the first block is horizontally larger than the second
-  * one: no caption of fb0 is shorter than the caption at the same position
-  * in fb1, and at least one caption of fb0 is longer.
-  *
-  * @param fb0
-  *          The block that is checked for being larger.
-  * @param fb1
-  *          The block it is compared to.
-  * @return Returns true if fb0 is horizontally larger than fb1, false if any
-  *         caption of fb0 is shorter or if all captions are equally long.
-  * @throws IllegalArgumentException
-  *           If the blocks do not have the same number of found keys.
-  */
- protected static boolean isHorizontallyLarger(FoundBlock fb0, FoundBlock fb1)
- {
-   final int key_count = fb0.found_keys.size();
-   final int other_key_count = fb1.found_keys.size();
-   if (key_count != other_key_count)
-   {
-     throw new IllegalArgumentException("Cannot compare FoundBlock keys: fb0 doesn't have the same number of keys as fb1. " + key_count + " vs. " + other_key_count);
-   }
-
-   boolean found_longer_caption = false;
-   for (int i = 0; i < key_count; i++)
-   {
-     FoundKey key0 = (FoundKey) fb0.found_keys.get(i);
-     FoundKey key1 = (FoundKey) fb1.found_keys.get(i);
-
-     // Lengths are non-negative, so the difference cannot overflow.
-     int length_difference = key0.caption.length() - key1.caption.length();
-     if (length_difference < 0)
-     {
-       // One shorter caption rules fb0 out immediately.
-       return false;
-     }
-     if (length_difference > 0)
-     {
-       found_longer_caption = true;
-     }
-   }
-
-   return found_longer_caption;
- }
-
-}