From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../knowcenter/wag/exactparser/ParseDocument.java | 272 +++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java new file mode 100644 index 0000000..fbaa4de --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java @@ -0,0 +1,272 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. 
Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: ParseDocument.java,v 1.1 2006/08/25 17:00:59 wprinz Exp $ + */ +package at.knowcenter.wag.exactparser; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import at.knowcenter.wag.exactparser.parsing.PDFUtils; +import at.knowcenter.wag.exactparser.parsing.results.DictionaryParseResult; +import at.knowcenter.wag.exactparser.parsing.results.FooterParseResult; +import at.knowcenter.wag.exactparser.parsing.results.IndirectObjectReferenceParseResult; +import at.knowcenter.wag.exactparser.parsing.results.NameParseResult; +import at.knowcenter.wag.exactparser.parsing.results.NumberParseResult; +import at.knowcenter.wag.exactparser.parsing.results.ObjectParseResult; +import at.knowcenter.wag.exactparser.parsing.results.StartXRefParseResult; +import at.knowcenter.wag.exactparser.parsing.results.TrailerParseResult; +import at.knowcenter.wag.exactparser.parsing.results.XRefSectionParseResult; + + +/** + * Test class. 
* @author wprinz
*/
public class ParseDocument
{

  /** Path of the document that {@link #main(String[])} parses. */
  public static final String DOCUMENT = "C:/wprinz/temp.pdf";

  /** Bytes of the name {@code EGIZSigDict}, looked up in the root dictionary. */
  public static final byte[] EGIZ_DICT_NAME = { 'E', 'G', 'I', 'Z', 'S', 'i',
      'g', 'D', 'i', 'c', 't' };

  /** Bytes of the name {@code ODS}; its value is used as the original document size. */
  public static final byte[] EGIZ_ODS_NAME = { 'O', 'D', 'S' };

  /** Bytes of the name {@code SigXObject}. */
  public static final byte[] EGIZ_XOBJ_NAME = { 'S', 'i', 'g', 'X', 'O', 'b',
      'j', 'e', 'c', 't' };

  /**
   * Parses {@link #DOCUMENT} and prints one line per footer block, marking
   * the blocks whose root dictionary contains the {@code EGIZSigDict} name.
   * An {@link IOException} is reported via its stack trace.
   *
   * @param args
   *          Not used.
   */
  public static void main(String[] args)
  {

    try
    {
      byte[] pdf = readFile(new File(DOCUMENT));

      List blocks = parseDocument(pdf);

      Iterator it = blocks.iterator();
      while (it.hasNext())
      {
        FooterParseResult bpr = (FooterParseResult) it.next();

        System.out.print("block from " + bpr.start_index + " to " + bpr.next_index);

        if (bpr.tpr.root != null)
        {
          int root_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr, bpr.tpr.root.ior);
          ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
          DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;

          int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
          if (egiz_index >= 0)
          {
            System.out.print(" == EGIZDict");
          }
        }

        System.out.println();
      }

    }
    catch (IOException e)
    {
      e.printStackTrace();
    }
  }

  /**
   * Collects the footers of the document by starting at the last startxref
   * and following each trailer's predecessor for as long as one is reported.
   * Each footer is inserted at the front of the list, so the footer reached
   * last along the chain comes first and the one referenced by the last
   * startxref comes last.
   *
   * @param pdf
   *          The complete document bytes.
   * @return A list of {@link FooterParseResult} objects.
   * @throws IOException
   *           If parsing fails or the predecessor chain leads back to an
   *           xref offset that has already been parsed.
   */
  public static List parseDocument(final byte[] pdf) throws IOException
  {
    //HeaderParseResult hpr = PDFUtils.parseHeader(pdf, 0);
    //System.out.println("PDF-version = " + hpr.major + "." + hpr.minor);

    List blocks = new ArrayList();

    // Offsets already parsed; a repeated offset means the chain is cyclic
    // and the loop below would otherwise never terminate.
    List visited_xrefs = new ArrayList();

    int last_start_xref = PDFUtils.findLastStartXRef(pdf);
    StartXRefParseResult last_sxpr = PDFUtils.parseStartXRef(pdf, last_start_xref);
    int xref_index = last_sxpr.xref_index;

    for (;;)
    {
      Integer xref_key = Integer.valueOf(xref_index);
      if (visited_xrefs.contains(xref_key))
      {
        throw new IOException("Cyclic trailer predecessor chain detected at xref offset " + xref_index);
      }
      visited_xrefs.add(xref_key);

      FooterParseResult fpr = PDFUtils.parseFooter(pdf, xref_index);
      blocks.add(0, fpr);

      //System.out.println("tpr.has_predecessor = " + fpr.tpr.has_predecessor);
      if (!fpr.tpr.has_predecessor)
      {
        // eventually parse the PDF header here.
        break;
      }

      //System.out.println("tpr.prev = " + fpr.tpr.getPrev());

      xref_index = fpr.tpr.getPrev();
    }

    return blocks;
  }

  // Disabled draft of an EGIZ dictionary dump. It refers to bpr and pdf,
  // which are not declared in its scope, so it cannot be enabled as is.
  //
  // public static void parseEGIZ()
  // {
  //
  // int root_index =
  // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr,
  // bpr.tpr.root.ior);
  // ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
  // DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;
  //
  // int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
  // if (egiz_index >= 0)
  // {
  // IndirectObjectReferenceParseResult egiz_iorpr =
  // (IndirectObjectReferenceParseResult) root_dpr.values.get(egiz_index);
  // System.out.println("EGIZ signature info at = " + egiz_iorpr);
  //
  // int egiz_dict_index =
  // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr,
  // egiz_iorpr.ior);
  // ObjectParseResult opr = PDFUtils.parseObject(pdf, egiz_dict_index);
  // DictionaryParseResult egiz_dict = (DictionaryParseResult) opr.object;
  //
  // for (int i = 0; i < egiz_dict.names.size(); i++)
  // {
  // NameParseResult npr = egiz_dict.names.get(i);
  // int len = npr.next_index - npr.name_start_index;
  // byte[] name = new byte[len];
  // System.arraycopy(pdf, npr.name_start_index, name, 0, len);
  // System.out.print(" " + new String(name, "US-ASCII") + " = ");
  //
  // System.out.println(egiz_dict.values.get(i));
  // }
  //
  // // int key = PDFUtils.indexOfName(pdf, egiz_dict.names, new byte [] { 'K',
  // // 'e', 'y'});
  // // IndirectObjectReferenceParseResult key_iorpr =
  // // (IndirectObjectReferenceParseResult) egiz_dict.values.get(key);
  // // int key_offset =
  // // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr,
  // // key_iorpr.ior);
  // // ObjectParseResult key_opr = PDFUtils.parseObject(pdf, key_offset);
  // // StreamParseResult spr = (StreamParseResult) key_opr.object;
  // // System.out.println(" key stream from " + spr.content_start_index + " to
  // // " + spr.content_end_index);
  // //
  // // int data_len = spr.content_end_index - spr.content_start_index;
  // // byte [] data = new byte[data_len];
  // // System.arraycopy(pdf, spr.content_start_index, data, 0, data_len);
  // // System.out.println(new String(data, "US-ASCII"));
  //
  // }
  // else
  // {
  // System.out.println("No EGIZ block found.");
  // }
  //
  // }

  /**
   * Reads the given file and, if the root dictionary reached through its
   * last xref section contains the {@code EGIZSigDict} name, returns the
   * leading bytes of the file up to the size stored under {@code ODS} in the
   * referenced dictionary. Otherwise the complete file content is returned.
   * Diagnostic output is written to {@code System.out}.
   *
   * @param file_name
   *          The file to read.
   * @return The extracted original document, or the whole file content if no
   *         {@code EGIZSigDict} entry is found.
   * @throws IOException
   *           If the file cannot be read completely, if the trailer has no
   *           root reference, if the {@code ODS} entry is missing, or if its
   *           value does not lie within the bounds of the file.
   */
  public static byte[] getOriginalDocument(final File file_name) throws IOException
  {
    byte[] pdf = readFile(file_name);

    int last_start_xref = PDFUtils.findLastStartXRef(pdf);

    StartXRefParseResult sxpr = PDFUtils.parseStartXRef(pdf, last_start_xref);

    XRefSectionParseResult xpr = PDFUtils.parseXRefSection(pdf, sxpr.xref_index);

    TrailerParseResult tpr = PDFUtils.parseTrailer(pdf, xpr.next_index);

    System.out.println("tpr.info = " + tpr.info);
    System.out.println("tpr.root = " + tpr.root);
    System.out.println("tpr.size = " + tpr.size);

    System.out.println("tpr.has_predecessor = " + tpr.has_predecessor);
    if (tpr.has_predecessor)
    {
      System.out.println("tpr.prev = " + tpr.getPrev());
    }

    // main() treats a missing root as possible; fail with a clear message
    // here instead of a NullPointerException on tpr.root.ior.
    if (tpr.root == null)
    {
      throw new IOException("The trailer of " + file_name + " has no root reference.");
    }

    int root_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr, tpr.root.ior);
    ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
    DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;

    int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
    if (egiz_index >= 0)
    {
      System.out.println("The document is EGIZ-signed. ==> extract original document");

      IndirectObjectReferenceParseResult egiz_iorpr = (IndirectObjectReferenceParseResult) root_dpr.values.get(egiz_index);
      System.out.println("EGIZ signature info at = " + egiz_iorpr);

      int egiz_dict_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr, egiz_iorpr.ior);
      ObjectParseResult opr = PDFUtils.parseObject(pdf, egiz_dict_index);
      DictionaryParseResult egiz_dict = (DictionaryParseResult) opr.object;

      for (int i = 0; i < egiz_dict.names.size(); i++)
      {
        NameParseResult npr = (NameParseResult) egiz_dict.names.get(i);
        int len = npr.next_index - npr.name_start_index;
        byte[] name = new byte[len];
        System.arraycopy(pdf, npr.name_start_index, name, 0, len);
        System.out.print("  " + new String(name, "US-ASCII") + " = ");

        System.out.println(egiz_dict.values.get(i));
      }

      // Original document size
      int key = PDFUtils.indexOfName(pdf, egiz_dict.names, EGIZ_ODS_NAME);
      if (key < 0)
      {
        throw new IOException("The EGIZ dictionary of " + file_name + " has no ODS entry.");
      }
      NumberParseResult ods = (NumberParseResult) egiz_dict.values.get(key);

      int original_document_size = ods.number;
      System.out.println("Original Document Size = " + original_document_size);

      // The size comes from the document itself, so check it before it is
      // used as an array length and copy bound.
      if (original_document_size < 0 || original_document_size > pdf.length)
      {
        throw new IOException("Original document size " + original_document_size
            + " is outside the file bounds (0 to " + pdf.length + ").");
      }

      byte[] original = new byte[original_document_size];
      System.arraycopy(pdf, 0, original, 0, original_document_size);

      return original;
    }

    System.out.println("No EGIZ block found. ==> the whold document is the original document");
    return pdf;
  }

  /**
   * Reads the complete content of a file into a byte array. A single
   * {@code read} call may deliver fewer bytes than requested, so reading is
   * repeated until the buffer is full. The stream is closed even if reading
   * fails.
   *
   * @param file
   *          The file to read.
   * @return The file content.
   * @throws IOException
   *           If the file is too large for a byte array, ends before the
   *           expected number of bytes was read, or cannot be read.
   */
  private static byte[] readFile(final File file) throws IOException
  {
    long length = file.length();
    if (length > Integer.MAX_VALUE)
    {
      throw new IOException("File is too large to be loaded into memory: " + file);
    }

    FileInputStream fis = new FileInputStream(file);
    try
    {
      byte[] data = new byte[(int) length];
      int offset = 0;
      while (offset < data.length)
      {
        int read = fis.read(data, offset, data.length - offset);
        if (read < 0)
        {
          throw new IOException("Unexpected end of file after " + offset + " of " + data.length + " bytes: " + file);
        }
        offset += read;
      }
      return data;
    }
    finally
    {
      fis.close();
    }
  }

}
-- cgit v1.2.3