/** * Copyright 2006 by Know-Center, Graz, Austria * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a * joint initiative of the Federal Chancellery Austria and Graz University of * Technology. * * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by * the European Commission - subsequent versions of the EUPL (the "Licence"); * You may not use this work except in compliance with the Licence. * You may obtain a copy of the Licence at: * http://www.osor.eu/eupl/ * * Unless required by applicable law or agreed to in writing, software * distributed under the Licence is distributed on an "AS IS" basis, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the Licence for the specific language governing permissions and * limitations under the Licence. * * This product combines work with different licenses. See the "NOTICE" text * file for details on the various modules and licenses. * The "NOTICE" text file is part of the distribution. Any derivative works * that you distribute must include a readable copy of the "NOTICE" text file. 
* * $Id: ParseDocument.java,v 1.1 2006/08/25 17:00:59 wprinz Exp $ */ package at.knowcenter.wag.exactparser; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import at.knowcenter.wag.exactparser.parsing.PDFUtils; import at.knowcenter.wag.exactparser.parsing.results.DictionaryParseResult; import at.knowcenter.wag.exactparser.parsing.results.FooterParseResult; import at.knowcenter.wag.exactparser.parsing.results.IndirectObjectReferenceParseResult; import at.knowcenter.wag.exactparser.parsing.results.NameParseResult; import at.knowcenter.wag.exactparser.parsing.results.NumberParseResult; import at.knowcenter.wag.exactparser.parsing.results.ObjectParseResult; import at.knowcenter.wag.exactparser.parsing.results.StartXRefParseResult; import at.knowcenter.wag.exactparser.parsing.results.TrailerParseResult; import at.knowcenter.wag.exactparser.parsing.results.XRefSectionParseResult; /** * Test class. 
* @author wprinz
 */
public class ParseDocument
{
  // Overview: small command line / helper class around PDFUtils. It can
  // (a) list the footer blocks (xref section + trailer) of a PDF file and
  // report which of them reference an EGIZ signature dictionary, and
  // (b) cut an EGIZ-signed file back to the size recorded in its ODS entry.

  /** Default input file used by {@link #main(String[])} when no argument is given. */
  public static final String DOCUMENT = "C:/wprinz/temp.pdf";

  /** Name of the root dictionary entry that references the EGIZ signature dictionary. */
  public static final byte[] EGIZ_DICT_NAME = { 'E', 'G', 'I', 'Z', 'S', 'i', 'g', 'D', 'i', 'c', 't' };

  /** Name of the EGIZ dictionary entry holding the original document size. */
  public static final byte[] EGIZ_ODS_NAME = { 'O', 'D', 'S' };

  /** Name constant "SigXObject"; not referenced inside this class. */
  public static final byte[] EGIZ_XOBJ_NAME = { 'S', 'i', 'g', 'X', 'O', 'b', 'j', 'e', 'c', 't' };

  /**
   * Parses a PDF file and prints one line per footer block, marking the blocks
   * whose root dictionary contains an {@link #EGIZ_DICT_NAME} entry.
   * 
   * @param args
   *          optional: {@code args[0]} is the path of the PDF to inspect; when
   *          absent, {@link #DOCUMENT} is used.
   */
  public static void main(String[] args)
  {
    try
    {
      String path = (args != null && args.length > 0) ? args[0] : DOCUMENT;
      byte[] pdf = readFile(new File(path));

      List blocks = parseDocument(pdf);

      Iterator it = blocks.iterator();
      while (it.hasNext())
      {
        FooterParseResult bpr = (FooterParseResult) it.next();

        System.out.print("block from " + bpr.start_index + " to " + bpr.next_index);

        // A trailer without a root reference cannot be checked for the EGIZ entry.
        if (bpr.tpr.root != null)
        {
          int root_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr, bpr.tpr.root.ior);
          ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
          DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;

          int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
          if (egiz_index >= 0)
          {
            System.out.print(" == EGIZDict");
          }
        }

        System.out.println();
      }
    }
    catch (IOException e)
    {
      e.printStackTrace();
    }
  }

  /**
   * Collects the footer blocks of a PDF, starting at the last startxref entry
   * and following the trailers' predecessor links.
   * 
   * @param pdf
   *          the complete PDF file content.
   * @return a list of {@link FooterParseResult}s. Each parsed footer is
   *         inserted at the front, so the footer reached last via the
   *         predecessor links comes first and the footer referenced by the
   *         last startxref comes last.
   * @throws IOException
   *           if the predecessor chain revisits an xref offset (which would
   *           otherwise loop forever), or if thrown by the underlying parser.
   */
  public static List parseDocument(final byte[] pdf) throws IOException
  {
    // HeaderParseResult hpr = PDFUtils.parseHeader(pdf, 0);
    // System.out.println("PDF-version = " + hpr.major + "." + hpr.minor);

    List blocks = new ArrayList();
    // xref offsets already parsed; guards against cyclic predecessor chains.
    List visited_xrefs = new ArrayList();

    int last_start_xref = PDFUtils.findLastStartXRef(pdf);
    StartXRefParseResult last_sxpr = PDFUtils.parseStartXRef(pdf, last_start_xref);

    int xref_index = last_sxpr.xref_index;
    for (;;)
    {
      Integer offset = Integer.valueOf(xref_index);
      if (visited_xrefs.contains(offset))
      {
        throw new IOException("Cyclic trailer predecessor chain: xref offset " + xref_index + " was already parsed.");
      }
      visited_xrefs.add(offset);

      FooterParseResult fpr = PDFUtils.parseFooter(pdf, xref_index);
      blocks.add(0, fpr);

      // System.out.println("tpr.has_predecessor = " + fpr.tpr.has_predecessor);
      if (!fpr.tpr.has_predecessor)
      {
        // eventually parse the PDF header here.
        break;
      }

      // System.out.println("tpr.prev = " + fpr.tpr.getPrev());
      xref_index = fpr.tpr.getPrev();
    }

    return blocks;
  }

  // Disabled draft kept for reference: dumps the entries of the EGIZ
  // dictionary of one footer block.
  //
  // public static void parseEGIZ()
  // {
  //
  // int root_index =
  // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr,
  // bpr.tpr.root.ior);
  // ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
  // DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;
  //
  // int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
  // if (egiz_index >= 0)
  // {
  // IndirectObjectReferenceParseResult egiz_iorpr =
  // (IndirectObjectReferenceParseResult) root_dpr.values.get(egiz_index);
  // System.out.println("EGIZ signature info at = " + egiz_iorpr);
  //
  // int egiz_dict_index =
  // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr,
  // egiz_iorpr.ior);
  // ObjectParseResult opr = PDFUtils.parseObject(pdf, egiz_dict_index);
  // DictionaryParseResult egiz_dict = (DictionaryParseResult) opr.object;
  //
  // for (int i = 0; i < egiz_dict.names.size(); i++)
  // {
  // NameParseResult npr = egiz_dict.names.get(i);
  // int len = npr.next_index - npr.name_start_index;
  // byte[] name = new byte[len];
  // System.arraycopy(pdf, npr.name_start_index, name, 0, len);
  // System.out.print(" " + new String(name, "US-ASCII") + " = ");
  //
  // System.out.println(egiz_dict.values.get(i));
  // }
  //
  // // int key = PDFUtils.indexOfName(pdf, egiz_dict.names, new byte [] { 'K',
  // // 'e', 'y'});
  // // IndirectObjectReferenceParseResult key_iorpr =
  // // (IndirectObjectReferenceParseResult) egiz_dict.values.get(key);
  // // int key_offset =
  // // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr,
  // // key_iorpr.ior);
  // // ObjectParseResult key_opr = PDFUtils.parseObject(pdf, key_offset);
  // // StreamParseResult spr = (StreamParseResult) key_opr.object;
  // // System.out.println(" key stream from " + spr.content_start_index + " to
  // // " + spr.content_end_index);
  // //
  // // int data_len = spr.content_end_index - spr.content_start_index;
  // // byte [] data = new byte[data_len];
  // // System.arraycopy(pdf, spr.content_start_index, data, 0, data_len);
  // // System.out.println(new String(data, "US-ASCII"));
  //
  // }
  // else
  // {
  // System.out.println("No EGIZ block found.");
  // }
  //
  // }

  /**
   * Returns the original (unsigned) document contained in the given file.
   * <p>
   * The xref section and trailer referenced by the last startxref are parsed
   * and the root dictionary is searched for the {@link #EGIZ_DICT_NAME} entry.
   * If it is present, the number stored under {@link #EGIZ_ODS_NAME} in the
   * referenced dictionary is taken as the original document size and that many
   * leading bytes of the file are returned. Otherwise the complete file content
   * is returned. Diagnostic output is written to {@code System.out}.
   * </p>
   * 
   * @param file_name
   *          the PDF file to read.
   * @return the first ODS bytes of the file if it carries an EGIZ dictionary,
   *         otherwise the whole file content.
   * @throws IOException
   *           if the file cannot be read completely, the trailer has no root
   *           reference, the EGIZ dictionary has no ODS entry, or the ODS
   *           value does not lie within the file size.
   */
  public static byte[] getOriginalDocument(final File file_name) throws IOException
  {
    byte[] pdf = readFile(file_name);

    int last_start_xref = PDFUtils.findLastStartXRef(pdf);
    StartXRefParseResult sxpr = PDFUtils.parseStartXRef(pdf, last_start_xref);
    XRefSectionParseResult xpr = PDFUtils.parseXRefSection(pdf, sxpr.xref_index);
    // The trailer is parsed from the position where the xref section ended.
    TrailerParseResult tpr = PDFUtils.parseTrailer(pdf, xpr.next_index);

    System.out.println("tpr.info = " + tpr.info);
    System.out.println("tpr.root = " + tpr.root);
    System.out.println("tpr.size = " + tpr.size);
    System.out.println("tpr.has_predecessor = " + tpr.has_predecessor);
    if (tpr.has_predecessor)
    {
      System.out.println("tpr.prev = " + tpr.getPrev());
    }

    // main() treats a missing root as a legal state, so it has to be checked
    // here as well instead of failing with a NullPointerException.
    if (tpr.root == null)
    {
      throw new IOException("The trailer contains no root reference: " + file_name);
    }

    int root_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr, tpr.root.ior);
    ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
    DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;

    int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
    if (egiz_index < 0)
    {
      System.out.println("No EGIZ block found. ==> the whold document is the original document");
      return pdf;
    }

    System.out.println("The document is EGIZ-signed. ==> extract original document");
    IndirectObjectReferenceParseResult egiz_iorpr = (IndirectObjectReferenceParseResult) root_dpr.values.get(egiz_index);
    System.out.println("EGIZ signature info at = " + egiz_iorpr);

    int egiz_dict_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr, egiz_iorpr.ior);
    ObjectParseResult opr = PDFUtils.parseObject(pdf, egiz_dict_index);
    DictionaryParseResult egiz_dict = (DictionaryParseResult) opr.object;

    // Dump every name/value pair of the EGIZ dictionary.
    for (int i = 0; i < egiz_dict.names.size(); i++)
    {
      NameParseResult npr = (NameParseResult) egiz_dict.names.get(i);
      int len = npr.next_index - npr.name_start_index;
      byte[] name = new byte[len];
      System.arraycopy(pdf, npr.name_start_index, name, 0, len);
      System.out.print(" " + new String(name, "US-ASCII") + " = ");

      System.out.println(egiz_dict.values.get(i));
    }

    // Original document size
    int key = PDFUtils.indexOfName(pdf, egiz_dict.names, EGIZ_ODS_NAME);
    if (key < 0)
    {
      throw new IOException("The EGIZ dictionary contains no ODS entry: " + file_name);
    }
    NumberParseResult ods = (NumberParseResult) egiz_dict.values.get(key);
    int original_document_size = ods.number;
    System.out.println("Original Document Size = " + original_document_size);

    // The value comes from the file itself, so validate it before copying.
    if (original_document_size < 0 || original_document_size > pdf.length)
    {
      throw new IOException("Invalid ODS value " + original_document_size + " for a file of " + pdf.length + " bytes: " + file_name);
    }

    byte[] original = new byte[original_document_size];
    System.arraycopy(pdf, 0, original, 0, original_document_size);

    return original;
  }

  /**
   * Reads a whole file into memory.
   * <p>
   * A single {@code read} call may deliver fewer bytes than requested, so the
   * stream is read in a loop until the buffer is full. The stream is closed
   * even if reading fails.
   * </p>
   * 
   * @param file
   *          the file to read.
   * @return the file content; its length is the value {@code File.length()}
   *         reported before reading.
   * @throws IOException
   *           if the file is larger than a byte array can hold, ends before
   *           the reported length was read, or cannot be read.
   */
  private static byte[] readFile(final File file) throws IOException
  {
    long length = file.length();
    if (length > Integer.MAX_VALUE)
    {
      throw new IOException("File is too large to be loaded into memory (" + length + " bytes): " + file);
    }

    byte[] data = new byte[(int) length];
    FileInputStream fis = new FileInputStream(file);
    try
    {
      int offset = 0;
      while (offset < data.length)
      {
        int read = fis.read(data, offset, data.length - offset);
        if (read < 0)
        {
          throw new IOException("Unexpected end of file after " + offset + " of " + data.length + " bytes: " + file);
        }
        offset += read;
      }
    }
    finally
    {
      fis.close();
    }
    return data;
  }
}