aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java')
-rw-r--r--pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java272
1 files changed, 272 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java
new file mode 100644
index 0000000..fbaa4de
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/ParseDocument.java
@@ -0,0 +1,272 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ *
+ * $Id: ParseDocument.java,v 1.1 2006/08/25 17:00:59 wprinz Exp $
+ */
+package at.knowcenter.wag.exactparser;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import at.knowcenter.wag.exactparser.parsing.PDFUtils;
+import at.knowcenter.wag.exactparser.parsing.results.DictionaryParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.FooterParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.IndirectObjectReferenceParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.NameParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.NumberParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.ObjectParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.StartXRefParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.TrailerParseResult;
+import at.knowcenter.wag.exactparser.parsing.results.XRefSectionParseResult;
+
+
+/**
+ * Test class.
+ * @author wprinz
+ */
+public class ParseDocument
+{
+
+ public static final String DOCUMENT = "C:/wprinz/temp.pdf";
+
+ public static final byte[] EGIZ_DICT_NAME = { 'E', 'G', 'I', 'Z', 'S', 'i',
+ 'g', 'D', 'i', 'c', 't' };
+
+ public static final byte[] EGIZ_ODS_NAME = { 'O', 'D', 'S' };
+
+ public static final byte[] EGIZ_XOBJ_NAME = { 'S', 'i', 'g', 'X', 'O', 'b',
+ 'j', 'e', 'c', 't' };
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args)
+ {
+
+ try
+ {
+ File in = new File(DOCUMENT);
+ FileInputStream fis = new FileInputStream(in);
+ byte[] pdf = new byte[(int) in.length()];
+ fis.read(pdf);
+ fis.close();
+ fis = null;
+
+ List blocks = parseDocument(pdf);
+
+ Iterator it = blocks.iterator();
+ while (it.hasNext())
+ {
+ FooterParseResult bpr = (FooterParseResult) it.next();
+
+ System.out.print("block from " + bpr.start_index + " to " + bpr.next_index);
+
+ if (bpr.tpr.root != null)
+ {
+ int root_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr, bpr.tpr.root.ior);
+ ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
+ DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;
+
+ int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
+ if (egiz_index >= 0)
+ {
+ System.out.print(" == EGIZDict");
+ }
+ }
+
+ System.out.println();
+ }
+
+ }
+ catch (IOException e)
+ {
+ e.printStackTrace();
+ }
+ }
+
+ public static List parseDocument(final byte[] pdf) throws IOException
+ {
+ //HeaderParseResult hpr = PDFUtils.parseHeader(pdf, 0);
+ //System.out.println("PDF-version = " + hpr.major + "." + hpr.minor);
+
+ List blocks = new ArrayList();
+
+ int last_start_xref = PDFUtils.findLastStartXRef(pdf);
+ StartXRefParseResult last_sxpr = PDFUtils.parseStartXRef(pdf, last_start_xref);
+ int xref_index = last_sxpr.xref_index;
+
+ for (;;)
+ {
+ FooterParseResult fpr = PDFUtils.parseFooter(pdf, xref_index);
+ blocks.add(0, fpr);
+
+ //System.out.println("tpr.has_predecessor = " + fpr.tpr.has_predecessor);
+ if (!fpr.tpr.has_predecessor)
+ {
+ // eventually parse the PDF header here.
+ break;
+ }
+
+ //System.out.println("tpr.prev = " + fpr.tpr.getPrev());
+
+ xref_index = fpr.tpr.getPrev();
+ }
+
+ return blocks;
+ }
+
+ // public static void parseEGIZ()
+ // {
+ //
+ // int root_index =
+ // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr,
+ // bpr.tpr.root.ior);
+ // ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
+ // DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;
+ //
+ // int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_DICT_NAME);
+ // if (egiz_index >= 0)
+ // {
+ // IndirectObjectReferenceParseResult egiz_iorpr =
+ // (IndirectObjectReferenceParseResult) root_dpr.values.get(egiz_index);
+ // System.out.println("EGIZ signature info at = " + egiz_iorpr);
+ //
+ // int egiz_dict_index =
+ // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(bpr.xpr,
+ // egiz_iorpr.ior);
+ // ObjectParseResult opr = PDFUtils.parseObject(pdf, egiz_dict_index);
+ // DictionaryParseResult egiz_dict = (DictionaryParseResult) opr.object;
+ //
+ // for (int i = 0; i < egiz_dict.names.size(); i++)
+ // {
+ // NameParseResult npr = egiz_dict.names.get(i);
+ // int len = npr.next_index - npr.name_start_index;
+ // byte[] name = new byte[len];
+ // System.arraycopy(pdf, npr.name_start_index, name, 0, len);
+ // System.out.print(" " + new String(name, "US-ASCII") + " = ");
+ //
+ // System.out.println(egiz_dict.values.get(i));
+ // }
+ //
+ // // int key = PDFUtils.indexOfName(pdf, egiz_dict.names, new byte [] { 'K',
+ // // 'e', 'y'});
+ // // IndirectObjectReferenceParseResult key_iorpr =
+ // // (IndirectObjectReferenceParseResult) egiz_dict.values.get(key);
+ // // int key_offset =
+ // // PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr,
+ // // key_iorpr.ior);
+ // // ObjectParseResult key_opr = PDFUtils.parseObject(pdf, key_offset);
+ // // StreamParseResult spr = (StreamParseResult) key_opr.object;
+ // // System.out.println(" key stream from " + spr.content_start_index + " to
+ // // " + spr.content_end_index);
+ // //
+ // // int data_len = spr.content_end_index - spr.content_start_index;
+ // // byte [] data = new byte[data_len];
+ // // System.arraycopy(pdf, spr.content_start_index, data, 0, data_len);
+ // // System.out.println(new String(data, "US-ASCII"));
+ //
+ // }
+ // else
+ // {
+ // System.out.println("No EGIZ block found.");
+ // }
+ //
+ // }
+
+ public static byte[] getOriginalDocument(final File file_name) throws IOException
+ {
+ FileInputStream fis = new FileInputStream(file_name);
+ byte[] pdf = new byte[(int) file_name.length()];
+ fis.read(pdf);
+ fis.close();
+ fis = null;
+
+ int last_start_xref = PDFUtils.findLastStartXRef(pdf);
+
+ StartXRefParseResult sxpr = PDFUtils.parseStartXRef(pdf, last_start_xref);
+
+ XRefSectionParseResult xpr = PDFUtils.parseXRefSection(pdf, sxpr.xref_index);
+
+ TrailerParseResult tpr = PDFUtils.parseTrailer(pdf, xpr.next_index);
+
+ System.out.println("tpr.info = " + tpr.info);
+ System.out.println("tpr.root = " + tpr.root);
+ System.out.println("tpr.size = " + tpr.size);
+
+ System.out.println("tpr.has_predecessor = " + tpr.has_predecessor);
+ if (tpr.has_predecessor)
+ {
+ System.out.println("tpr.prev = " + tpr.getPrev());
+ }
+
+ int root_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr, tpr.root.ior);
+ ObjectParseResult root_opr = PDFUtils.parseObject(pdf, root_index);
+ DictionaryParseResult root_dpr = (DictionaryParseResult) root_opr.object;
+
+ byte[] EGIZ_TYPE = new String("EGIZSigDict").getBytes("US-ASCII");
+ int egiz_index = PDFUtils.indexOfName(pdf, root_dpr.names, EGIZ_TYPE);
+ if (egiz_index >= 0)
+ {
+ System.out.println("The document is EGIZ-signed. ==> extract original document");
+
+ IndirectObjectReferenceParseResult egiz_iorpr = (IndirectObjectReferenceParseResult) root_dpr.values.get(egiz_index);
+ System.out.println("EGIZ signature info at = " + egiz_iorpr);
+
+ int egiz_dict_index = PDFUtils.getObjectOffsetFromXRefByIndirectObjectReference(xpr, egiz_iorpr.ior);
+ ObjectParseResult opr = PDFUtils.parseObject(pdf, egiz_dict_index);
+ DictionaryParseResult egiz_dict = (DictionaryParseResult) opr.object;
+
+ for (int i = 0; i < egiz_dict.names.size(); i++)
+ {
+ NameParseResult npr = (NameParseResult) egiz_dict.names.get(i);
+ int len = npr.next_index - npr.name_start_index;
+ byte[] name = new byte[len];
+ System.arraycopy(pdf, npr.name_start_index, name, 0, len);
+ System.out.print(" " + new String(name, "US-ASCII") + " = ");
+
+ System.out.println(egiz_dict.values.get(i));
+ }
+
+ // Original document size
+ int key = PDFUtils.indexOfName(pdf, egiz_dict.names, new byte[] { 'O',
+ 'D', 'S' });
+ NumberParseResult ods = (NumberParseResult) egiz_dict.values.get(key);
+
+ int original_document_size = ods.number;
+ System.out.println("Original Document Size = " + original_document_size);
+
+ byte[] original = new byte[original_document_size];
+ System.arraycopy(pdf, 0, original, 0, original_document_size);
+
+ return original;
+ }
+
+ System.out.println("No EGIZ block found. ==> the whold document is the original document");
+ return pdf;
+ }
+
+}