aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java')
-rw-r--r--src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java1405
1 files changed, 0 insertions, 1405 deletions
diff --git a/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java b/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java
deleted file mode 100644
index de356c9..0000000
--- a/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java
+++ /dev/null
@@ -1,1405 +0,0 @@
-/**
- * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
- * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
- * joint initiative of the Federal Chancellery Austria and Graz University of
- * Technology.
- *
- * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
- * the European Commission - subsequent versions of the EUPL (the "Licence");
- * You may not use this work except in compliance with the Licence.
- * You may obtain a copy of the Licence at:
- * http://www.osor.eu/eupl/
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the Licence is distributed on an "AS IS" basis,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the Licence for the specific language governing permissions and
- * limitations under the Licence.
- *
- * This product combines work with different licenses. See the "NOTICE" text
- * file for details on the various modules and licenses.
- * The "NOTICE" text file is part of the distribution. Any derivative works
- * that you distribute must include a readable copy of the "NOTICE" text file.
- *
- * $Id: PDFUtils.java,v 1.1 2006/08/25 17:00:59 wprinz Exp $
- */
-package at.knowcenter.wag.exactparser.parsing;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.log4j.Logger;
-
-import at.knowcenter.wag.exactparser.ByteArrayUtils;
-import at.knowcenter.wag.exactparser.parsing.results.ArrayParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.BooleanParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.DictionaryParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.EOFParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.FooterParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.HeaderParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.HexStringParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.IndirectObjectReferenceParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.IntegerParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.LiteralStringParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.NameParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.NullParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.NumberParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.ObjectHeaderParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.ObjectParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.ParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.StartXRefParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.StreamParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.TrailerParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.XRefLineParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.XRefSectionParseResult;
-import at.knowcenter.wag.exactparser.parsing.results.XRefSubSectionParseResult;
-
-
-
-/**
- * Abstract class that contains several static utility methods for parsing and
- * analyzing PDF documents on the lowest level.
- *
- * <p>
- * Most operations require random access to the PDF data (mostly to verify the
- * syntax). So the whole PDF document has to be provided as a byte array. The
- * term "pdf+index" states a specific position index within this byte array.
- * </p>
- *
- * @author wprinz
- *
- */
-public abstract class PDFUtils
-{
- // log4j logger obtained for this class.
- private static Logger log = Logger.getLogger(PDFUtils.class);
-
- /**
-  * Checks a byte against the PDF whitespace character set.
-  *
-  * @param data
-  *          The byte to classify.
-  * @return Returns the outcome of looking the byte up in
-  *         PDFNames.WHITESPACE_CHARACTERS.
-  */
- public static boolean isWhitespace(final byte data)
- {
-   final boolean listed_as_whitespace = ByteArrayUtils.contains(PDFNames.WHITESPACE_CHARACTERS, data);
-   return listed_as_whitespace;
- }
-
- /**
-  * Checks a byte against the PDF delimiter character set.
-  *
-  * @param data
-  *          The byte to classify.
-  * @return Returns the outcome of looking the byte up in
-  *         PDFNames.DELIMITER_CHARACTERS.
-  */
- public static boolean isDelimiter(final byte data)
- {
-   final boolean listed_as_delimiter = ByteArrayUtils.contains(PDFNames.DELIMITER_CHARACTERS, data);
-   return listed_as_delimiter;
- }
-
- /**
-  * Tells whether a byte is a regular character, i.e. neither whitespace nor
-  * a delimiter.
-  *
-  * @param data
-  *          The byte to classify.
-  * @return Returns true if the byte is neither whitespace nor a delimiter.
-  */
- protected static boolean isRegular(final byte data)
- {
-   if (isWhitespace(data))
-   {
-     return false;
-   }
-   return !isDelimiter(data);
- }
-
- /**
-  * Advances over a run of whitespace.
-  *
-  * <p>
-  * The run may consist of zero, one or many whitespace characters. Newline
-  * characters count as whitespace and are skipped as well.
-  * </p>
-  * <p>
-  * No bounds check is made: if every byte from index up to the end of the
-  * data is whitespace, the array access runs past the end of the array.
-  * </p>
-  *
-  * @param data
-  *          The PDF data.
-  * @param index
-  *          The index to start at.
-  * @return Returns the index of the first non whitespace character at or
-  *         after index. This equals index if there is no whitespace at index.
-  */
- public static int skipWhitespace(final byte[] data, final int index)
- {
-   int cursor = index;
-   for (; isWhitespace(data[cursor]); cursor++)
-   {
-     // nothing to do - the loop header advances over the whitespace run
-   }
-   return cursor;
- }
-
- /**
-  * Advances over a run of non whitespace bytes.
-  *
-  * <p>
-  * The run may be empty. No bounds check is made: if no whitespace follows
-  * before the end of the data, the array access runs past the end of the
-  * array.
-  * </p>
-  *
-  * @param data
-  *          The PDF data.
-  * @param index
-  *          The index to start at.
-  * @return Returns the index of the first whitespace character at or after
-  *         index. This equals index if the byte at index is whitespace.
-  */
- public static int skipToWhitespace(final byte[] data, final int index)
- {
-   int cursor = index;
-   for (; !isWhitespace(data[cursor]); cursor++)
-   {
-     // nothing to do - the loop header advances over the non whitespace run
-   }
-   return cursor;
- }
-
- // Two byte line terminator: carriage return followed by line feed.
- protected static final byte[] LINE_TERMINATOR_CRLF = {
- PDFNames.WHITESPACE_CR, PDFNames.WHITESPACE_LF };
-
- // Single carriage return; the newline helpers in this class accept it as a
- // line terminator as well.
- protected static final byte[] LINE_TERMINATOR_CRALONE = { PDFNames.WHITESPACE_CR };
-
- // Single line feed.
- protected static final byte[] LINE_TERMINATOR_LF = { PDFNames.WHITESPACE_LF };
-
- /**
-  * Tells whether a line terminator starts at data+index.
-  *
-  * <p>
-  * LF, CR LF and a lone CR are accepted. The lone CR is not specified by PDF,
-  * but some applications use it as line terminator.
-  * </p>
-  *
-  * @param data
-  *          The PDF data.
-  * @param index
-  *          The index to test.
-  * @return Returns true if one of the line terminators matches at index.
-  */
- public static boolean isNewline(final byte[] data, final int index)
- {
-   return ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_LF)
-       || ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRLF)
-       || ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRALONE);
- }
-
- /**
-  * Skips exactly one line terminator that is expected at data+index.
-  *
-  * <p>
-  * The terminators are tried in the order LF, CR LF, lone CR (the lone CR is
-  * not specified by PDF, but some applications use it). If none matches, an
-  * assertion fails; with assertions disabled the index is returned unchanged.
-  * </p>
-  *
-  * @param data
-  *          The PDF data.
-  * @param index
-  *          The index of the expected line terminator.
-  * @return Returns the index of the first byte after the line terminator.
-  */
- public static int skipNewline(final byte[] data, final int index)
- {
-   final byte[][] terminators = { LINE_TERMINATOR_LF, LINE_TERMINATOR_CRLF, LINE_TERMINATOR_CRALONE };
-   for (int t = 0; t < terminators.length; t++)
-   {
-     if (ByteArrayUtils.compareByteArrays(data, index, terminators[t]))
-     {
-       return index + terminators[t].length;
-     }
-   }
-
-   assert false : "don't call this if you don't expect a newline - call skipWhitespace instead";
-   return index;
- }
-
- /**
-  * Searches forward for the next line terminator and skips it.
-  *
-  * <p>
-  * Starting at index, every position is tested for LF, CR LF and a lone CR
-  * (the lone CR is not specified by PDF, but some applications use it as line
-  * terminator). The scan has no explicit end condition of its own, so the
-  * data is expected to contain a line terminator at or after index.
-  * </p>
-  *
-  * @param data
-  *          The PDF data.
-  * @param index
-  *          The index to start searching at.
-  * @return Returns the index of the first byte after the first line
-  *         terminator found at or after index.
-  */
- public static int skipToNewline(final byte[] data, final int index)
- {
-   int current_index = index;
-   for (;;)
-   {
-     if (ByteArrayUtils.compareByteArrays(data, current_index, LINE_TERMINATOR_LF))
-     {
-       return current_index + LINE_TERMINATOR_LF.length;
-     }
-     // All checks must look at the moving position, not at the start index,
-     // otherwise a CR further along the line is never detected.
-     if (ByteArrayUtils.compareByteArrays(data, current_index, LINE_TERMINATOR_CRLF))
-     {
-       return current_index + LINE_TERMINATOR_CRLF.length;
-     }
-     if (ByteArrayUtils.compareByteArrays(data, current_index, LINE_TERMINATOR_CRALONE))
-     {
-       return current_index + LINE_TERMINATOR_CRALONE.length;
-     }
-     current_index++;
-   }
- }
-
- /**
-  * Parses one of the boolean keywords at pdf+index.
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the keyword.
-  * @return Returns the parse result holding the value and the index right
-  *         after the keyword.
-  * @throws RuntimeException
-  *           Thrown if neither PDFNames.TRUE_STR nor PDFNames.FALSE_STR
-  *           matches at index.
-  */
- public static BooleanParseResult parseBoolean(final byte[] pdf,
-     final int index)
- {
-   BooleanParseResult result = new BooleanParseResult();
-   result.start_index = index;
-
-   final boolean is_true = ByteArrayUtils.compareByteArrays(pdf, result.start_index, PDFNames.TRUE_STR);
-   if (!is_true && !ByteArrayUtils.compareByteArrays(pdf, result.start_index, PDFNames.FALSE_STR))
-   {
-     throw new RuntimeException("Boolean couldn't be parsed at index " + index);
-   }
-
-   final byte[] keyword = is_true ? PDFNames.TRUE_STR : PDFNames.FALSE_STR;
-   result.value = is_true;
-   result.next_index = result.start_index + keyword.length;
-   return result;
- }
-
- /**
-  * Tells whether a byte is a sign character.
-  *
-  * @param data
-  *          The byte to classify.
-  * @return Returns true for '+' and '-', false for everything else.
-  */
- public static boolean isSign(final byte data)
- {
-   switch (data)
-   {
-     case '+':
-     case '-':
-       return true;
-     default:
-       return false;
-   }
- }
-
- /**
-  * Tells whether a byte is an ASCII decimal digit.
-  *
-  * @param data
-  *          The byte to classify.
-  * @return Returns true for '0' to '9', false for everything else.
-  */
- public static boolean isNumeric(final byte data)
- {
-   if (data < '0')
-   {
-     return false;
-   }
-   return data <= '9';
- }
-
- /**
-  * Reads a non negative integer number from the data.
-  *
-  * <p>
-  * This is a thin wrapper around parseNumberFromByteArray that hands back
-  * only the integer value of the parse result. An assertion checks that the
-  * value is not negative.
-  * </p>
-  *
-  * @param data
-  *          The data.
-  * @param index
-  *          The index the number starts at.
-  * @return Returns the read number.
-  */
- public static int readNumberFromByteArray(final byte[] data, final int index)
- {
-   final int value = parseNumberFromByteArray(data, index).number;
-
-   assert value >= 0;
-   return value;
- }
-
- /**
-  * Parses an unsigned integer.
-  *
-  * <p>
-  * The integer is the block of successive digit characters starting at
-  * index. A leading sign (not even '+') is not accepted.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the first digit.
-  * @return Returns the parse result holding the value and the index of the
-  *         first byte after the digits.
-  */
- public static IntegerParseResult parseUnsignedInteger(final byte[] pdf,
-     final int index)
- {
-   assert isNumeric(pdf[index]);
-
-   // collect the digit run
-   final StringBuilder digits = new StringBuilder();
-   int end_index = index;
-   for (; isNumeric(pdf[end_index]); end_index++)
-   {
-     digits.append((char) pdf[end_index]);
-   }
-
-   final int parsed_value = Integer.parseInt(digits.toString());
-   assert parsed_value >= 0;
-
-   IntegerParseResult result = new IntegerParseResult();
-   result.start_index = index;
-   result.next_index = end_index;
-   result.number = parsed_value;
-   return result;
- }
-
- /**
-  * Parses an integer that may carry a sign.
-  *
-  * <p>
-  * An optional '+' or '-' is consumed first, the following block of digits is
-  * then parsed by parseUnsignedInteger and the sign is applied to the value.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the sign or of the first digit.
-  * @return Returns the parse result; its start index is the given index
-  *         (including the sign, if any).
-  */
- public static IntegerParseResult parseInteger(final byte[] pdf,
-     final int index)
- {
-   assert isSign(pdf[index]) || isNumeric(pdf[index]);
-
-   int factor = +1;
-   int digits_start = index;
-   switch (pdf[index])
-   {
-     case '+':
-       digits_start++;
-       break;
-     case '-':
-       factor = -1;
-       digits_start++;
-       break;
-     default:
-       assert isNumeric(pdf[index]);
-   }
-
-   IntegerParseResult result = parseUnsignedInteger(pdf, digits_start);
-   result.start_index = index;
-   result.number *= factor;
-   return result;
- }
-
- /**
-  * Parses an arbitrary number (integer or real) at pdf+index.
-  *
-  * <p>
-  * An optional sign is consumed, followed by the block of digit and '.'
-  * characters. If that block is a valid int, the signed value is stored in
-  * the result's number field. Otherwise it is parsed as a float and stored
-  * in the floating field; the number field is then left untouched.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the sign or of the first digit.
-  * @return Returns the result of the parsing operation, including the start
-  *         index and the index of the first byte after the number.
-  * @throws NumberFormatException
-  *           Thrown if the collected characters are neither a valid int nor
-  *           a valid float.
-  */
- public static NumberParseResult parseNumberFromByteArray(final byte[] pdf,
-     int index)
- {
-   assert isSign(pdf[index]) || isNumeric(pdf[index]);
-
-   // remember where the number (including its sign) begins
-   final int number_start = index;
-
-   int sign = +1;
-   if (isSign(pdf[index]))
-   {
-     if (pdf[index] == '-')
-     {
-       sign = -1;
-     }
-     index++;
-   }
-
-   final StringBuilder number = new StringBuilder();
-   while (isNumeric(pdf[index]) || pdf[index] == '.')
-   {
-     number.append((char) pdf[index]);
-     index++;
-   }
-
-   NumberParseResult npr = new NumberParseResult();
-   // record the start position like every other parser in this class does
-   npr.start_index = number_start;
-   npr.next_index = index;
-   try
-   {
-     npr.number = Integer.parseInt(number.toString()) * sign;
-   }
-   catch (NumberFormatException e)
-   {
-     // not an int (e.g. contains a '.') - fall back to a real number
-     npr.floating = Float.parseFloat(number.toString()) * sign;
-   }
-
-   return npr;
- }
-
- /**
-  * Locates the last "startxref" entry of the document.
-  *
-  * <p>
-  * The lookup is delegated to ByteArrayUtils.lastIndexOf with
-  * PDFNames.STARTXREF_STR as the sought byte sequence.
-  * </p>
-  *
-  * @param pdf
-  *          The complete PDF file data.
-  * @return Returns the offset (byte index) of the "startxref" entry.
-  */
- public static int findLastStartXRef(final byte[] pdf)
- {
-   final int startxref_offset = ByteArrayUtils.lastIndexOf(pdf, PDFNames.STARTXREF_STR);
-   return startxref_offset;
- }
-
- /**
-  * Parses the xref section at pdf+index.
-  *
-  * <p>
-  * An xref section starts with 'xref' and contains xref sub-sections. The
-  * sub-sections are read one after another until the 'trailer' keyword is
-  * reached.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The start index of the xref table.
-  * @return Returns the parse result; its next index points at the 'trailer'
-  *         keyword.
-  */
- public static XRefSectionParseResult parseXRefSection(final byte[] pdf,
-     final int index)
- {
-   XRefSectionParseResult section = new XRefSectionParseResult();
-   section.start_index = index;
-
-   assert ByteArrayUtils.compareByteArrays(pdf, section.start_index, PDFNames.XREF_STR);
-   assert isNewline(pdf, section.start_index + PDFNames.XREF_STR.length);
-
-   int cursor = skipWhitespace(pdf, section.start_index + PDFNames.XREF_STR.length);
-
-   // everything up to the trailer keyword is a sequence of sub-sections
-   while (!ByteArrayUtils.compareByteArrays(pdf, cursor, PDFNames.TRAILER_STR))
-   {
-     XRefSubSectionParseResult subsection = parseXRefSubSection(pdf, cursor);
-     section.appendXRefSubSection(subsection);
-     cursor = subsection.next_index;
-   }
-
-   section.next_index = cursor;
-   assert ByteArrayUtils.compareByteArrays(pdf, section.next_index, PDFNames.TRAILER_STR);
-
-   return section;
- }
-
- /**
-  * Parses a xref sub-section.
-  *
-  * <p>
-  * A sub-section consists of a header line (first object number and number
-  * of entries) followed by that many xref lines. For every line marked as
-  * in use ('n') the object header at the stated offset is parsed and
-  * compared against the expected object and generation number by assertions.
-  * This simple check may cause problems with object streams and xref
-  * streams.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the sub-section header.
-  * @return Returns the parse result; its next index is the index after the
-  *         last xref line.
-  */
- public static XRefSubSectionParseResult parseXRefSubSection(final byte[] pdf,
-     final int index)
- {
-   XRefSubSectionParseResult subsection = new XRefSubSectionParseResult();
-   subsection.start_index = index;
-
-   // header: "<first object number> <number of entries>"
-   NumberParseResult first_number = parseNumberFromByteArray(pdf, subsection.start_index);
-   subsection.start_obj_number = first_number.number;
-   assert subsection.start_obj_number >= 0;
-
-   assert isWhitespace(pdf[first_number.next_index]);
-   final int count_index = skipWhitespace(pdf, first_number.next_index);
-
-   NumberParseResult entry_count = parseNumberFromByteArray(pdf, count_index);
-   subsection.num_objects = entry_count.number;
-
-   assert isWhitespace(pdf[entry_count.next_index]);
-   int line_start = skipWhitespace(pdf, entry_count.next_index);
-
-   int lines_read = 0;
-   while (lines_read < subsection.num_objects)
-   {
-     final int expected_object_number = subsection.start_obj_number + lines_read;
-
-     XRefLineParseResult line = parseXrefLine(pdf, line_start);
-     subsection.appendXRefLine(line);
-
-     if (line.object_usage == 'n')
-     {
-       // cross-check the entry against the object it points to
-       ObjectHeaderParseResult header = parseObjectHeader(pdf, line.object_offset);
-       assert header.object_number == expected_object_number;
-       assert header.generation_number == line.generation_number;
-     }
-
-     line_start = line.next_index;
-     lines_read++;
-   }
-
-   subsection.next_index = line_start;
-   return subsection;
- }
-
- /**
-  * Parses a single 20 bytes xref line at pdf+index.
-  *
-  * <p>
-  * The line is read as an unsigned object offset, one space, an unsigned
-  * generation number, one space, the usage flag and a two byte end of line.
-  * The fixed layout (offset ends at +10, generation number ends at +16, line
-  * ends at +20) is verified by assertions only.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the first byte of the line.
-  * @return Returns the parse result; its next index is the index of the usage
-  *         flag plus 3.
-  */
- public static XRefLineParseResult parseXrefLine(final byte[] pdf,
-     final int index)
- {
-   XRefLineParseResult line = new XRefLineParseResult();
-   line.start_index = index;
-
-   // object offset
-   IntegerParseResult offset = parseUnsignedInteger(pdf, line.start_index);
-   line.object_offset = offset.number;
-   assert line.object_offset >= 0;
-   assert line.object_offset < pdf.length;
-   assert offset.next_index == line.start_index + 10;
-
-   // the standard explicitly says 1 single SPACE
-   assert pdf[offset.next_index] == PDFNames.WHITESPACE_SP;
-
-   // generation number
-   IntegerParseResult generation = parseUnsignedInteger(pdf, offset.next_index + 1);
-   line.generation_number = generation.number;
-   assert generation.next_index == line.start_index + 16;
-
-   assert pdf[generation.next_index] == PDFNames.WHITESPACE_SP;
-
-   // usage flag
-   final int flag_index = generation.next_index + 1;
-   line.object_usage = pdf[flag_index];
-   assert line.object_usage == 'n' || line.object_usage == 'f';
-
-   // two byte end of line: either SP followed by CR or LF, or CR LF
-   final boolean space_padded = pdf[flag_index + 1] == PDFNames.WHITESPACE_SP;
-   if (!space_padded)
-   {
-     assert pdf[flag_index + 1] == PDFNames.WHITESPACE_CR;
-     assert pdf[flag_index + 2] == PDFNames.WHITESPACE_LF;
-   }
-   else
-   {
-     assert pdf[flag_index + 2] == PDFNames.WHITESPACE_CR || pdf[flag_index + 2] == PDFNames.WHITESPACE_LF;
-   }
-
-   line.next_index = flag_index + 3;
-
-   assert line.next_index == line.start_index + 20;
-
-   return line;
- }
-
- /**
-  * Searches a list of parsed names for a given name.
-  *
-  * @param pdf
-  *          The PDF data the name parse results refer to.
-  * @param names
-  *          The list of NameParseResult objects to search.
-  * @param sought
-  *          The bytes of the name to look for.
-  * @return Returns the list position of the first entry whose name start
-  *         index matches the sought bytes, or -1 if there is none.
-  */
- public static int indexOfName(final byte[] pdf, List names,
-     byte[] sought)
- {
-   int position = 0;
-   for (Iterator it = names.iterator(); it.hasNext(); position++)
-   {
-     NameParseResult candidate = (NameParseResult) it.next();
-     if (ByteArrayUtils.compareByteArrays(pdf, candidate.name_start_index, sought))
-     {
-       return position;
-     }
-   }
-   return -1;
- }
-
- /**
-  * Parses the trailer at pdf+index.
-  *
-  * <p>
-  * The trailer dictionary is parsed with parseDictionary. The Info and Root
-  * references and the Prev offset are taken over if the dictionary contains
-  * them. The Size entry is read unconditionally, so a trailer dictionary
-  * without Size makes the list lookup fail.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the 'trailer' keyword.
-  * @return Returns the parse result; its next index is the first non
-  *         whitespace index after the trailer dictionary.
-  */
- public static TrailerParseResult parseTrailer(final byte[] pdf,
-     final int index)
- {
-   TrailerParseResult trailer = new TrailerParseResult();
-   trailer.start_index = index;
-   trailer.has_predecessor = false;
-
-   assert ByteArrayUtils.compareByteArrays(pdf, trailer.start_index, PDFNames.TRAILER_STR);
-
-   trailer.contents_index = skipWhitespace(pdf, trailer.start_index + PDFNames.TRAILER_STR.length);
-
-   final int dictionary_start = skipWhitespace(pdf, trailer.contents_index);
-   assert ByteArrayUtils.compareByteArrays(pdf, dictionary_start, PDFNames.DICT_START_STR);
-
-   trailer.dpr = parseDictionary(pdf, dictionary_start);
-   final int dictionary_end = trailer.dpr.next_index;
-
-   // optional /Info reference
-   final int info_position = indexOfName(pdf, trailer.dpr.names, PDFNames.INFO_STR);
-   if (info_position >= 0)
-   {
-     trailer.info = (IndirectObjectReferenceParseResult) trailer.dpr.values.get(info_position);
-   }
-
-   // optional /Root reference
-   final int root_position = indexOfName(pdf, trailer.dpr.names, PDFNames.ROOT_STR);
-   if (root_position >= 0)
-   {
-     trailer.root = (IndirectObjectReferenceParseResult) trailer.dpr.values.get(root_position);
-   }
-
-   // /Size is looked up without a presence check
-   final int size_position = indexOfName(pdf, trailer.dpr.names, PDFNames.SIZE_STR);
-   trailer.size = ((NumberParseResult) trailer.dpr.values.get(size_position)).number;
-
-   // optional /Prev offset
-   final int prev_position = indexOfName(pdf, trailer.dpr.names, PDFNames.PREV_STR);
-   if (prev_position >= 0)
-   {
-     trailer.has_predecessor = true;
-     trailer.setPrev(((NumberParseResult) trailer.dpr.values.get(prev_position)).number);
-   }
-
-   trailer.contents_end_index = dictionary_end;
-   trailer.next_index = skipWhitespace(pdf, trailer.contents_end_index);
-
-   assert ByteArrayUtils.compareByteArrays(pdf, trailer.next_index, PDFNames.STARTXREF_STR);
-   return trailer;
- }
-
- /**
-  * Parses the startxref section at pdf+index.
-  *
-  * <p>
-  * The section consists of the 'startxref' keyword, the byte offset of the
-  * xref section and is followed by the EOF marker. The layout is verified by
-  * assertions only.
-  * </p>
-  *
-  * @param pdf
-  *          The complete PDF file data.
-  * @param index
-  *          The index of the startxref section.
-  * @return Returns the result of the parsing operation; its next index points
-  *         at the EOF marker.
-  */
- public static StartXRefParseResult parseStartXRef(final byte[] pdf,
-     final int index)
- {
-   StartXRefParseResult spr = new StartXRefParseResult();
-   // record where the section begins (next_index is set further below)
-   spr.start_index = index;
-
-   assert ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.STARTXREF_STR);
-   assert isNewline(pdf, index + PDFNames.STARTXREF_STR.length);
-
-   int index_of_number = skipWhitespace(pdf, index + PDFNames.STARTXREF_STR.length);
-   NumberParseResult npr = parseNumberFromByteArray(pdf, index_of_number);
-   spr.xref_index = npr.number;
-
-   assert isNewline(pdf, npr.next_index);
-   spr.next_index = skipWhitespace(pdf, npr.next_index);
-
-   assert ByteArrayUtils.compareByteArrays(pdf, spr.next_index, PDFNames.EOF_STR);
-
-   assert spr.xref_index >= 0;
-   assert spr.xref_index < pdf.length;
-
-   // A linearized document sets the startxref value of the first page's footer
-   // to 0.
-   if (spr.xref_index != 0)
-   {
-     assert ByteArrayUtils.compareByteArrays(pdf, spr.xref_index, PDFNames.XREF_STR);
-   }
-
-   return spr;
- }
-
- /**
-  * Parses the End Of File (EOF) marker at pdf+index.
-  *
-  * <p>
-  * The EOF marker is not necessarily terminated with a newline, so no
-  * trailing newline is consumed: the next index equals the end index of the
-  * marker.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the EOF marker.
-  * @return Returns the result of the parsing operation.
-  */
- public static EOFParseResult parseEOF(final byte[] pdf, final int index)
- {
-   EOFParseResult marker = new EOFParseResult();
-   marker.start_index = index;
-
-   assert ByteArrayUtils.compareByteArrays(pdf, marker.start_index, PDFNames.EOF_STR);
-
-   final int end_of_marker = marker.start_index + PDFNames.EOF_STR.length;
-   marker.eof_end_index = end_of_marker;
-   marker.next_index = marker.eof_end_index;
-
-   return marker;
- }
-
- /**
-  * Tells whether an indirect object reference starts at pdf+index.
-  *
-  * <p>
-  * A reference is accepted if it consists of a positive object number,
-  * whitespace, a non negative generation number, whitespace and the
-  * reference keyword (PDFNames.REFERENCE_STR). Nothing is allocated besides
-  * the intermediate number parse results.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index to test.
-  * @return Returns true if the data at index has the form of an indirect
-  *         object reference, false otherwise.
-  */
- public static boolean isIndirectObjectReference(final byte[] pdf,
-     final int index)
- {
-   // object number: must start with a digit and be positive
-   if (!PDFUtils.isNumeric(pdf[index]))
-   {
-     return false;
-   }
-   NumberParseResult object_number_npr = parseNumberFromByteArray(pdf, index);
-   if (object_number_npr.number <= 0)
-   {
-     return false;
-   }
-
-   if (!isWhitespace(pdf[object_number_npr.next_index]))
-   {
-     return false;
-   }
-   int generation_number_index = skipWhitespace(pdf, object_number_npr.next_index);
-
-   // generation number: must start with a digit and not be negative
-   if (!PDFUtils.isNumeric(pdf[generation_number_index]))
-   {
-     return false;
-   }
-   NumberParseResult generation_number_npr = parseNumberFromByteArray(pdf, generation_number_index);
-   if (generation_number_npr.number < 0)
-   {
-     return false;
-   }
-
-   if (!isWhitespace(pdf[generation_number_npr.next_index]))
-   {
-     return false;
-   }
-   int R_index = skipWhitespace(pdf, generation_number_npr.next_index);
-
-   // finally the reference keyword itself
-   return ByteArrayUtils.compareByteArrays(pdf, R_index, PDFNames.REFERENCE_STR);
- }
-
- /**
-  * Parses an indirect object reference at pdf+index.
-  *
-  * <p>
-  * The form of the reference (object number, generation number, reference
-  * keyword, separated by whitespace) is verified by assertions only.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the object number.
-  * @return Returns the parse result; its next index is the index right after
-  *         the reference keyword.
-  */
- public static IndirectObjectReferenceParseResult parseIndirectObjectReference(
-     final byte[] pdf, final int index)
- {
-   assert isIndirectObjectReference(pdf, index);
-
-   IndirectObjectReferenceParseResult reference = new IndirectObjectReferenceParseResult();
-   reference.ior = new IndirectObjectReference();
-   reference.start_index = index;
-
-   // object number
-   NumberParseResult object_number = parseNumberFromByteArray(pdf, reference.start_index);
-   reference.ior.object_number = object_number.number;
-   assert reference.ior.object_number > 0;
-   assert isWhitespace(pdf[object_number.next_index]);
-
-   // generation number
-   NumberParseResult generation_number = parseNumberFromByteArray(pdf, skipWhitespace(pdf, object_number.next_index));
-   reference.ior.generation_number = generation_number.number;
-   assert reference.ior.generation_number >= 0;
-   assert isWhitespace(pdf[generation_number.next_index]);
-
-   // reference keyword
-   final int keyword_index = skipWhitespace(pdf, generation_number.next_index);
-   assert ByteArrayUtils.compareByteArrays(pdf, keyword_index, PDFNames.REFERENCE_STR);
-
-   reference.next_index = keyword_index + PDFNames.REFERENCE_STR.length;
-   return reference;
- }
-
- /**
-  * Parses the object header at pdf+index.
-  *
-  * <p>
-  * The header consists of the object number, the generation number and the
-  * object keyword (PDFNames.OBJ_STR), separated by whitespace. Not all PDF
-  * writers put a newline after the keyword, so any whitespace following it
-  * is skipped instead of requiring a newline.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the object number.
-  * @return Returns the parse result; its next index is the first non
-  *         whitespace index after the object keyword.
-  */
- public static ObjectHeaderParseResult parseObjectHeader(final byte[] pdf,
-     final int index)
- {
-   ObjectHeaderParseResult header = new ObjectHeaderParseResult();
-   header.start_index = index;
-
-   // object number
-   NumberParseResult object_number = parseNumberFromByteArray(pdf, header.start_index);
-   header.object_number = object_number.number;
-   assert header.object_number > 0;
-   assert isWhitespace(pdf[object_number.next_index]);
-
-   // generation number
-   NumberParseResult generation_number = parseNumberFromByteArray(pdf, skipWhitespace(pdf, object_number.next_index));
-   header.generation_number = generation_number.number;
-   assert header.generation_number >= 0;
-   assert isWhitespace(pdf[generation_number.next_index]);
-
-   // object keyword
-   final int keyword_index = skipWhitespace(pdf, generation_number.next_index);
-   assert ByteArrayUtils.compareByteArrays(pdf, keyword_index, PDFNames.OBJ_STR);
-
-   header.next_index = skipWhitespace(pdf, keyword_index + PDFNames.OBJ_STR.length);
-   return header;
- }
-
- /**
-  * Parses a complete object (header, content and end keyword) at pdf+index.
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the object header.
-  * @return Returns the parse result; its next index is the index right after
-  *         the end keyword (PDFNames.ENDOBJ_STR). A newline following the
-  *         keyword is not consumed.
-  */
- public static ObjectParseResult parseObject(final byte[] pdf, final int index)
- {
-   ObjectParseResult result = new ObjectParseResult();
-   result.start_index = index;
-
-   result.header = parseObjectHeader(pdf, result.start_index);
-   result.content_index = result.header.next_index;
-
-   final int content_start = skipWhitespace(pdf, result.content_index);
-   result.object = parseUnknownObject(pdf, content_start);
-
-   result.end_of_content_index = skipWhitespace(pdf, result.object.next_index);
-   assert ByteArrayUtils.compareByteArrays(pdf, result.end_of_content_index, PDFNames.ENDOBJ_STR);
-
-   result.next_index = result.end_of_content_index + PDFNames.ENDOBJ_STR.length;
-
-   return result;
- }
-
- /**
-  * Parses the object at pdf+index after determining its type from the first
-  * bytes.
-  *
-  * <p>
-  * The checks are made in this order: dictionary (which becomes a stream if
-  * the stream keyword follows), null, boolean, indirect object reference or
-  * number, and finally literal string, hex string, array and name.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the first byte of the object.
-  * @return Returns the parse result of the specific parser that was chosen.
-  * @throws RuntimeException
-  *           Thrown if the first byte does not start any of the known object
-  *           types.
-  */
- public static ParseResult parseUnknownObject(final byte[] pdf, final int index)
- {
-   if (ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.DICT_START_STR))
-   {
-     DictionaryParseResult dpr = parseDictionary(pdf, index);
-
-     // a dictionary directly followed by the stream keyword is a stream
-     int possible_stream_index = skipWhitespace(pdf, dpr.next_index);
-     if (ByteArrayUtils.compareByteArrays(pdf, possible_stream_index, PDFNames.STREAM_STR))
-     {
-       return parseStream(pdf, possible_stream_index, dpr);
-     }
-
-     return dpr;
-   }
-
-   if (ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.NULL_STR))
-   {
-     return parseNull(pdf, index);
-   }
-
-   if (ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.TRUE_STR) || ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.FALSE_STR))
-   {
-     return parseBoolean(pdf, index);
-   }
-
-   final byte first_byte = pdf[index];
-
-   if (isNumeric(first_byte) || isSign(first_byte))
-   {
-     // try to parse an indirect reference first - if this fails, parse a number
-     if (isIndirectObjectReference(pdf, index))
-     {
-       return parseIndirectObjectReference(pdf, index);
-     }
-
-     return parseNumberFromByteArray(pdf, index);
-   }
-
-   ParseResult pr = null;
-
-   switch (first_byte)
-   {
-     case PDFNames.DELIMITER_STRING_OPEN:
-       pr = parseLiteralString(pdf, index);
-       break;
-     case PDFNames.DELIMITER_HEXSTRING_OPEN:
-       pr = parseHexString(pdf, index);
-       break;
-     case PDFNames.DELIMITER_ARRAY_OPEN:
-       pr = parseArray(pdf, index);
-       break;
-     case PDFNames.DELIMITER_NAME:
-       pr = parseName(pdf, index);
-       break;
-     default:
-       // the offending byte value is quoted on both sides
-       throw new RuntimeException("Unknown first_byte '" + first_byte + "' when parsing an unknown object at index=" + index + ".");
-   }
-   assert pr != null;
-
-   return pr;
- }
-
- /**
-  * Parses a literal string.
-  *
-  * <p>
-  * A literal string is a string of characters enclosed by '(' and ')'.
-  * Balanced pairs of '(' and ')' are allowed within the string. Unbalanced '('
-  * or ')' must be escaped as '\(' or '\)'.
-  * </p>
-  * <p>
-  * A backslash always escapes the byte that follows it, so both bytes are
-  * skipped together. This also handles an escaped backslash directly in front
-  * of a parenthesis, e.g. the string <code>(abc\\)</code>, whose closing
-  * parenthesis is not escaped.
-  * </p>
-  *
-  * @param pdf
-  *          The PDF data.
-  * @param index
-  *          The index of the opening parenthesis.
-  * @return Returns the result of the parsing operation; the content end index
-  *         is the index of the closing parenthesis, the next index is the one
-  *         after it.
-  */
- public static LiteralStringParseResult parseLiteralString(final byte[] pdf,
-     final int index)
- {
-   LiteralStringParseResult lspr = new LiteralStringParseResult();
-   lspr.start_index = index;
-
-   assert pdf[lspr.start_index] == PDFNames.DELIMITER_STRING_OPEN;
-
-   lspr.content_start_index = lspr.start_index + 1;
-
-   int cur_index = lspr.content_start_index;
-   // number of currently open, unescaped parentheses inside the string
-   int parenthesis_stack = 0;
-   for (;;)
-   {
-     final byte current = pdf[cur_index];
-
-     if (current == '\\')
-     {
-       // skip the backslash and the byte it escapes, whatever that byte is
-       cur_index += 2;
-       continue;
-     }
-
-     if (current == PDFNames.DELIMITER_STRING_OPEN)
-     {
-       parenthesis_stack++;
-     }
-     else if (current == PDFNames.DELIMITER_STRING_CLOSE)
-     {
-       if (parenthesis_stack == 0)
-       {
-         // this is the parenthesis that closes the string itself
-         break;
-       }
-       parenthesis_stack--;
-     }
-
-     cur_index++;
-   }
-
-   lspr.content_end_index = cur_index;
-   assert pdf[lspr.content_end_index] == PDFNames.DELIMITER_STRING_CLOSE;
-
-   lspr.next_index = lspr.content_end_index + 1;
-
-   return lspr;
- }
-
- protected static boolean isHex(final byte data)
- {
- return isNumeric(data) || ('a' <= data && data <= 'f') || ('A' <= data && data <= 'f');
- }
-
- /**
- * Parses a hexadecimal string.
- *
- * @param pdf
- * The PDF data.
- * @param index
- * The index.
- * @return Returns the result of the parsing operation.
- */
- public static HexStringParseResult parseHexString(final byte[] pdf,
- final int index)
- {
- HexStringParseResult hspr = new HexStringParseResult();
- hspr.start_index = index;
-
- assert pdf[hspr.start_index] == PDFNames.DELIMITER_HEXSTRING_OPEN;
-
- hspr.content_start_index = hspr.start_index + 1;
-
- int cur_index = hspr.content_start_index;
- while (isHex(pdf[cur_index]) || isWhitespace(pdf[cur_index]))
- {
- cur_index++;
- }
-
- hspr.content_end_index = cur_index;
- assert pdf[hspr.content_end_index] == PDFNames.DELIMITER_HEXSTRING_CLOSE;
-
- hspr.next_index = hspr.content_end_index + 1;
-
- return hspr;
- }
-
- public static ArrayParseResult parseArray(final byte[] pdf, final int index)
- {
- ArrayParseResult apr = new ArrayParseResult();
- apr.start_index = index;
- assert pdf[apr.start_index] == PDFNames.DELIMITER_ARRAY_OPEN;
-
- apr.content_start_index = apr.start_index + 1;
-
- apr.elements = new ArrayList();
-
- int cur_index = skipWhitespace(pdf, apr.content_start_index);
- for (;;)
- {
- if (pdf[cur_index] == PDFNames.DELIMITER_ARRAY_CLOSE)
- {
- break;
- }
-
- ParseResult pr = parseUnknownObject(pdf, cur_index);
- apr.elements.add(pr);
-
- cur_index = skipWhitespace(pdf, pr.next_index);
- }
- assert pdf[cur_index] == PDFNames.DELIMITER_ARRAY_CLOSE;
-
- apr.content_end_index = cur_index;
- assert pdf[apr.content_end_index] == PDFNames.DELIMITER_ARRAY_CLOSE;
-
- apr.next_index = apr.content_end_index + 1;
- return apr;
- }
-
- /**
- * Parses a PDF Name.
- *
- * @param pdf
- * The PDF data.
- * @param index
- * The index.
- * @return Returns the result of this parsing operation.
- */
- public static NameParseResult parseName(final byte[] pdf, final int index)
- {
- NameParseResult npr = new NameParseResult();
- npr.start_index = index;
-
- assert pdf[npr.start_index] == PDFNames.DELIMITER_NAME;
-
- npr.name_start_index = npr.start_index + 1;
-
- assert isRegular(pdf[npr.name_start_index]);
-
- int cur_index = npr.name_start_index;
- while (isRegular(pdf[cur_index]))
- {
- cur_index++;
- }
- assert !isRegular(pdf[cur_index]);
-
- npr.next_index = cur_index;
-
- return npr;
- }
-
- public static DictionaryParseResult parseDictionary(final byte[] pdf,
- final int index)
- {
- DictionaryParseResult dpr = new DictionaryParseResult();
- dpr.start_index = index;
-
- assert ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.DICT_START_STR);
-
- dpr.content_start_index = dpr.start_index + PDFNames.DICT_START_STR.length;
-
- dpr.names = new ArrayList();
- dpr.values = new ArrayList();
-
- int cur_index = skipWhitespace(pdf, dpr.content_start_index);
- for (;;)
- {
- if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.DICT_END_STR))
- {
- break;
- }
-
- NameParseResult npr = parseName(pdf, cur_index);
- dpr.names.add(npr);
-
- cur_index = npr.next_index;
- cur_index = skipWhitespace(pdf, cur_index);
-
- ParseResult pr = parseUnknownObject(pdf, cur_index);
- dpr.values.add(pr);
-
- cur_index = pr.next_index;
- cur_index = skipWhitespace(pdf, cur_index);
- }
-
- dpr.content_end_index = cur_index;
- assert ByteArrayUtils.compareByteArrays(pdf, dpr.content_end_index, PDFNames.DICT_END_STR);
- dpr.next_index = dpr.content_end_index + PDFNames.DICT_END_STR.length;
-
- return dpr;
- }
-
- /**
- * Parses a stream.
- *
- * @param pdf
- * The PDF data.
- * @param index
- * The index.
- * @param dpr
- * The DictionaryParseResult of the stream's dictionary. This
- * dictionary must precede the stream keyword. Usually this is
- * provided in the stream object's dictionary via the /Length field.
- * @return Returns the result of this parsing operation.
- */
- public static StreamParseResult parseStream(final byte[] pdf,
- final int index, final DictionaryParseResult dpr)
- {
- StreamParseResult spr = new StreamParseResult();
- spr.stream_dictionary = dpr;
- spr.start_index = spr.stream_dictionary.start_index;
- spr.stream_start_index = index;
- assert ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.STREAM_STR);
-
- // assert that the provided dictionary really belongs to this stream
- assert spr.stream_start_index == skipWhitespace(pdf, spr.stream_dictionary.next_index);
-
- // see PDF Spec 1.4 chapter 3.2.7
- assert pdf[spr.stream_start_index + PDFNames.STREAM_STR.length] == PDFNames.WHITESPACE_LF || (pdf[spr.stream_start_index + PDFNames.STREAM_STR.length] == PDFNames.WHITESPACE_CR && pdf[spr.stream_start_index + PDFNames.STREAM_STR.length + 1] == PDFNames.WHITESPACE_LF);
- spr.content_start_index = skipNewline(pdf, spr.stream_start_index + PDFNames.STREAM_STR.length);
-
- int length = -1;
- for (int i = 0; i < spr.stream_dictionary.names.size(); i++)
- {
- NameParseResult name = (NameParseResult) spr.stream_dictionary.names.get(i);
- if (ByteArrayUtils.compareByteArrays(pdf, name.name_start_index, PDFNames.LENGTH_STR))
- {
- ParseResult pr = (ParseResult) spr.stream_dictionary.values.get(i);
- NumberParseResult npr = null;
- if (pr instanceof IndirectObjectReferenceParseResult)
- {
- log.debug("An object stream with indirect length - cannot parse this instantly - parse later again.");
- spr.content_end_index = -1;
- spr.next_index = -1;
- return spr;
-
- }
- else
- {
- npr = (NumberParseResult) pr;
- }
- assert npr != null;
-
- length = npr.number;
- break;
- }
-
- }
- assert length >= 0;
-
- spr.content_end_index = spr.content_start_index + length;
-
- int endstr_index = spr.content_end_index;
- if (isNewline(pdf, endstr_index))
- {
- endstr_index = skipWhitespace(pdf, endstr_index);
- }
- assert ByteArrayUtils.compareByteArrays(pdf, endstr_index, PDFNames.ENDSTREAM_STR);
-
- spr.next_index = endstr_index + PDFNames.ENDSTREAM_STR.length;
-
- return spr;
- }
-
- public static NullParseResult parseNull(final byte[] pdf, final int index)
- {
- NullParseResult npr = new NullParseResult();
- npr.start_index = index;
-
- assert ByteArrayUtils.compareByteArrays(pdf, npr.start_index, PDFNames.NULL_STR);
-
- npr.next_index = npr.start_index + PDFNames.NULL_STR.length;
-
- return npr;
- }
-
- public static int getObjectOffsetFromXRefByIndirectObjectReference(
- XRefSectionParseResult xpr, IndirectObjectReference ior)
- {
- Iterator it = xpr.xref_subsections.iterator();
- while (it.hasNext())
- {
- XRefSubSectionParseResult section = (XRefSubSectionParseResult) it.next();
-
- for (int i = 0; i < section.xref_lines.size(); i++)
- {
- if (section.start_obj_number + i == ior.object_number)
- {
- XRefLineParseResult lpr = (XRefLineParseResult) section.xref_lines.get(i);
- return lpr.object_offset;
- }
- }
- }
-
- return -1;
- }
-
- public static HeaderParseResult parseHeader(final byte[] pdf, final int index)
- {
- HeaderParseResult hpr = new HeaderParseResult();
- hpr.start_index = index;
-
- assert pdf[hpr.start_index] == PDFNames.COMMENT;
-
- assert ByteArrayUtils.compareByteArrays(pdf, hpr.start_index + 1, PDFNames.PDF_VERSION_STR);
-
- hpr.major_index = hpr.start_index + 1 + PDFNames.PDF_VERSION_STR.length;
-
- IntegerParseResult major_ipr = parseUnsignedInteger(pdf, hpr.major_index);
- hpr.major = major_ipr.number;
- assert hpr.major >= 1;
-
- assert pdf[major_ipr.next_index] == PDFNames.PDF_VERSION_SEPARATOR;
-
- hpr.minor_index = major_ipr.next_index + 1;
-
- IntegerParseResult minor_ipr = parseUnsignedInteger(pdf, hpr.minor_index);
- hpr.minor = minor_ipr.number;
- assert hpr.minor >= 0;
-
- assert isWhitespace(pdf[minor_ipr.next_index]);
- hpr.binary_characters_index = skipWhitespace(pdf, minor_ipr.next_index);
-
- assert pdf[hpr.binary_characters_index] == PDFNames.COMMENT;
-
- hpr.next_index = skipToNewline(pdf, hpr.binary_characters_index);
- return hpr;
- }
-
- /**
- * Parses a PDF footer.
- *
- * <p>
- * A PDF footer starts with the xref, followed by the trailer, the startxref
- * and the EOF marker.
- * </p>
- *
- * @param pdf
- * The PDF data.
- * @param index
- * The index.
- * @return Returns the result of the parsing operation.
- *
- * @see FooterParseResult
- */
- public static FooterParseResult parseFooter(final byte[] pdf, final int index)
- {
- FooterParseResult fpr = new FooterParseResult();
- fpr.start_index = index;
-
- fpr.xpr = PDFUtils.parseXRefSection(pdf, fpr.start_index);
-
- fpr.tpr = PDFUtils.parseTrailer(pdf, fpr.xpr.next_index);
-
- fpr.sxpr = PDFUtils.parseStartXRef(pdf, fpr.tpr.next_index);
-
- fpr.eofpr = PDFUtils.parseEOF(pdf, fpr.sxpr.next_index);
-
- fpr.next_index = fpr.eofpr.next_index;
- return fpr;
- }
-
-}