From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../wag/exactparser/parsing/PDFUtils.java | 1405 ++++++++++++++++++++ 1 file changed, 1405 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java (limited to 'pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java') diff --git a/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java b/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java new file mode 100644 index 0000000..de356c9 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/knowcenter/wag/exactparser/parsing/PDFUtils.java @@ -0,0 +1,1405 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. 
Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + * + * $Id: PDFUtils.java,v 1.1 2006/08/25 17:00:59 wprinz Exp $ + */ +package at.knowcenter.wag.exactparser.parsing; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.log4j.Logger; + +import at.knowcenter.wag.exactparser.ByteArrayUtils; +import at.knowcenter.wag.exactparser.parsing.results.ArrayParseResult; +import at.knowcenter.wag.exactparser.parsing.results.BooleanParseResult; +import at.knowcenter.wag.exactparser.parsing.results.DictionaryParseResult; +import at.knowcenter.wag.exactparser.parsing.results.EOFParseResult; +import at.knowcenter.wag.exactparser.parsing.results.FooterParseResult; +import at.knowcenter.wag.exactparser.parsing.results.HeaderParseResult; +import at.knowcenter.wag.exactparser.parsing.results.HexStringParseResult; +import at.knowcenter.wag.exactparser.parsing.results.IndirectObjectReferenceParseResult; +import at.knowcenter.wag.exactparser.parsing.results.IntegerParseResult; +import at.knowcenter.wag.exactparser.parsing.results.LiteralStringParseResult; +import at.knowcenter.wag.exactparser.parsing.results.NameParseResult; +import at.knowcenter.wag.exactparser.parsing.results.NullParseResult; +import at.knowcenter.wag.exactparser.parsing.results.NumberParseResult; +import at.knowcenter.wag.exactparser.parsing.results.ObjectHeaderParseResult; +import at.knowcenter.wag.exactparser.parsing.results.ObjectParseResult; +import at.knowcenter.wag.exactparser.parsing.results.ParseResult; +import at.knowcenter.wag.exactparser.parsing.results.StartXRefParseResult; +import at.knowcenter.wag.exactparser.parsing.results.StreamParseResult; +import at.knowcenter.wag.exactparser.parsing.results.TrailerParseResult; +import at.knowcenter.wag.exactparser.parsing.results.XRefLineParseResult; +import at.knowcenter.wag.exactparser.parsing.results.XRefSectionParseResult; +import 
at.knowcenter.wag.exactparser.parsing.results.XRefSubSectionParseResult; + + + +/** + * Abstract class that contains several static utility methods for parsing and + * analyzing PDF documents on the lowest level. + * + *

+ * Most operations require random access to the PDF data (mostly to verify the + * syntax). So the whole PDF document has to be provided as a byte array. The + * term "pdf+index" states a specific position index within this byte array. + *

+ * + * @author wprinz + * + */ +public abstract class PDFUtils +{ + private static Logger log = Logger.getLogger(PDFUtils.class); + + public static boolean isWhitespace(final byte data) + { + return ByteArrayUtils.contains(PDFNames.WHITESPACE_CHARACTERS, data); + } + + public static boolean isDelimiter(final byte data) + { + return ByteArrayUtils.contains(PDFNames.DELIMITER_CHARACTERS, data); + } + + protected static boolean isRegular(final byte data) + { + return !(isWhitespace(data) || isDelimiter(data)); + } + + /** + * Skips whitespace. + * + *

+ * Skips all whitespace, which may be none, one or multiple whitespace + * characters. + *

+ *

+ * Note that this also skips newline characters (which belong to whitespace as + * well). + *

+ * + * @param data + * The PDF data. + * @param index + * The index. + * @return Returns the index of the first non whitespace character. This may + * be equal to index if no whitespaces were skipped at all. + */ + public static int skipWhitespace(final byte[] data, final int index) + { + int non_whitespace_index = index; + while (isWhitespace(data[non_whitespace_index])) + { + non_whitespace_index++; + } + return non_whitespace_index; + } + + /** + * Skips bytes until whitespace is reached. + * + *

+ * Skips all non whitespace characters, which may be none at all. + *

+ * + * @param data + * The PDF data. + * @param index + * The index. + * @return Returns the index of the first whitespace character. This may be + * equal to index if no non whitespaces were skipped at all. + */ + public static int skipToWhitespace(final byte[] data, final int index) + { + int whitespace_index = index; + while (!isWhitespace(data[whitespace_index])) + { + whitespace_index++; + } + return whitespace_index; + } + + protected static final byte[] LINE_TERMINATOR_CRLF = { + PDFNames.WHITESPACE_CR, PDFNames.WHITESPACE_LF }; + + protected static final byte[] LINE_TERMINATOR_CRALONE = { PDFNames.WHITESPACE_CR }; + + protected static final byte[] LINE_TERMINATOR_LF = { PDFNames.WHITESPACE_LF }; + + public static boolean isNewline(final byte[] data, final int index) + { + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_LF)) + { + return true; + } + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRLF)) + { + return true; + } + // although not specified by PDF, some applications use the CR alone as line + // terminator + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRALONE)) + { + return true; + } + return false; + } + + public static int skipNewline(final byte[] data, final int index) + { + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_LF)) + { + return index + LINE_TERMINATOR_LF.length; + } + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRLF)) + { + return index + LINE_TERMINATOR_CRLF.length; + } + // although not specified by PDF, some applications use the CR alone as line + // terminator + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRALONE)) + { + return index + LINE_TERMINATOR_CRALONE.length; + } + + assert false : "don't call this if you don't expect a newline - call skipWhitespace instead"; + return index; + } + + public static int skipToNewline(final byte[] data, final int index) + { + int current_index = index; + for (;;) + { + 
if (ByteArrayUtils.compareByteArrays(data, current_index, LINE_TERMINATOR_LF)) + { + return current_index + LINE_TERMINATOR_LF.length; + } + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRLF)) + { + return index + LINE_TERMINATOR_CRLF.length; + } + // although not specified by PDF, some applications use the CR alone as + // line terminator + if (ByteArrayUtils.compareByteArrays(data, index, LINE_TERMINATOR_CRALONE)) + { + return index + LINE_TERMINATOR_CRALONE.length; + } + current_index++; + } + } + + /** + * Parses a boolean value. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. + */ + public static BooleanParseResult parseBoolean(final byte[] pdf, + final int index) + { + BooleanParseResult bpr = new BooleanParseResult(); + bpr.start_index = index; + + if (ByteArrayUtils.compareByteArrays(pdf, bpr.start_index, PDFNames.TRUE_STR)) + { + bpr.value = true; + bpr.next_index = bpr.start_index + PDFNames.TRUE_STR.length; + + return bpr; + } + if (ByteArrayUtils.compareByteArrays(pdf, bpr.start_index, PDFNames.FALSE_STR)) + { + bpr.value = false; + bpr.next_index = bpr.start_index + PDFNames.FALSE_STR.length; + + return bpr; + } + + throw new RuntimeException("Boolean couldn't be parsed at index " + index); + } + + public static boolean isSign(final byte data) + { + return data == '+' || data == '-'; + } + + public static boolean isNumeric(final byte data) + { + return '0' <= data && data <= '9'; + } + + /** + * Reads the (positive integer) number from the data. The number must be + * terminated by the end of line. + * + * @param data + * The data. + * @param index + * The index. + * @return Returns the read number. + */ + public static int readNumberFromByteArray(final byte[] data, final int index) + { + NumberParseResult npr = parseNumberFromByteArray(data, index); + + assert npr.number >= 0; + return npr.number; + } + + /** + * Parses an unsigned integer. + * + *

+ * The integer must be a block of successive number characters. It must not be + * preceded by a sign (not even '+'). + *

+ * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. + */ + public static IntegerParseResult parseUnsignedInteger(final byte[] pdf, + final int index) + { + assert isNumeric(pdf[index]); + + String number = ""; + + int cur_index = index; + while (isNumeric(pdf[cur_index])) + { + + number += (char) pdf[cur_index]; + + cur_index++; + } + + // TODO: make better + int int_value = Integer.parseInt(number); + + assert int_value >= 0; + + IntegerParseResult ipr = new IntegerParseResult(); + ipr.start_index = index; + ipr.next_index = cur_index; + ipr.number = int_value; + return ipr; + } + + /** + * Parses a (potentially) signed integer. + * + *

+ * The integer must be a block of successive number characters. It may be + * preceded by a sign character ('+' or '-'). + *

+ * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. + */ + public static IntegerParseResult parseInteger(final byte[] pdf, + final int index) + { + assert isSign(pdf[index]) || isNumeric(pdf[index]); + + int sign = +1; + int number_start = index; + if (pdf[index] == '+') + { + sign = +1; + number_start++; + } + else + { + if (pdf[index] == '-') + { + sign = -1; + number_start++; + } + else + { + assert isNumeric(pdf[index]); + } + } + + IntegerParseResult ipr = parseUnsignedInteger(pdf, number_start); + ipr.start_index = index; + ipr.number *= sign; + return ipr; + } + + /** + * Parses an arbitrary number; + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. + */ + public static NumberParseResult parseNumberFromByteArray(final byte[] pdf, + int index) + { + String number = ""; + + assert isSign(pdf[index]) || isNumeric(pdf[index]); + + int sign = +1; + if (pdf[index] == '+') + { + sign = +1; + index++; + } + else + { + if (pdf[index] == '-') + { + sign = -1; + index++; + } + else + { + assert isNumeric(pdf[index]); + } + } + + while (isNumeric(pdf[index]) || pdf[index] == '.') + { + + char digit = (char) pdf[index]; + number += digit; + + index++; + } + + NumberParseResult npr = new NumberParseResult(); + npr.next_index = index; + // TODO: make better + try + { + npr.number = Integer.parseInt(number) * sign; + } + catch (NumberFormatException e) + { + npr.floating = Float.parseFloat(number) * sign; + } + + return npr; + } + + /** + * Searches the last occurrence of the "startxref" entry ... in other words + * starts the search from the end of the document and works reversely. + * + * @param pdf + * The complete PDF file data. + * @return Returns the offset (byte index) of the "startxref" entry. 
+ */ + public static int findLastStartXRef(final byte[] pdf) + { + return ByteArrayUtils.lastIndexOf(pdf, PDFNames.STARTXREF_STR); + } + + /** + * Parses the xref section at pdf+index. + * + *

+ * An xref section starts with 'xref' and contains one or more xref + * sub-sections. + *

+ * + * @param pdf + * The PDF data. + * @param index + * The start index of the xref table. + * @return Returns the result of the parsing operation. + */ + public static XRefSectionParseResult parseXRefSection(final byte[] pdf, + final int index) + { + at.knowcenter.wag.exactparser.parsing.results.XRefSectionParseResult xpr = new XRefSectionParseResult(); + xpr.start_index = index; + + assert ByteArrayUtils.compareByteArrays(pdf, xpr.start_index, PDFNames.XREF_STR); + assert isNewline(pdf, xpr.start_index + PDFNames.XREF_STR.length); + + int cur_index = skipWhitespace(pdf, xpr.start_index + PDFNames.XREF_STR.length); + // skipNewline(pdf, xpr.start_index + PDFNames.XREF_STR.length); + + for (;;) + { + // trailer ends the xref section. + if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.TRAILER_STR)) + { + break; + } + + // no trailer ==> another xref section + + XRefSubSectionParseResult sspr = parseXRefSubSection(pdf, cur_index); + xpr.appendXRefSubSection(sspr); + + cur_index = sspr.next_index; + } + + xpr.next_index = cur_index; + assert ByteArrayUtils.compareByteArrays(pdf, xpr.next_index, PDFNames.TRAILER_STR); + + return xpr; + } + + /** + * Parses a xref sub-section. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. 
+ */ + public static XRefSubSectionParseResult parseXRefSubSection(final byte[] pdf, + final int index) + { + XRefSubSectionParseResult sspr = new XRefSubSectionParseResult(); + sspr.start_index = index; + + NumberParseResult start_obj_num_npr = parseNumberFromByteArray(pdf, sspr.start_index); + sspr.start_obj_number = start_obj_num_npr.number; + assert sspr.start_obj_number >= 0; + + assert isWhitespace(pdf[start_obj_num_npr.next_index]); + int num_obj_index = skipWhitespace(pdf, start_obj_num_npr.next_index); + + NumberParseResult num_obj_npr = parseNumberFromByteArray(pdf, num_obj_index); + sspr.num_objects = num_obj_npr.number; + + // assert isNewline(pdf, num_obj_npr.next_index); + assert isWhitespace(pdf[num_obj_npr.next_index]); + int start_of_line = skipWhitespace(pdf, num_obj_npr.next_index); + // skipNewline(pdf, num_obj_npr.next_index); + + for (int i = 0; i < sspr.num_objects; i++) + { + final int cur_object_number = sspr.start_obj_number + i; + + XRefLineParseResult lpr = parseXrefLine(pdf, start_of_line); + sspr.appendXRefLine(lpr); + + // System.out.println("xref line of object " + (oc.start_obj_number + i) + + // " at " + lpr.start_index + ": " + lpr.object_offset + " " + + // lpr.generation_number + " " + (char) lpr.object_usage); + + if (lpr.object_usage == 'n') + { + // check the line - this simple check may make problems with object + // streams and xref streams + ObjectHeaderParseResult ohpr = parseObjectHeader(pdf, lpr.object_offset); + assert ohpr.object_number == cur_object_number; + assert ohpr.generation_number == lpr.generation_number; + } + + start_of_line = lpr.next_index; + } + + sspr.next_index = start_of_line; + return sspr; + } + + /** + * Parses a single 20 bytes xref line at pdf+index. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. 
+ */ + public static XRefLineParseResult parseXrefLine(final byte[] pdf, + final int index) + { + XRefLineParseResult lpr = new XRefLineParseResult(); + + lpr.start_index = index; + + IntegerParseResult object_offset_ipr = parseUnsignedInteger(pdf, lpr.start_index); + lpr.object_offset = object_offset_ipr.number; + assert lpr.object_offset >= 0; + assert lpr.object_offset < pdf.length; + assert object_offset_ipr.next_index == lpr.start_index + 10; + + assert pdf[object_offset_ipr.next_index] == PDFNames.WHITESPACE_SP; // Standard + // explicitely + // says 1 + // single + // SPACE + int generation_number_index = object_offset_ipr.next_index + 1; + + IntegerParseResult generation_number_ipr = parseUnsignedInteger(pdf, generation_number_index); + lpr.generation_number = generation_number_ipr.number; + assert generation_number_ipr.next_index == lpr.start_index + 16; + + assert pdf[generation_number_ipr.next_index] == PDFNames.WHITESPACE_SP; + int usage_index = generation_number_ipr.next_index + 1; + + lpr.object_usage = pdf[usage_index]; + assert lpr.object_usage == 'n' || lpr.object_usage == 'f'; + + if (pdf[usage_index + 1] == PDFNames.WHITESPACE_SP) + { + assert pdf[usage_index + 2] == PDFNames.WHITESPACE_CR || pdf[usage_index + 2] == PDFNames.WHITESPACE_LF; + } + else + { + assert pdf[usage_index + 1] == PDFNames.WHITESPACE_CR; + assert pdf[usage_index + 2] == PDFNames.WHITESPACE_LF; + } + + lpr.next_index = usage_index + 3; + + assert lpr.next_index == lpr.start_index + 20; + + return lpr; + } + + public static int indexOfName(final byte[] pdf, List names, + byte[] sought) + { + for (int i = 0; i < names.size(); i++) + { + NameParseResult name = (NameParseResult) names.get(i); + if (ByteArrayUtils.compareByteArrays(pdf, name.name_start_index, sought)) + { + return i; + } + } + return -1; + } + + public static TrailerParseResult parseTrailer(final byte[] pdf, + final int index) + { + TrailerParseResult tpr = new TrailerParseResult(); + tpr.start_index = index; + 
tpr.has_predecessor = false; + + assert ByteArrayUtils.compareByteArrays(pdf, tpr.start_index, PDFNames.TRAILER_STR); + + // assert isWhitespace(pdf[tpr.start_index + PDFNames.TRAILER_STR.length]); + tpr.contents_index = skipWhitespace(pdf, tpr.start_index + PDFNames.TRAILER_STR.length); + + int trailer_dict_index = skipWhitespace(pdf, tpr.contents_index); + + assert ByteArrayUtils.compareByteArrays(pdf, trailer_dict_index, PDFNames.DICT_START_STR); + + tpr.dpr = parseDictionary(pdf, trailer_dict_index); + + int cur_index = tpr.dpr.next_index; + + int info_index = indexOfName(pdf, tpr.dpr.names, PDFNames.INFO_STR); + if (info_index >= 0) + { + tpr.info = (IndirectObjectReferenceParseResult) tpr.dpr.values.get(info_index); + } + + int root_index = indexOfName(pdf, tpr.dpr.names, PDFNames.ROOT_STR); + if (root_index >= 0) + { + tpr.root = (IndirectObjectReferenceParseResult) tpr.dpr.values.get(root_index); + } + + tpr.size = ((NumberParseResult) tpr.dpr.values.get(indexOfName(pdf, tpr.dpr.names, PDFNames.SIZE_STR))).number; + + int prev_index = indexOfName(pdf, tpr.dpr.names, PDFNames.PREV_STR); + if (prev_index >= 0) + { + tpr.has_predecessor = true; + tpr.setPrev(((NumberParseResult) tpr.dpr.values.get(prev_index)).number); + } + + // + // int cur_index = skipWhitespace(pdf, trailer_dict_index + + // PDFNames.DICT_START_STR.length); + // for (;;) { + // if (ByteArrayUtils.compareByteArrays(pdf, cur_index, + // PDFNames.DICT_END_STR)) { + // cur_index += PDFNames.DICT_END_STR.length; + // break; + // } + // + // assert pdf[cur_index] == PDFNames.DELIMITER_NAME; + // cur_index++; + // + // if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.INFO_STR)) + // { + // assert isWhitespace(pdf[cur_index + PDFNames.INFO_STR.length]); + // int ir_index = skipWhitespace(pdf, cur_index + PDFNames.INFO_STR.length); + // + // IndirectObjectReferenceParseResult iorpr = + // parseIndirectObjectReference(pdf, ir_index); + // tpr.info = iorpr; + // + // cur_index = 
skipWhitespace(pdf, iorpr.next_index); + // continue; + // } + // + // if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.ROOT_STR)) + // { + // assert isWhitespace(pdf[cur_index + PDFNames.ROOT_STR.length]); + // int ir_index = skipWhitespace(pdf, cur_index + PDFNames.ROOT_STR.length); + // + // IndirectObjectReferenceParseResult iorpr = + // parseIndirectObjectReference(pdf, ir_index); + // tpr.root = iorpr; + // + // cur_index = skipWhitespace(pdf, iorpr.next_index); + // continue; + // } + // + // if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.SIZE_STR)) + // { + // assert isWhitespace(pdf[cur_index + PDFNames.SIZE_STR.length]); + // int size_index = skipWhitespace(pdf, cur_index + + // PDFNames.SIZE_STR.length); + // + // NumberParseResult npr = parseNumberFromByteArray(pdf, size_index); + // tpr.size = npr.number; + // assert tpr.size > 0; + // + // cur_index = skipWhitespace(pdf, npr.next_index); + // continue; + // } + // + // if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.PREV_STR)) + // { + // assert isWhitespace(pdf[cur_index + PDFNames.PREV_STR.length]); + // int prev_index = skipWhitespace(pdf, cur_index + + // PDFNames.PREV_STR.length); + // + // NumberParseResult npr = parseNumberFromByteArray(pdf, prev_index); + // tpr.has_predecessor = true; + // tpr.setPrev(npr.number); + // assert tpr.getPrev() >= 0; + // assert tpr.getPrev() < pdf.length; + // + // assert ByteArrayUtils.compareByteArrays(pdf, tpr.getPrev(), + // PDFNames.XREF_STR); + // + // cur_index = skipWhitespace(pdf, npr.next_index); + // continue; + // } + // + // // unrecognized type + // // skip to next delimiter + // // TODO: this will not work with nested dicts. 
- already deprecated + // while (pdf[cur_index] != PDFNames.DELIMITER_NAME) { + // cur_index++; + // } + // } + + tpr.contents_end_index = cur_index; + tpr.next_index = skipWhitespace(pdf, tpr.contents_end_index); + + assert ByteArrayUtils.compareByteArrays(pdf, tpr.next_index, PDFNames.STARTXREF_STR); + return tpr; + } + + /** + * Parses the startxref section at pdf+index. + * + * @param pdf + * The complete PDF file data. + * @param index + * The index of the startxref section. + * @return Returns the retsult of the parsing operation. + */ + public static StartXRefParseResult parseStartXRef(final byte[] pdf, + final int index) + { + StartXRefParseResult spr = new StartXRefParseResult(); + spr.next_index = index; + + assert ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.STARTXREF_STR); + assert isNewline(pdf, index + PDFNames.STARTXREF_STR.length); + + int index_of_number = skipWhitespace(pdf, index + PDFNames.STARTXREF_STR.length); + // skipNewline(pdf, index + PDFNames.STARTXREF_STR.length); + NumberParseResult npr = parseNumberFromByteArray(pdf, index_of_number); + spr.xref_index = npr.number; + + assert isNewline(pdf, npr.next_index); + spr.next_index = skipWhitespace(pdf, npr.next_index); + // skipNewline(pdf, npr.next_index); + + assert ByteArrayUtils.compareByteArrays(pdf, spr.next_index, PDFNames.EOF_STR); + + assert spr.xref_index >= 0; + assert spr.xref_index < pdf.length; + + // A linearized document sets the startxref value of the first page's footer + // to 0. + if (spr.xref_index != 0) + { + assert ByteArrayUtils.compareByteArrays(pdf, spr.xref_index, PDFNames.XREF_STR); + } + + return spr; + } + + /** + * Parses the End Of File (EOF) marker at pdf+index. + * + * @param pdf + * The PDF data. + * @param index + * The index where to start the parsing. + * @return Returns the result of the parsing operation. 
+ */ + public static EOFParseResult parseEOF(final byte[] pdf, final int index) + { + EOFParseResult eofpr = new EOFParseResult(); + eofpr.start_index = index; + + assert ByteArrayUtils.compareByteArrays(pdf, eofpr.start_index, PDFNames.EOF_STR); + + eofpr.eof_end_index = eofpr.start_index + PDFNames.EOF_STR.length; + + // Note: The EOF marker is not necessarily terminated with a + // newline. + + // perhaps explicitely determine a newline. + + eofpr.next_index = eofpr.eof_end_index; + + return eofpr; + } + + public static boolean isIndirectObjectReference(final byte[] pdf, + final int index) + { + IndirectObjectReferenceParseResult iorpr = new IndirectObjectReferenceParseResult(); + iorpr.ior = new IndirectObjectReference(); + iorpr.start_index = index; + + if (!PDFUtils.isNumeric(pdf[iorpr.start_index])) + { + return false; + } + NumberParseResult object_number_npr = parseNumberFromByteArray(pdf, iorpr.start_index); + iorpr.ior.object_number = object_number_npr.number; + if (iorpr.ior.object_number <= 0) + { + return false; + } + + if (!isWhitespace(pdf[object_number_npr.next_index])) + { + return false; + } + int generation_number_index = skipWhitespace(pdf, object_number_npr.next_index); + + if (!PDFUtils.isNumeric(pdf[generation_number_index])) + { + return false; + } + NumberParseResult generation_number_npr = parseNumberFromByteArray(pdf, generation_number_index); + iorpr.ior.generation_number = generation_number_npr.number; + if (iorpr.ior.generation_number < 0) + { + return false; + } + + if (!isWhitespace(pdf[generation_number_npr.next_index])) + { + return false; + } + int R_index = skipWhitespace(pdf, generation_number_npr.next_index); + + if (!ByteArrayUtils.compareByteArrays(pdf, R_index, PDFNames.REFERENCE_STR)) + { + return false; + } + + iorpr.next_index = R_index + PDFNames.REFERENCE_STR.length; + + return true; + } + + /** + * Parses an indirect object reference. + * + * @param pdf + * The PDF data. + * @param index + * The index. 
+ * @return Returns the result of the parsing operation. + */ + public static IndirectObjectReferenceParseResult parseIndirectObjectReference( + final byte[] pdf, final int index) + { + + assert isIndirectObjectReference(pdf, index); + + IndirectObjectReferenceParseResult iorpr = new IndirectObjectReferenceParseResult(); + iorpr.ior = new IndirectObjectReference(); + iorpr.start_index = index; + + NumberParseResult object_number_npr = parseNumberFromByteArray(pdf, iorpr.start_index); + iorpr.ior.object_number = object_number_npr.number; + assert iorpr.ior.object_number > 0; + + assert isWhitespace(pdf[object_number_npr.next_index]); + int generation_number_index = skipWhitespace(pdf, object_number_npr.next_index); + + NumberParseResult generation_number_npr = parseNumberFromByteArray(pdf, generation_number_index); + iorpr.ior.generation_number = generation_number_npr.number; + assert iorpr.ior.generation_number >= 0; + + assert isWhitespace(pdf[generation_number_npr.next_index]); + int R_index = skipWhitespace(pdf, generation_number_npr.next_index); + + assert ByteArrayUtils.compareByteArrays(pdf, R_index, PDFNames.REFERENCE_STR); + + iorpr.next_index = R_index + PDFNames.REFERENCE_STR.length; + + return iorpr; + } + + /** + * Parses the object header at pdf+index. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. 
+ */ + public static ObjectHeaderParseResult parseObjectHeader(final byte[] pdf, + final int index) + { + ObjectHeaderParseResult ohpr = new ObjectHeaderParseResult(); + + ohpr.start_index = index; + + NumberParseResult object_number_npr = parseNumberFromByteArray(pdf, ohpr.start_index); + ohpr.object_number = object_number_npr.number; + assert ohpr.object_number > 0; + + assert isWhitespace(pdf[object_number_npr.next_index]); + int generation_number_index = skipWhitespace(pdf, object_number_npr.next_index); + + NumberParseResult generation_number_npr = parseNumberFromByteArray(pdf, generation_number_index); + ohpr.generation_number = generation_number_npr.number; + assert ohpr.generation_number >= 0; + + assert isWhitespace(pdf[generation_number_npr.next_index]); + int obj_index = skipWhitespace(pdf, generation_number_npr.next_index); + + assert ByteArrayUtils.compareByteArrays(pdf, obj_index, PDFNames.OBJ_STR); + + // not all pdfwriters make a newline after obj... + // assert isNewline(pdf, obj_index + PDFNames.OBJ_STR.length); + // ohpr.next_index = skipNewline(pdf, obj_index + PDFNames.OBJ_STR.length); + ohpr.next_index = skipWhitespace(pdf, obj_index + PDFNames.OBJ_STR.length); + + return ohpr; + } + + public static ObjectParseResult parseObject(final byte[] pdf, final int index) + { + ObjectParseResult opr = new ObjectParseResult(); + opr.start_index = index; + + opr.header = parseObjectHeader(pdf, opr.start_index); + opr.content_index = opr.header.next_index; + + int cur_index = skipWhitespace(pdf, opr.content_index); + + opr.object = parseUnknownObject(pdf, cur_index); + + cur_index = skipWhitespace(pdf, opr.object.next_index); + + opr.end_of_content_index = cur_index; + assert ByteArrayUtils.compareByteArrays(pdf, opr.end_of_content_index, PDFNames.ENDOBJ_STR); + + cur_index = opr.end_of_content_index + PDFNames.ENDOBJ_STR.length; + + opr.next_index = cur_index; + //assert isNewline(pdf, cur_index); + //opr.next_index = skipNewline(pdf, cur_index); + + 
return opr; + } + + public static ParseResult parseUnknownObject(final byte[] pdf, final int index) + { + if (ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.DICT_START_STR)) + { + DictionaryParseResult dpr = parseDictionary(pdf, index); + + int possible_stream_index = skipWhitespace(pdf, dpr.next_index); + if (ByteArrayUtils.compareByteArrays(pdf, possible_stream_index, PDFNames.STREAM_STR)) + { + return parseStream(pdf, possible_stream_index, dpr); + } + + return dpr; + } + + if (ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.NULL_STR)) + { + return parseNull(pdf, index); + } + + if (ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.TRUE_STR) || ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.FALSE_STR)) + { + return parseBoolean(pdf, index); + } + + final byte first_byte = pdf[index]; + + if (isNumeric(first_byte) || isSign(first_byte)) + { + + // try to parse a Indirect reference first - if this fails, parse a number + if (isIndirectObjectReference(pdf, index)) + { + return parseIndirectObjectReference(pdf, index); + } + + return parseNumberFromByteArray(pdf, index); + } + + ParseResult pr = null; + + switch (first_byte) + { + case PDFNames.DELIMITER_STRING_OPEN: + pr = parseLiteralString(pdf, index); + break; + case PDFNames.DELIMITER_HEXSTRING_OPEN: + pr = parseHexString(pdf, index); + break; + case PDFNames.DELIMITER_ARRAY_OPEN: + pr = parseArray(pdf, index); + break; + case PDFNames.DELIMITER_NAME: + pr = parseName(pdf, index); + break; + default: + throw new RuntimeException("Unknown first_byte " + first_byte + "' when parsing an unknown object at index=" + index + "."); + // assert false : "nyi or invalid char"; + } + assert pr != null; + + return pr; + } + + /** + * Parses a literal string. + * + *

+ * A literal string is a string of ASCII characters enclosed by '(' and ')'. + * Balanced pairs of '(' and ')' are allowed within the string. Unbalanced '(' + * or ')' must be escaped as '\(' or '\)'. + *

+ * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. + */ + public static LiteralStringParseResult parseLiteralString(final byte[] pdf, + final int index) + { + LiteralStringParseResult lspr = new LiteralStringParseResult(); + lspr.start_index = index; + + assert pdf[lspr.start_index] == PDFNames.DELIMITER_STRING_OPEN; + + lspr.content_start_index = lspr.start_index + 1; + + int cur_index = lspr.content_start_index; + int parenthesis_stack = 0; + for (;;) + { + if (pdf[cur_index] == '\\' && (pdf[cur_index + 1] == PDFNames.DELIMITER_STRING_CLOSE || pdf[cur_index + 1] == PDFNames.DELIMITER_STRING_OPEN)) + { + cur_index += 2; + continue; + } + if (pdf[cur_index] == PDFNames.DELIMITER_STRING_OPEN) + { + parenthesis_stack++; + } + if (pdf[cur_index] == PDFNames.DELIMITER_STRING_CLOSE) + { + assert parenthesis_stack >= 0; + + if (parenthesis_stack == 0) + { + break; + } + + assert parenthesis_stack > 0; + parenthesis_stack--; + + } + + cur_index++; + } + + lspr.content_end_index = cur_index; + assert pdf[lspr.content_end_index] == PDFNames.DELIMITER_STRING_CLOSE; + + lspr.next_index = lspr.content_end_index + 1; + + return lspr; + } + + protected static boolean isHex(final byte data) + { + return isNumeric(data) || ('a' <= data && data <= 'f') || ('A' <= data && data <= 'f'); + } + + /** + * Parses a hexadecimal string. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of the parsing operation. 
+ */ + public static HexStringParseResult parseHexString(final byte[] pdf, + final int index) + { + HexStringParseResult hspr = new HexStringParseResult(); + hspr.start_index = index; + + assert pdf[hspr.start_index] == PDFNames.DELIMITER_HEXSTRING_OPEN; + + hspr.content_start_index = hspr.start_index + 1; + + int cur_index = hspr.content_start_index; + while (isHex(pdf[cur_index]) || isWhitespace(pdf[cur_index])) + { + cur_index++; + } + + hspr.content_end_index = cur_index; + assert pdf[hspr.content_end_index] == PDFNames.DELIMITER_HEXSTRING_CLOSE; + + hspr.next_index = hspr.content_end_index + 1; + + return hspr; + } + + public static ArrayParseResult parseArray(final byte[] pdf, final int index) + { + ArrayParseResult apr = new ArrayParseResult(); + apr.start_index = index; + assert pdf[apr.start_index] == PDFNames.DELIMITER_ARRAY_OPEN; + + apr.content_start_index = apr.start_index + 1; + + apr.elements = new ArrayList(); + + int cur_index = skipWhitespace(pdf, apr.content_start_index); + for (;;) + { + if (pdf[cur_index] == PDFNames.DELIMITER_ARRAY_CLOSE) + { + break; + } + + ParseResult pr = parseUnknownObject(pdf, cur_index); + apr.elements.add(pr); + + cur_index = skipWhitespace(pdf, pr.next_index); + } + assert pdf[cur_index] == PDFNames.DELIMITER_ARRAY_CLOSE; + + apr.content_end_index = cur_index; + assert pdf[apr.content_end_index] == PDFNames.DELIMITER_ARRAY_CLOSE; + + apr.next_index = apr.content_end_index + 1; + return apr; + } + + /** + * Parses a PDF Name. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @return Returns the result of this parsing operation. 
+ */ + public static NameParseResult parseName(final byte[] pdf, final int index) + { + NameParseResult npr = new NameParseResult(); + npr.start_index = index; + + assert pdf[npr.start_index] == PDFNames.DELIMITER_NAME; + + npr.name_start_index = npr.start_index + 1; + + assert isRegular(pdf[npr.name_start_index]); + + int cur_index = npr.name_start_index; + while (isRegular(pdf[cur_index])) + { + cur_index++; + } + assert !isRegular(pdf[cur_index]); + + npr.next_index = cur_index; + + return npr; + } + + public static DictionaryParseResult parseDictionary(final byte[] pdf, + final int index) + { + DictionaryParseResult dpr = new DictionaryParseResult(); + dpr.start_index = index; + + assert ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.DICT_START_STR); + + dpr.content_start_index = dpr.start_index + PDFNames.DICT_START_STR.length; + + dpr.names = new ArrayList(); + dpr.values = new ArrayList(); + + int cur_index = skipWhitespace(pdf, dpr.content_start_index); + for (;;) + { + if (ByteArrayUtils.compareByteArrays(pdf, cur_index, PDFNames.DICT_END_STR)) + { + break; + } + + NameParseResult npr = parseName(pdf, cur_index); + dpr.names.add(npr); + + cur_index = npr.next_index; + cur_index = skipWhitespace(pdf, cur_index); + + ParseResult pr = parseUnknownObject(pdf, cur_index); + dpr.values.add(pr); + + cur_index = pr.next_index; + cur_index = skipWhitespace(pdf, cur_index); + } + + dpr.content_end_index = cur_index; + assert ByteArrayUtils.compareByteArrays(pdf, dpr.content_end_index, PDFNames.DICT_END_STR); + dpr.next_index = dpr.content_end_index + PDFNames.DICT_END_STR.length; + + return dpr; + } + + /** + * Parses a stream. + * + * @param pdf + * The PDF data. + * @param index + * The index. + * @param dpr + * The DictionaryParseResult of the stream's dictionary. This + * dictionary must precede the stream keyword. Usually this is + * provided in the stream object's dictionary via the /Length field. 
+ * @return Returns the result of this parsing operation. + */ + public static StreamParseResult parseStream(final byte[] pdf, + final int index, final DictionaryParseResult dpr) + { + StreamParseResult spr = new StreamParseResult(); + spr.stream_dictionary = dpr; + spr.start_index = spr.stream_dictionary.start_index; + spr.stream_start_index = index; + assert ByteArrayUtils.compareByteArrays(pdf, index, PDFNames.STREAM_STR); + + // assert that the provided dictionary really belongs to this stream + assert spr.stream_start_index == skipWhitespace(pdf, spr.stream_dictionary.next_index); + + // see PDF Spec 1.4 chapter 3.2.7 + assert pdf[spr.stream_start_index + PDFNames.STREAM_STR.length] == PDFNames.WHITESPACE_LF || (pdf[spr.stream_start_index + PDFNames.STREAM_STR.length] == PDFNames.WHITESPACE_CR && pdf[spr.stream_start_index + PDFNames.STREAM_STR.length + 1] == PDFNames.WHITESPACE_LF); + spr.content_start_index = skipNewline(pdf, spr.stream_start_index + PDFNames.STREAM_STR.length); + + int length = -1; + for (int i = 0; i < spr.stream_dictionary.names.size(); i++) + { + NameParseResult name = (NameParseResult) spr.stream_dictionary.names.get(i); + if (ByteArrayUtils.compareByteArrays(pdf, name.name_start_index, PDFNames.LENGTH_STR)) + { + ParseResult pr = (ParseResult) spr.stream_dictionary.values.get(i); + NumberParseResult npr = null; + if (pr instanceof IndirectObjectReferenceParseResult) + { + log.debug("An object stream with indirect length - cannot parse this instantly - parse later again."); + spr.content_end_index = -1; + spr.next_index = -1; + return spr; + + } + else + { + npr = (NumberParseResult) pr; + } + assert npr != null; + + length = npr.number; + break; + } + + } + assert length >= 0; + + spr.content_end_index = spr.content_start_index + length; + + int endstr_index = spr.content_end_index; + if (isNewline(pdf, endstr_index)) + { + endstr_index = skipWhitespace(pdf, endstr_index); + } + assert ByteArrayUtils.compareByteArrays(pdf, 
endstr_index, PDFNames.ENDSTREAM_STR); + + spr.next_index = endstr_index + PDFNames.ENDSTREAM_STR.length; + + return spr; + } + + public static NullParseResult parseNull(final byte[] pdf, final int index) + { + NullParseResult npr = new NullParseResult(); + npr.start_index = index; + + assert ByteArrayUtils.compareByteArrays(pdf, npr.start_index, PDFNames.NULL_STR); + + npr.next_index = npr.start_index + PDFNames.NULL_STR.length; + + return npr; + } + + public static int getObjectOffsetFromXRefByIndirectObjectReference( + XRefSectionParseResult xpr, IndirectObjectReference ior) + { + Iterator it = xpr.xref_subsections.iterator(); + while (it.hasNext()) + { + XRefSubSectionParseResult section = (XRefSubSectionParseResult) it.next(); + + for (int i = 0; i < section.xref_lines.size(); i++) + { + if (section.start_obj_number + i == ior.object_number) + { + XRefLineParseResult lpr = (XRefLineParseResult) section.xref_lines.get(i); + return lpr.object_offset; + } + } + } + + return -1; + } + + public static HeaderParseResult parseHeader(final byte[] pdf, final int index) + { + HeaderParseResult hpr = new HeaderParseResult(); + hpr.start_index = index; + + assert pdf[hpr.start_index] == PDFNames.COMMENT; + + assert ByteArrayUtils.compareByteArrays(pdf, hpr.start_index + 1, PDFNames.PDF_VERSION_STR); + + hpr.major_index = hpr.start_index + 1 + PDFNames.PDF_VERSION_STR.length; + + IntegerParseResult major_ipr = parseUnsignedInteger(pdf, hpr.major_index); + hpr.major = major_ipr.number; + assert hpr.major >= 1; + + assert pdf[major_ipr.next_index] == PDFNames.PDF_VERSION_SEPARATOR; + + hpr.minor_index = major_ipr.next_index + 1; + + IntegerParseResult minor_ipr = parseUnsignedInteger(pdf, hpr.minor_index); + hpr.minor = minor_ipr.number; + assert hpr.minor >= 0; + + assert isWhitespace(pdf[minor_ipr.next_index]); + hpr.binary_characters_index = skipWhitespace(pdf, minor_ipr.next_index); + + assert pdf[hpr.binary_characters_index] == PDFNames.COMMENT; + + hpr.next_index = 
skipToNewline(pdf, hpr.binary_characters_index); + return hpr; + } + + /** + * Parses a PDF footer. + * + *

+ * A PDF footer starts with the xref, followed by the trailer, the startxref + * and the EOF marker. + *

*
   * @param pdf
   *          The PDF data.
   * @param index
   *          The index at which the xref section of the footer starts.
   * @return Returns the result of the parsing operation. next_index is the
   *         next_index of the parsed EOF marker.
   * 
   * @see FooterParseResult
   */
  public static FooterParseResult parseFooter(final byte[] pdf, final int index)
  {
    final FooterParseResult footer = new FooterParseResult();
    footer.start_index = index;

    // the four parts follow each other: each one is parsed from the position
    // at which the previous one ended
    footer.xpr = PDFUtils.parseXRefSection(pdf, footer.start_index);
    footer.tpr = PDFUtils.parseTrailer(pdf, footer.xpr.next_index);
    footer.sxpr = PDFUtils.parseStartXRef(pdf, footer.tpr.next_index);
    footer.eofpr = PDFUtils.parseEOF(pdf, footer.sxpr.next_index);

    footer.next_index = footer.eofpr.next_index;
    return footer;
  }

}
-- cgit v1.2.3