at.knowcenter.wag.exactparser.parsing
Class PDFUtils

java.lang.Object
  extended by at.knowcenter.wag.exactparser.parsing.PDFUtils

public abstract class PDFUtils
extends Object

Abstract class that contains several static utility methods for parsing and analyzing PDF documents on the lowest level.

Most operations require random access to the PDF data (mostly to verify the syntax), so the whole PDF document has to be provided as a byte array. The term "pdf+index" denotes a specific position index within this byte array.

Author:
wprinz

Field Summary
protected static byte[] LINE_TERMINATOR_CRALONE
           
protected static byte[] LINE_TERMINATOR_CRLF
           
protected static byte[] LINE_TERMINATOR_LF
           
 
Constructor Summary
PDFUtils()
           
 
Method Summary
static int findLastStartXRef(byte[] pdf)
          Searches for the last occurrence of the "startxref" entry, i.e. the search starts at the end of the document and works backwards.
static int getObjectOffsetFromXRefByIndirectObjectReference(XRefSectionParseResult xpr, IndirectObjectReference ior)
           
static int indexOfName(byte[] pdf, List names, byte[] sought)
           
static boolean isDelimiter(byte data)
           
protected static boolean isHex(byte data)
           
static boolean isIndirectObjectReference(byte[] pdf, int index)
           
static boolean isNewline(byte[] data, int index)
           
static boolean isNumeric(byte data)
           
protected static boolean isRegular(byte data)
           
static boolean isSign(byte data)
           
static boolean isWhitespace(byte data)
           
static ArrayParseResult parseArray(byte[] pdf, int index)
           
static BooleanParseResult parseBoolean(byte[] pdf, int index)
          Parses a boolean value.
static DictionaryParseResult parseDictionary(byte[] pdf, int index)
           
static EOFParseResult parseEOF(byte[] pdf, int index)
          Parses the End Of File (EOF) marker at pdf+index.
static FooterParseResult parseFooter(byte[] pdf, int index)
          Parses a PDF footer.
static HeaderParseResult parseHeader(byte[] pdf, int index)
           
static HexStringParseResult parseHexString(byte[] pdf, int index)
          Parses a hexadecimal string.
static IndirectObjectReferenceParseResult parseIndirectObjectReference(byte[] pdf, int index)
          Parses an indirect object reference.
static IntegerParseResult parseInteger(byte[] pdf, int index)
          Parses a (potentially) signed integer.
static LiteralStringParseResult parseLiteralString(byte[] pdf, int index)
          Parses a literal string.
static NameParseResult parseName(byte[] pdf, int index)
          Parses a PDF Name.
static NullParseResult parseNull(byte[] pdf, int index)
           
static NumberParseResult parseNumberFromByteArray(byte[] pdf, int index)
          Parses an arbitrary number.
static ObjectParseResult parseObject(byte[] pdf, int index)
           
static ObjectHeaderParseResult parseObjectHeader(byte[] pdf, int index)
          Parses the object header at pdf+index.
static StartXRefParseResult parseStartXRef(byte[] pdf, int index)
          Parses the startxref section at pdf+index.
static StreamParseResult parseStream(byte[] pdf, int index, DictionaryParseResult dpr)
          Parses a stream.
static TrailerParseResult parseTrailer(byte[] pdf, int index)
           
static ParseResult parseUnknownObject(byte[] pdf, int index)
           
static IntegerParseResult parseUnsignedInteger(byte[] pdf, int index)
          Parses an unsigned integer.
static XRefLineParseResult parseXrefLine(byte[] pdf, int index)
          Parses a single 20-byte xref line at pdf+index.
static XRefSectionParseResult parseXRefSection(byte[] pdf, int index)
          Parses the xref section at pdf+index.
static XRefSubSectionParseResult parseXRefSubSection(byte[] pdf, int index)
          Parses an xref sub-section.
static int readNumberFromByteArray(byte[] data, int index)
          Reads the (positive integer) number from the data.
static int skipNewline(byte[] data, int index)
           
static int skipToNewline(byte[] data, int index)
           
static int skipToWhitespace(byte[] data, int index)
          Skips bytes until whitespace is reached.
static int skipWhitespace(byte[] data, int index)
          Skips whitespace.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

LINE_TERMINATOR_CRLF

protected static final byte[] LINE_TERMINATOR_CRLF

LINE_TERMINATOR_CRALONE

protected static final byte[] LINE_TERMINATOR_CRALONE

LINE_TERMINATOR_LF

protected static final byte[] LINE_TERMINATOR_LF
Constructor Detail

PDFUtils

public PDFUtils()
Method Detail

isWhitespace

public static boolean isWhitespace(byte data)

isDelimiter

public static boolean isDelimiter(byte data)

isRegular

protected static boolean isRegular(byte data)

skipWhitespace

public static int skipWhitespace(byte[] data,
                                 int index)
Skips whitespace.

Skips all whitespace, which may be none, one or multiple whitespace characters.

Note that this also skips newline characters (which belong to whitespace as well).

Parameters:
data - The PDF data.
index - The index.
Returns:
Returns the index of the first non-whitespace character. This may be equal to index if no whitespace was skipped at all.

skipToWhitespace

public static int skipToWhitespace(byte[] data,
                                   int index)
Skips bytes until whitespace is reached.

Skips all non-whitespace characters, which may be none at all.

Parameters:
data - The PDF data.
index - The index.
Returns:
Returns the index of the first whitespace character. This may be equal to index if no non-whitespace characters were skipped at all.

isNewline

public static boolean isNewline(byte[] data,
                                int index)

skipNewline

public static int skipNewline(byte[] data,
                              int index)

skipToNewline

public static int skipToNewline(byte[] data,
                                int index)

parseBoolean

public static BooleanParseResult parseBoolean(byte[] pdf,
                                              int index)
Parses a boolean value.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

isSign

public static boolean isSign(byte data)

isNumeric

public static boolean isNumeric(byte data)

readNumberFromByteArray

public static int readNumberFromByteArray(byte[] data,
                                          int index)
Reads the (positive integer) number from the data. The number must be terminated by the end of line.

Parameters:
data - The data.
index - The index.
Returns:
Returns the read number.

parseUnsignedInteger

public static IntegerParseResult parseUnsignedInteger(byte[] pdf,
                                                      int index)
Parses an unsigned integer.

The integer must be a block of successive number characters. It must not be preceded by a sign (not even '+').

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

parseInteger

public static IntegerParseResult parseInteger(byte[] pdf,
                                              int index)
Parses a (potentially) signed integer.

The integer must be a block of successive number characters. It may be preceded by a sign character ('+' or '-').

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

parseNumberFromByteArray

public static NumberParseResult parseNumberFromByteArray(byte[] pdf,
                                                         int index)
Parses an arbitrary number.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

findLastStartXRef

public static int findLastStartXRef(byte[] pdf)
Searches for the last occurrence of the "startxref" entry, i.e. the search starts at the end of the document and works backwards.

Parameters:
pdf - The complete PDF file data.
Returns:
Returns the offset (byte index) of the "startxref" entry.

parseXRefSection

public static XRefSectionParseResult parseXRefSection(byte[] pdf,
                                                      int index)
Parses the xref section at pdf+index.

An xref section starts with 'xref' and contains one or more xref sub-sections.

Parameters:
pdf - The PDF data.
index - The start index of the xref table.
Returns:
Returns the result of the parsing operation.

parseXRefSubSection

public static XRefSubSectionParseResult parseXRefSubSection(byte[] pdf,
                                                            int index)
Parses an xref sub-section.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

parseXrefLine

public static XRefLineParseResult parseXrefLine(byte[] pdf,
                                                int index)
Parses a single 20-byte xref line at pdf+index.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

indexOfName

public static int indexOfName(byte[] pdf,
                              List names,
                              byte[] sought)

parseTrailer

public static TrailerParseResult parseTrailer(byte[] pdf,
                                              int index)

parseStartXRef

public static StartXRefParseResult parseStartXRef(byte[] pdf,
                                                  int index)
Parses the startxref section at pdf+index.

Parameters:
pdf - The complete PDF file data.
index - The index of the startxref section.
Returns:
Returns the result of the parsing operation.

parseEOF

public static EOFParseResult parseEOF(byte[] pdf,
                                      int index)
Parses the End Of File (EOF) marker at pdf+index.

Parameters:
pdf - The PDF data.
index - The index where to start the parsing.
Returns:
Returns the result of the parsing operation.

isIndirectObjectReference

public static boolean isIndirectObjectReference(byte[] pdf,
                                                int index)

parseIndirectObjectReference

public static IndirectObjectReferenceParseResult parseIndirectObjectReference(byte[] pdf,
                                                                              int index)
Parses an indirect object reference.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

parseObjectHeader

public static ObjectHeaderParseResult parseObjectHeader(byte[] pdf,
                                                        int index)
Parses the object header at pdf+index.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

parseObject

public static ObjectParseResult parseObject(byte[] pdf,
                                            int index)

parseUnknownObject

public static ParseResult parseUnknownObject(byte[] pdf,
                                             int index)

parseLiteralString

public static LiteralStringParseResult parseLiteralString(byte[] pdf,
                                                          int index)
Parses a literal string.

A literal string is a string of ASCII characters enclosed by '(' and ')'. Balanced pairs of '(' and ')' are allowed within the string. Unbalanced '(' or ')' must be escaped as '\(' or '\)'.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

isHex

protected static boolean isHex(byte data)

parseHexString

public static HexStringParseResult parseHexString(byte[] pdf,
                                                  int index)
Parses a hexadecimal string.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.

parseArray

public static ArrayParseResult parseArray(byte[] pdf,
                                          int index)

parseName

public static NameParseResult parseName(byte[] pdf,
                                        int index)
Parses a PDF Name.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of this parsing operation.

parseDictionary

public static DictionaryParseResult parseDictionary(byte[] pdf,
                                                    int index)

parseStream

public static StreamParseResult parseStream(byte[] pdf,
                                            int index,
                                            DictionaryParseResult dpr)
Parses a stream.

Parameters:
pdf - The PDF data.
index - The index.
dpr - The DictionaryParseResult of the stream's dictionary. This dictionary must precede the stream keyword. Usually this is provided in the stream object's dictionary via the /Length field.
Returns:
Returns the result of this parsing operation.

parseNull

public static NullParseResult parseNull(byte[] pdf,
                                        int index)

getObjectOffsetFromXRefByIndirectObjectReference

public static int getObjectOffsetFromXRefByIndirectObjectReference(XRefSectionParseResult xpr,
                                                                   IndirectObjectReference ior)

parseHeader

public static HeaderParseResult parseHeader(byte[] pdf,
                                            int index)

parseFooter

public static FooterParseResult parseFooter(byte[] pdf,
                                            int index)
Parses a PDF footer.

A PDF footer starts with the xref, followed by the trailer, the startxref and the EOF marker.

Parameters:
pdf - The PDF data.
index - The index.
Returns:
Returns the result of the parsing operation.
See Also:
FooterParseResult


Copyright © 2006-2007 EGIZ - E-Government Innovationszentrum. All Rights Reserved.