aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/at/gv/egiz/pdfas/impl/input
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/at/gv/egiz/pdfas/impl/input')
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java56
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java47
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java105
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java44
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java20
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java103
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java124
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java49
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java82
-rw-r--r--src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java92
10 files changed, 722 insertions, 0 deletions
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
new file mode 100644
index 0000000..0d27781
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
@@ -0,0 +1,56 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * Implements a PdfDataSource that holds the whole PDF document in a byte array.
+ *
+ * <p>
+ * Note that holding the data in a byte array is very memory consuming for large
+ * documents.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class ByteArrayPdfDataSourceImpl implements PdfDataSource
+{
+ protected byte[] pdf = null;
+
+ protected int length = -1;
+
+ public ByteArrayPdfDataSourceImpl(byte[] pdf)
+ {
+ PerformanceCounters.byteArrays.increment();
+
+ this.pdf = pdf;
+ this.length = pdf.length;
+ }
+
+ public ByteArrayPdfDataSourceImpl(byte[] pdf, int length)
+ {
+ PerformanceCounters.byteArrays.increment();
+
+ this.pdf = pdf;
+ this.length = length;
+ }
+
+
+ public InputStream createInputStream()
+ {
+ ByteArrayInputStream bais = new ByteArrayInputStream(this.pdf, 0, this.length);
+ return bais;
+ }
+
+ public int getLength()
+ {
+ return this.length;
+ }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
new file mode 100644
index 0000000..f77d6be
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
@@ -0,0 +1,47 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class CompoundPdfDataSourceImpl implements PdfDataSource
+{
+ protected DataSource originalDataSource = null;
+
+ protected byte[] appendix = null;
+
+ public CompoundPdfDataSourceImpl (PdfDataSource original, byte [] appendix)
+ {
+ this.originalDataSource = original;
+ this.appendix = appendix;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+ */
+ public InputStream createInputStream()
+ {
+ ByteArrayInputStream bais = new ByteArrayInputStream(this.appendix);
+ SequenceInputStream sis = new SequenceInputStream(this.originalDataSource.createInputStream(), bais);
+ return sis;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+ */
+ public int getLength()
+ {
+ return this.originalDataSource.getLength() + this.appendix.length;
+ }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
new file mode 100644
index 0000000..4be9ec5
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
@@ -0,0 +1,105 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * An input stream that has a delimited length.
+ *
+ * @author wprinz
+ */
+public class DelimitedInputStream extends InputStream
+{
+ /**
+ * The underlying InputStream.
+ */
+ protected InputStream is = null;
+
+ /**
+ * The number of bytes that can be read from the stream.
+ */
+ protected int bytes_to_read = -1;
+
+ /**
+ * Constructs the DelimitedInputStream from which a maximum of length bytes
+ * can be read.
+ */
+ public DelimitedInputStream(InputStream is, int length)
+ {
+ this.is = is;
+ this.bytes_to_read = length;
+ }
+
+ /**
+ * @see java.io.InputStream#read()
+ */
+ public int read() throws IOException
+ {
+ if (this.bytes_to_read <= 0)
+ {
+ return -1;
+ }
+ int read = this.is.read();
+ if (read > 0)
+ {
+ this.bytes_to_read--;
+ }
+ return read;
+ }
+
+ /**
+ * @see java.io.InputStream#read(byte[], int, int)
+ */
+ public int read(byte[] b, int off, int len) throws IOException
+ {
+ int btr = Math.min(len, this.bytes_to_read);
+ int read = this.is.read(b, off, btr);
+ if (read > 0)
+ {
+ this.bytes_to_read -= read;
+ }
+ return read;
+ }
+
+ /**
+ * @see java.io.InputStream#read(byte[])
+ */
+ public int read(byte[] b) throws IOException
+ {
+ return read(b, 0, b.length);
+ }
+
+ /**
+ * @see java.io.InputStream#skip(long)
+ */
+ public long skip(long n) throws IOException
+ {
+ long bts = Math.min(n, this.bytes_to_read);
+ long skipped = this.is.skip(bts);
+ if (skipped > 0)
+ {
+ this.bytes_to_read -= skipped;
+ }
+ return skipped;
+ }
+
+ /**
+ * @see java.io.InputStream#close()
+ */
+ public void close() throws IOException
+ {
+ this.is.close();
+ }
+
+ /**
+ * @see java.io.InputStream#available()
+ */
+ public int available() throws IOException
+ {
+ int avail = this.is.available();
+ return Math.min(this.bytes_to_read, avail);
+ }
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
new file mode 100644
index 0000000..6c67be2
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
@@ -0,0 +1,44 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class DelimitedPdfDataSource implements PdfDataSource
+{
+
+ protected PdfDataSource dataSource = null;
+ protected int len = -1;
+
+ public DelimitedPdfDataSource (PdfDataSource original, int length)
+ {
+ this.dataSource = original;
+ this.len = length;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+ */
+ public InputStream createInputStream()
+ {
+ InputStream originalIS = this.dataSource.createInputStream();
+ DelimitedInputStream dis = new DelimitedInputStream(originalIS, this.len);
+ return dis;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+ */
+ public int getLength()
+ {
+ return this.len;
+ }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
new file mode 100644
index 0000000..54f8842
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
@@ -0,0 +1,20 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+
+/**
+ * Interface that reveals the underlying data file.
+ *
+ * @author wprinz
+ */
+public interface FileBased
+{
+ /**
+ * Returns the underlying data file.
+ * @return Returns the underlying data file.
+ */
+ public File getFile();
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
new file mode 100644
index 0000000..8453192
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
@@ -0,0 +1,103 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class FileBasedPdfDataSourceImpl implements PdfDataSource, FileBased
+{
+ /**
+ * The log.
+ */
+ private static final Log log = LogFactory.getLog(FileBasedPdfDataSourceImpl.class);
+
+ /**
+ * The underlying file.
+ */
+ protected File inputFile = null;
+
+ protected int length = -1;
+
+ /**
+ * Constructor that creates this PdfDataSource backed by a file in the file
+ * system.
+ *
+ * @param file
+ * The input File.
+ * @param length
+ * The length of the InputStream. The is the maximum number of bytes
+ * that can be read from the stream.
+ * @throws IOException
+ * Thrown if the file cannot be read properly.
+ */
+ public FileBasedPdfDataSourceImpl(File file, int length) throws IOException
+ {
+
+ if (!file.exists())
+ {
+ throw new FileNotFoundException("The file '" + file + "' does not exist.");
+ }
+ // for some reason the isFile is not always correct...
+ // if (file.isFile())
+ // {
+ // throw new IOException("The file '" + file + "' is not a normal file.");
+ // }
+ if (!file.canRead())
+ {
+ throw new IOException("The file '" + file + "' cannot be read.");
+ }
+
+ this.inputFile = file;
+ this.length = length;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile()
+ */
+ public File getFile()
+ {
+ return this.inputFile;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#createInputStream()
+ */
+ public InputStream createInputStream()
+ {
+ try
+ {
+ FileInputStream fis = new FileInputStream(getFile());
+ DelimitedInputStream dis = new DelimitedInputStream(fis, getLength());
+ return dis;
+ }
+ catch (IOException e)
+ {
+ log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.");
+ log.error(e);
+
+ return null;
+ }
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#getLength()
+ */
+ public int getLength()
+ {
+ return this.length;
+ }
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
new file mode 100644
index 0000000..6f6c7b4
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
@@ -0,0 +1,124 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.framework.input.TextDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class FileBasedTextDataSourceImpl implements TextDataSource, FileBased
+{
+ /**
+ * The log.
+ */
+ private static final Log log = LogFactory.getLog(FileBasedTextDataSourceImpl.class);
+
+ protected File file = null;
+
+ protected String characterEncoding = null;
+
+ public FileBasedTextDataSourceImpl(File file, String characterEncoding) throws IOException
+ {
+ if (!file.exists())
+ {
+ throw new FileNotFoundException("The file '" + file + "' does not exist.");
+ }
+ if (!file.canRead())
+ {
+ throw new IOException("The file '" + file + "' cannot be read.");
+ }
+
+ this.file = file;
+ this.characterEncoding = characterEncoding;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile()
+ */
+ public File getFile()
+ {
+ return this.file;
+ }
+
+ /**
+ * Returns the character encoding.
+ *
+ * @return Returns the character encoding.
+ */
+ public String getCharacterEncoding()
+ {
+ return this.characterEncoding;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText()
+ */
+ public String getText()
+ {
+ try
+ {
+ InputStream is = createInputStream();
+ byte[] data = new byte[getLength()];
+ int read = 0;
+ int n = 0;
+ while ((n = is.read(data, read, data.length - read)) > 0)
+ {
+ read += n;
+ }
+ is.close();
+
+ String text = new String(data, getCharacterEncoding());
+
+ data = null;
+
+ return text;
+ }
+ catch (IOException e)
+ {
+ log.error("Couldn't read text for file " + getFile() + ". Returning null.");
+ log.error(e);
+
+ return null;
+ }
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+ */
+ public InputStream createInputStream()
+ {
+ try
+ {
+ FileInputStream fis = new FileInputStream(getFile());
+ return fis;
+ }
+ catch (IOException e)
+ {
+ log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.");
+ log.error(e);
+
+ return null;
+ }
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+ */
+ public int getLength()
+ {
+ return (int) getFile().length();
+ }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java b/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
new file mode 100644
index 0000000..b4c2bef
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
@@ -0,0 +1,49 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.util.List;
+
+import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper;
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+import at.knowcenter.wag.exactparser.ParseDocument;
+
+/**
+ * Parses the given PDF document into a list of Incremental Update blocks.
+ * @author wprinz
+ */
+public class IncrementalUpdateParser
+{
+ /**
+ * The log.
+ */
+ private static final Log log = LogFactory.getLog(IncrementalUpdateParser.class);
+
+ public static List parsePdfIntoIUBlocks (PdfDataSource pdfDataSource) throws PDFDocumentException
+ {
+ log.trace("parsePdfIntoIUBlocks:");
+
+ List blocks = null;
+ try
+ {
+ byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(pdfDataSource);
+ blocks = ParseDocument.parseDocument(pdf);
+ }
+ catch (Exception e)
+ {
+ log.error("Error while parsing Document into IU blocks.", e);
+ throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+ }
+
+ log.trace("parsePdfIntoIUBlocks finished.");
+ return blocks;
+ }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
new file mode 100644
index 0000000..b259a3e
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
@@ -0,0 +1,82 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import at.gv.egiz.pdfas.framework.input.TextDataSource;
+
+/**
+ * A TextDataSource that keeps the text in memory.
+ *
+ * <p>
+ * Keeping the text in memory is fast as long as the text is short, but may
+ * result in bad memory performance when the text is longer. Use a FileBased
+ * TextDataSource instead if memory is an issue.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class TextDataSourceImpl implements TextDataSource
+{
+ /**
+ * The text.
+ */
+ protected String text = null;
+
+ /**
+ * Constructor that sets the text.
+ *
+ * @param text
+ * The text.
+ */
+ public TextDataSourceImpl(String text)
+ {
+ this.text = text;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText()
+ */
+ public String getText()
+ {
+ return this.text;
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+ */
+ public InputStream createInputStream()
+ {
+ try
+ {
+ byte[] data = getText().getBytes("UTF-8");
+ // PERF: if memory is an issue (e.g. in web), use a FileBased TextDataSource instead.
+ return new ByteArrayInputStream(data);
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+ */
+ public int getLength()
+ {
+ try
+ {
+ byte[] data = getText().getBytes("UTF-8");
+ return data.length;
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java b/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java
new file mode 100644
index 0000000..1e2ffdc
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java
@@ -0,0 +1,92 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input.helper;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * @author wprinz
+ *
+ */
+public class DataSourceHelper
+{
+ /**
+ * The log.
+ */
+ private static final Log log = LogFactory.getLog(DataSourceHelper.class);
+
+ /**
+ * Converts a PdfDataSource to a byte array.
+ *
+ * <p>
+ * Note that this function is very memory intensive. Use the Streams whereever
+ * possible.
+ * </p>
+ *
+ * @deprecated
+ *
+ * @param pdfDataSource
+ * @return
+ * @throws IOException
+ */
+ public static byte[] convertDataSourceToByteArray(DataSource pdfDataSource)
+ {
+ try
+ {
+ PerformanceCounters.byteArrays.increment();
+
+ byte[] data = new byte[pdfDataSource.getLength()];
+
+ int bytes_written = 0;
+
+ InputStream is = pdfDataSource.createInputStream();
+ int n = 0;
+ while ((n = is.read(data, bytes_written, data.length - bytes_written)) > 0)
+ {
+ bytes_written += n;
+ }
+ is.close();
+
+ assert bytes_written == data.length;
+
+ return data;
+ }
+ catch (IOException e)
+ {
+ log.error(e);
+ throw new RuntimeException(e);
+ }
+ }
+
+ public static void debugDataSourceToFile(DataSource dataSource, File file)
+ {
+ try
+ {
+ InputStream is = dataSource.createInputStream();
+ FileOutputStream fos = new FileOutputStream(file);
+ byte[] data = new byte[2048];
+ int n = -1;
+ while ((n = is.read(data)) > 0)
+ {
+ fos.write(data, 0, n);
+ }
+ is.close();
+ fos.close();
+ }
+ catch (IOException e)
+ {
+ log.error(e);
+ }
+ }
+}