aboutsummaryrefslogtreecommitdiff
path: root/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input')
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java85
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java85
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java125
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java82
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java40
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java150
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java160
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java92
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java120
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java283
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java82
-rw-r--r--pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java148
12 files changed, 1452 insertions, 0 deletions
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
new file mode 100644
index 0000000..edcb1d4
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
@@ -0,0 +1,85 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * A PdfDataSource whose complete PDF document is kept in a byte array.
+ *
+ * <p>
+ * Keeping the whole document in memory is expensive for large documents.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class ByteArrayPdfDataSourceImpl implements PdfDataSource
+{
+  /**
+   * The PDF bytes served by this data source.
+   */
+  protected byte[] pdf = null;
+
+  /**
+   * Creates the data source on top of the given array. The array is stored by
+   * reference, it is not copied. The byteArrays performance counter is
+   * incremented once per constructed instance.
+   *
+   * @param pdf
+   *          The PDF bytes.
+   */
+  public ByteArrayPdfDataSourceImpl(byte[] pdf)
+  {
+    PerformanceCounters.byteArrays.increment();
+
+    this.pdf = pdf;
+  }
+
+  /**
+   * Creates the data source from the first length bytes of the given array.
+   * When the array already has exactly that length it is stored by reference,
+   * otherwise the leading length bytes are copied into a new array. The
+   * byteArrays performance counter is incremented once per constructed
+   * instance.
+   *
+   * @param pdf
+   *          The array holding the PDF bytes.
+   * @param length
+   *          The number of leading bytes that make up the document.
+   */
+  public ByteArrayPdfDataSourceImpl(byte[] pdf, int length)
+  {
+    PerformanceCounters.byteArrays.increment();
+
+    byte[] data = pdf;
+    if (pdf.length != length)
+    {
+      // Only the leading part belongs to the document: take a trimmed copy.
+      data = new byte[length];
+      System.arraycopy(pdf, 0, data, 0, length);
+    }
+    this.pdf = data;
+  }
+
+  /**
+   * Returns a new stream over the internal byte array.
+   *
+   * @return A fresh ByteArrayInputStream positioned at the first byte.
+   */
+  public InputStream createInputStream()
+  {
+    return new ByteArrayInputStream(this.pdf);
+  }
+
+  /**
+   * Returns the size of the document.
+   *
+   * @return The length of the internal byte array.
+   */
+  public int getLength()
+  {
+    return this.pdf.length;
+  }
+
+  /**
+   * Returns the internal byte array itself (no copy is made).
+   *
+   * @return The PDF bytes.
+   */
+  public byte[] getAsByteArray()
+  {
+    return this.pdf;
+  }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
new file mode 100644
index 0000000..f5e9b76
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
@@ -0,0 +1,85 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * A PdfDataSource made of an original data source followed by a byte array
+ * appendix.
+ *
+ * @author wprinz
+ */
+public class CompoundPdfDataSourceImpl implements PdfDataSource
+{
+  /**
+   * The data source that provides the leading part of the data.
+   */
+  protected DataSource originalDataSource = null;
+
+  /**
+   * The bytes that follow the original data.
+   */
+  protected byte[] appendix = null;
+
+  /**
+   * The concatenated data, built on the first call of getAsByteArray().
+   */
+  byte [] cache = null;
+
+  /**
+   * Creates the compound data source.
+   *
+   * @param original
+   *          The data source providing the leading part.
+   * @param appendix
+   *          The bytes appended after the original data. Stored by reference.
+   */
+  public CompoundPdfDataSourceImpl (PdfDataSource original, byte [] appendix)
+  {
+    this.originalDataSource = original;
+    this.appendix = appendix;
+  }
+
+  /**
+   * Returns a stream that first delivers the original data and then the
+   * appendix.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    ByteArrayInputStream bais = new ByteArrayInputStream(this.appendix);
+    SequenceInputStream sis = new SequenceInputStream(this.originalDataSource.createInputStream(), bais);
+    return sis;
+  }
+
+  /**
+   * Returns the length of the original data plus the length of the appendix.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return this.originalDataSource.getLength() + this.appendix.length;
+  }
+
+  /**
+   * Returns the original data followed by the appendix as one array. The
+   * array is built once and then returned from the cache on later calls.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray()
+   */
+  public byte[] getAsByteArray()
+  {
+    if (this.cache == null)
+    {
+      int originalLength = this.originalDataSource.getLength();
+      byte[] combined = new byte [getLength()];
+      System.arraycopy(this.originalDataSource.getAsByteArray(), 0, combined, 0, originalLength);
+      System.arraycopy(this.appendix, 0, combined, originalLength, this.appendix.length);
+
+      // Publish the array only after both copies succeeded, so that a failed
+      // copy does not leave an incomplete array in the cache.
+      this.cache = combined;
+    }
+
+    return this.cache;
+  }
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
new file mode 100644
index 0000000..f10b546
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
@@ -0,0 +1,125 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * An input stream that delivers at most a fixed number of bytes from an
+ * underlying stream.
+ *
+ * @author wprinz
+ */
+public class DelimitedInputStream extends InputStream
+{
+  /**
+   * The underlying InputStream.
+   */
+  protected InputStream is = null;
+
+  /**
+   * The number of bytes that can still be read from the stream.
+   */
+  protected int bytes_to_read = -1;
+
+  /**
+   * Constructs the DelimitedInputStream from which a maximum of length bytes
+   * can be read.
+   *
+   * @param is
+   *          The underlying stream.
+   * @param length
+   *          The maximum number of bytes that may be read or skipped.
+   */
+  public DelimitedInputStream(InputStream is, int length)
+  {
+    this.is = is;
+    this.bytes_to_read = length;
+  }
+
+  /**
+   * Reads a single byte unless the limit has been reached.
+   *
+   * @return The byte read (0-255), or -1 when the limit or the end of the
+   *         underlying stream has been reached.
+   * @see java.io.InputStream#read()
+   */
+  public int read() throws IOException
+  {
+    if (this.bytes_to_read <= 0)
+    {
+      return -1;
+    }
+    int read = this.is.read();
+    // Every delivered byte counts against the limit, including the value 0;
+    // only the end-of-stream marker -1 does not.
+    if (read >= 0)
+    {
+      this.bytes_to_read--;
+    }
+    return read;
+  }
+
+  /**
+   * Reads up to len bytes, but never more than the remaining limit.
+   *
+   * @return The number of bytes read, or -1 when at least one byte was
+   *         requested and the limit or the end of the underlying stream has
+   *         been reached.
+   * @see java.io.InputStream#read(byte[], int, int)
+   */
+  public int read(byte[] b, int off, int len) throws IOException
+  {
+    if (len > 0 && this.bytes_to_read <= 0)
+    {
+      // Limit exhausted: report end of stream instead of a 0-byte read, which
+      // callers looping until -1 would never get past.
+      return -1;
+    }
+    int btr = Math.min(len, Math.max(0, this.bytes_to_read));
+    int read = this.is.read(b, off, btr);
+    if (read > 0)
+    {
+      this.bytes_to_read -= read;
+    }
+    return read;
+  }
+
+  /**
+   * @see java.io.InputStream#read(byte[])
+   */
+  public int read(byte[] b) throws IOException
+  {
+    return read(b, 0, b.length);
+  }
+
+  /**
+   * Skips up to n bytes, but never more than the remaining limit.
+   *
+   * @return The number of bytes actually skipped; 0 when nothing can or
+   *         should be skipped.
+   * @see java.io.InputStream#skip(long)
+   */
+  public long skip(long n) throws IOException
+  {
+    long bts = Math.min(n, this.bytes_to_read);
+    if (bts <= 0)
+    {
+      return 0;
+    }
+    long skipped = this.is.skip(bts);
+    if (skipped > 0)
+    {
+      this.bytes_to_read -= skipped;
+    }
+    return skipped;
+  }
+
+  /**
+   * Closes the underlying stream.
+   *
+   * @see java.io.InputStream#close()
+   */
+  public void close() throws IOException
+  {
+    this.is.close();
+  }
+
+  /**
+   * Returns the number of available bytes of the underlying stream, capped by
+   * the remaining limit and never negative.
+   *
+   * @see java.io.InputStream#available()
+   */
+  public int available() throws IOException
+  {
+    int avail = this.is.available();
+    return Math.max(0, Math.min(this.bytes_to_read, avail));
+  }
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
new file mode 100644
index 0000000..ca73f37
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
@@ -0,0 +1,82 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * A PdfDataSource that exposes only the leading part of another
+ * PdfDataSource.
+ *
+ * @author wprinz
+ */
+public class DelimitedPdfDataSource implements PdfDataSource
+{
+
+  /**
+   * The data source whose leading bytes are exposed.
+   */
+  protected PdfDataSource dataSource = null;
+
+  /**
+   * The number of leading bytes that are exposed.
+   */
+  protected int len = -1;
+
+  /**
+   * The delimited data, built on the first call of getAsByteArray().
+   */
+  byte [] cache = null;
+
+  /**
+   * Creates the delimited view.
+   *
+   * @param original
+   *          The data source to delimit.
+   * @param length
+   *          The number of leading bytes of the original to expose.
+   */
+  public DelimitedPdfDataSource (PdfDataSource original, int length)
+  {
+    this.dataSource = original;
+    this.len = length;
+  }
+
+  /**
+   * Returns a stream over the original data that ends after the configured
+   * length.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    InputStream originalIS = this.dataSource.createInputStream();
+    DelimitedInputStream dis = new DelimitedInputStream(originalIS, this.len);
+    return dis;
+  }
+
+  /**
+   * Returns the configured length.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return this.len;
+  }
+
+  /**
+   * Returns a copy of the leading bytes of the original data. The array is
+   * built once and then returned from the cache on later calls.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray()
+   */
+  public byte[] getAsByteArray()
+  {
+    if (this.cache == null)
+    {
+      int length = getLength();
+      byte[] truncated = new byte [length];
+      System.arraycopy(this.dataSource.getAsByteArray(), 0, truncated, 0, length);
+
+      // Publish the array only after the copy succeeded, so that a failed
+      // copy does not leave a zero-filled array in the cache.
+      this.cache = truncated;
+    }
+
+    return this.cache;
+  }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
new file mode 100644
index 0000000..65ee416
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
@@ -0,0 +1,40 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+
+/**
+ * Implemented by data sources that are backed by a file and can reveal it.
+ *
+ * @author wprinz
+ */
+public interface FileBased
+{
+  /**
+   * Gives access to the file that backs this object.
+   *
+   * @return The underlying data file.
+   */
+  File getFile();
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
new file mode 100644
index 0000000..a710c3c
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
@@ -0,0 +1,150 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper;
+
+/**
+ * A PdfDataSource backed by a file in the file system. The file content is
+ * loaded into memory on first use and served from that cache afterwards.
+ *
+ * @author wprinz
+ */
+public class FileBasedPdfDataSourceImpl implements PdfDataSource, FileBased
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(FileBasedPdfDataSourceImpl.class);
+
+  /**
+   * The underlying file.
+   */
+  protected File inputFile = null;
+
+  /**
+   * The maximum number of bytes that are read from the file.
+   */
+  protected int length = -1;
+
+  /**
+   * The file content, loaded on the first call of getAsByteArray().
+   */
+  byte [] cache = null;
+
+  /**
+   * Constructor that creates this PdfDataSource backed by a file in the file
+   * system.
+   *
+   * @param file
+   *          The input File.
+   * @param length
+   *          The length of the InputStream. This is the maximum number of
+   *          bytes that can be read from the stream.
+   * @throws IOException
+   *           Thrown if the file does not exist or cannot be read.
+   */
+  public FileBasedPdfDataSourceImpl(File file, int length) throws IOException
+  {
+    if (!file.exists())
+    {
+      throw new FileNotFoundException("The file '" + file + "' does not exist.");
+    }
+    // An isFile() check is intentionally not performed here: according to the
+    // original author it does not always report correctly.
+    if (!file.canRead())
+    {
+      throw new IOException("The file '" + file + "' cannot be read.");
+    }
+
+    this.length = length;
+    this.inputFile = file;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile()
+   */
+  public File getFile()
+  {
+    return this.inputFile;
+  }
+
+  /**
+   * Returns a stream over the cached file content, loading the content first
+   * if that has not happened yet.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    if (this.cache == null)
+    {
+      // Fills the cache as a side effect.
+      getAsByteArray();
+    }
+    ByteArrayInputStream cached = new ByteArrayInputStream(this.cache);
+    return cached;
+  }
+
+  /**
+   * Opens the underlying file, delimited to the configured length.
+   *
+   * @return The stream, or null if the file could not be opened (the error is
+   *         logged).
+   */
+  protected InputStream createFileInputStream()
+  {
+    try
+    {
+      return new DelimitedInputStream(new FileInputStream(getFile()), getLength());
+    }
+    catch (IOException e)
+    {
+      log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.", e);
+
+      return null;
+    }
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#getLength()
+   */
+  public int getLength()
+  {
+    return this.length;
+  }
+
+  /**
+   * Returns the file content as a byte array. The content is read once
+   * through DataSourceHelper and then returned from the cache on later calls.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray()
+   */
+  public byte[] getAsByteArray()
+  {
+    if (this.cache == null)
+    {
+      this.cache = DataSourceHelper.convertInputStreamToByteArray(createFileInputStream());
+    }
+
+    return this.cache;
+  }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
new file mode 100644
index 0000000..5a84ce2
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
@@ -0,0 +1,160 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.framework.input.TextDataSource;
+import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper;
+
+/**
+ * A TextDataSource whose text is stored in a file with a given character
+ * encoding.
+ *
+ * @author wprinz
+ */
+public class FileBasedTextDataSourceImpl implements TextDataSource, FileBased
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(FileBasedTextDataSourceImpl.class);
+
+  /**
+   * The file that holds the text.
+   */
+  protected File file = null;
+
+  /**
+   * The character encoding used to decode the file content.
+   */
+  protected String characterEncoding = null;
+
+  /**
+   * The raw file content, loaded on the first call of getAsByteArray().
+   */
+  byte [] cache = null;
+
+  /**
+   * Creates the data source for the given file.
+   *
+   * @param file
+   *          The file that holds the text.
+   * @param characterEncoding
+   *          The name of the character encoding of the file content.
+   * @throws IOException
+   *           Thrown if the file does not exist or cannot be read.
+   */
+  public FileBasedTextDataSourceImpl(File file, String characterEncoding) throws IOException
+  {
+    if (!file.exists())
+    {
+      throw new FileNotFoundException("The file '" + file + "' does not exist.");
+    }
+    if (!file.canRead())
+    {
+      throw new IOException("The file '" + file + "' cannot be read.");
+    }
+
+    this.file = file;
+    this.characterEncoding = characterEncoding;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile()
+   */
+  public File getFile()
+  {
+    return this.file;
+  }
+
+  /**
+   * Returns the character encoding.
+   *
+   * @return Returns the character encoding.
+   */
+  public String getCharacterEncoding()
+  {
+    return this.characterEncoding;
+  }
+
+  /**
+   * Reads the file and decodes it with the configured character encoding.
+   *
+   * @return The text, or null if the file could not be opened, read or
+   *         decoded (the error is logged).
+   * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText()
+   */
+  public String getText()
+  {
+    InputStream is = createInputStream();
+    if (is == null)
+    {
+      // createInputStream() has already logged the reason.
+      return null;
+    }
+
+    try
+    {
+      byte[] data = new byte[getLength()];
+      int read = 0;
+      try
+      {
+        int n = 0;
+        while ((n = is.read(data, read, data.length - read)) > 0)
+        {
+          read += n;
+        }
+      }
+      finally
+      {
+        // Release the file handle even when reading fails.
+        is.close();
+      }
+
+      // Decode only the bytes actually read, so that a short read does not
+      // append NUL characters to the text.
+      return new String(data, 0, read, getCharacterEncoding());
+    }
+    catch (IOException e)
+    {
+      log.error("Couldn't read text for file " + getFile() + ". Returning null.", e);
+
+      return null;
+    }
+  }
+
+  /**
+   * Opens the underlying file.
+   *
+   * @return The stream, or null if the file could not be opened (the error is
+   *         logged).
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    try
+    {
+      FileInputStream fis = new FileInputStream(getFile());
+      return fis;
+    }
+    catch (IOException e)
+    {
+      log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.", e);
+
+      return null;
+    }
+  }
+
+  /**
+   * Returns the current size of the file, narrowed to an int.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return (int) getFile().length();
+  }
+
+  /**
+   * Returns the raw file content. The content is read once through
+   * DataSourceHelper and then returned from the cache on later calls.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray()
+   */
+  public byte[] getAsByteArray()
+  {
+    if (cache != null)
+    {
+      return cache;
+    }
+
+    cache = DataSourceHelper.convertInputStreamToByteArray(createInputStream());
+
+    return cache;
+  }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
new file mode 100644
index 0000000..c1dcc03
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
@@ -0,0 +1,92 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSourceHolder;
+import at.gv.egiz.pdfas.framework.input.correction.Corrector;
+import at.gv.egiz.pdfas.framework.input.correction.CorrectorFactory;
+import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper;
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+import at.knowcenter.wag.exactparser.ParseDocument;
+
+/**
+ * Parses the given PDF document into a list of Incremental Update blocks.
+ * @author wprinz
+ */
+public class IncrementalUpdateParser
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(IncrementalUpdateParser.class);
+
+  /**
+   * Parses the document of the given holder into Incremental Update blocks.
+   *
+   * <p>
+   * If parsing fails and the setting
+   * "correct_document_on_verify_if_necessary" is "true", the document is run
+   * through a Corrector and parsed again; on success the corrected data
+   * source replaces the one in the holder. In every other failure case the
+   * original parse error is logged once and reported to the caller.
+   * </p>
+   *
+   * @param pdfDataSource
+   *          The holder of the PDF data source to parse.
+   * @return The list of blocks produced by ParseDocument.
+   * @throws PDFDocumentException
+   *           Thrown with ErrorCode.DOCUMENT_CANNOT_BE_READ and the original
+   *           parse error as cause if the document cannot be parsed.
+   */
+  public static List parsePdfIntoIUBlocks (PdfDataSourceHolder pdfDataSource) throws PDFDocumentException
+  {
+    log.trace("parsePdfIntoIUBlocks:");
+
+    List blocks = null;
+    try
+    {
+      byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(pdfDataSource.getDataSource());
+      blocks = ParseDocument.parseDocument(pdf);
+    }
+    catch (Exception e)
+    {
+      log.debug("Error while parsing Document.", e);
+
+      boolean corrected = false;
+      try
+      {
+        boolean tryToCorrect = SettingsReader.getInstance().getSetting("correct_document_on_verify_if_necessary", "false").equals("true");
+        if (tryToCorrect)
+        {
+          log.info("Correcting document...");
+          Corrector cor = CorrectorFactory.createCorrector();
+          PdfDataSource correctedDS = cor.correctDocument(pdfDataSource.getDataSource());
+          log.info("Correction finished.");
+          byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(correctedDS);
+          blocks = ParseDocument.parseDocument(pdf);
+          pdfDataSource.setDataSource(correctedDS);
+          corrected = true;
+        }
+      }
+      catch (Exception e1)
+      {
+        // The original parse error is what gets reported below; keep the
+        // follow-up failure visible in the log instead of dropping it.
+        log.debug("Correcting the document failed.", e1);
+      }
+
+      // Raised outside the inner try so that the error is logged exactly once.
+      if (!corrected)
+      {
+        makeError(e);
+      }
+    }
+
+    log.trace("parsePdfIntoIUBlocks finished.");
+    return blocks;
+  }
+
+  /**
+   * Logs the given parse error and rethrows it wrapped in a
+   * PDFDocumentException with ErrorCode.DOCUMENT_CANNOT_BE_READ.
+   *
+   * @param e
+   *          The error that prevented parsing.
+   * @throws PDFDocumentException
+   *           Always thrown.
+   */
+  private static void makeError(Exception e) throws PDFDocumentException {
+    log.error("Error while parsing Document into IU blocks.", e);
+    throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+  }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
new file mode 100644
index 0000000..fa5ab04
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
@@ -0,0 +1,120 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import at.gv.egiz.pdfas.framework.input.TextDataSource;
+
+/**
+ * A TextDataSource that holds its text in memory as a String.
+ *
+ * <p>
+ * This is fast for short texts, but may perform badly memory-wise for long
+ * ones. Use a FileBased TextDataSource instead if memory is an issue.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class TextDataSourceImpl implements TextDataSource
+{
+  /**
+   * The name of the charset used to turn the text into bytes.
+   */
+  private final static String CHARSET = "UTF-8";
+
+  /**
+   * The text.
+   */
+  protected String text = null;
+
+  /**
+   * Constructor that sets the text.
+   *
+   * @param text
+   *          The text.
+   */
+  public TextDataSourceImpl(String text)
+  {
+    this.text = text;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText()
+   */
+  public String getText()
+  {
+    return this.text;
+  }
+
+  /**
+   * Returns a stream over the UTF-8 encoded text.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    // PERF: if memory is an issue (e.g. in web), use a FileBased TextDataSource instead.
+    return new ByteArrayInputStream(encodeText());
+  }
+
+  /**
+   * Returns the number of bytes of the UTF-8 encoded text.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return encodeText().length;
+  }
+
+  /**
+   * Returns the UTF-8 encoded text. The text is encoded anew on every call.
+   *
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray()
+   */
+  public byte[] getAsByteArray()
+  {
+    return encodeText();
+  }
+
+  /**
+   * Encodes the current text with the charset named by CHARSET.
+   *
+   * @return The encoded bytes.
+   * @throws RuntimeException
+   *           Wrapping the UnsupportedEncodingException if the charset is not
+   *           supported.
+   */
+  private byte[] encodeText()
+  {
+    try
+    {
+      return getText().getBytes(CHARSET);
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      throw new RuntimeException(e);
+    }
+  }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java
new file mode 100644
index 0000000..efd094a
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java
@@ -0,0 +1,283 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input.correction;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.exceptions.framework.CorrectorException;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.framework.input.correction.Corrector;
+import at.gv.egiz.pdfas.impl.input.FileBased;
+import at.gv.egiz.pdfas.impl.input.FileBasedPdfDataSourceImpl;
+import at.gv.egiz.pdfas.utils.TempDirHelper;
+import at.knowcenter.wag.egov.egiz.cfg.SettingsReader;
+import at.knowcenter.wag.egov.egiz.exceptions.SettingNotFoundException;
+import at.knowcenter.wag.egov.egiz.exceptions.SettingsException;
+
+/**
+ * Corrects the document using an external command line tool.
+ *
+ * <p>
+ * Process.destroy after a certain timeout does not work if the executable is a
+ * Windows batch file.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class ExternalCorrector implements Corrector
+{
+ /**
+  * Placeholder in the configured command line whose first occurrence is
+  * replaced with the absolute path of the input document.
+  */
+ public static final String INPUT_DOCUMENT_REPLACE = "##input_document##";
+
+ /**
+  * Placeholder in the configured command line whose first occurrence is
+  * replaced with the absolute path of the output document.
+  */
+ public static final String OUTPUT_DOCUMENT_REPLACE = "##output_document##";
+
+ /**
+  * Settings key under which the command line of the external corrector is
+  * looked up.
+  */
+ public static final String COMMANDLINE_KEY = "external_corrector_commandline";
+
+ /**
+  * Settings key under which the timeout of the external corrector is looked
+  * up.
+  */
+ public static final String TIMEOUT_KEY = "external_corrector_timeout";
+
+ /**
+  * Value passed to the settings lookup together with {@link #TIMEOUT_KEY}
+  * (presumably the fallback when the key is not configured - verify against
+  * SettingsReader). The resulting timeout is handed to Thread.sleep, so it is
+  * interpreted as milliseconds.
+  */
+ protected static final int DEFAULT_TIMEOUT = 1000;
+
+ /**
+  * The log.
+  */
+ private static final Log log = LogFactory.getLog(ExternalCorrector.class);
+
+ /**
+  * Corrects the document by running the configured external command line
+  * tool.
+  *
+  * <p>
+  * A {@link FileBased} document is passed to the tool through its own file;
+  * any other document is first handed to
+  * TempDirHelper.placeInputIntoTempDirFile to obtain a file. The placeholders
+  * {@link #INPUT_DOCUMENT_REPLACE} and {@link #OUTPUT_DOCUMENT_REPLACE} in the
+  * command line read from the settings are replaced with the absolute paths
+  * of the input and output file. STDOUT and STDERR of the process are logged
+  * line by line while a timeout thread destroys the process if it runs too
+  * long.
+  * </p>
+  *
+  * @param document
+  *          The document to be corrected.
+  * @return Returns a file based data source on the output file written by the
+  *         external tool.
+  * @throws CorrectorException
+  *           Thrown with EXTERNAL_CORRECTOR_TIMEOUT_REACHED if the process ran
+  *           into the timeout, or with CORRECTOR_EXCEPTION if an I/O error, an
+  *           interruption or a settings problem occurred.
+  *
+  * @see at.gv.egiz.pdfas.framework.input.correction.Corrector#correctDocument(at.gv.egiz.pdfas.framework.input.PdfDataSource)
+  */
+ public PdfDataSource correctDocument(PdfDataSource document) throws CorrectorException
+ {
+
+   try
+   {
+     String outName = null;
+     File in = null;
+     if (document instanceof FileBased)
+     {
+       FileBased fb = (FileBased) document;
+       in = fb.getFile();
+       outName = in.getName() + "_correction_outfile.pdf";
+     }
+     else
+     {
+       in = TempDirHelper.placeInputIntoTempDirFile(document.createInputStream(), "correction_infile.pdf");
+       outName = "correction_outfile.pdf";
+     }
+
+     File out = TempDirHelper.formTempFile(outName);
+
+     String commandline = SettingsReader.getInstance().getSetting(COMMANDLINE_KEY);
+     long timeout = SettingsReader.getInstance().getIntSetting(TIMEOUT_KEY, DEFAULT_TIMEOUT);
+
+     // replaceFirst interprets '\' and '$' in the replacement string.
+     // quoteReplacement escapes both, so paths containing a '$' are inserted
+     // literally instead of being treated as group references.
+     String inF = in.getAbsolutePath();
+     commandline = commandline.replaceFirst(INPUT_DOCUMENT_REPLACE, java.util.regex.Matcher.quoteReplacement(inF));
+     String outF = out.getAbsolutePath();
+     commandline = commandline.replaceFirst(OUTPUT_DOCUMENT_REPLACE, java.util.regex.Matcher.quoteReplacement(outF));
+
+     log.info(commandline);
+
+     Process p = Runtime.getRuntime().exec(commandline);
+
+     Thread outT = null;
+     Thread errT = null;
+     TimeoutThread tt = null;
+     BufferedReader outReader = null;
+     BufferedReader errReader = null;
+
+     try
+     {
+       outReader = new BufferedReader(new InputStreamReader(p.getInputStream()));
+       errReader = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+
+       outT = new Thread(new ReaderPrinter(outReader, "STDOUT"));
+       errT = new Thread(new ReaderPrinter(errReader, "STDERR"));
+
+       tt = new TimeoutThread(p, timeout, new Thread[] { outT, errT });
+
+       tt.start();
+       outT.start();
+       errT.start();
+
+       log.trace("Joining the STDOUT thread...");
+       outT.join();
+       log.trace("STDOUT thread joined.");
+       log.trace("Joining the STDERR thread...");
+       errT.join();
+       log.trace("STDERR thread joined.");
+
+       log.trace("Waiting for process to end...");
+       p.waitFor();
+       log.trace("process has ended.");
+
+       log.trace("Interrupting timeout thread...");
+       tt.interrupt();
+       log.trace("timeout thread has been interrupted.");
+
+       int exitValue = p.exitValue();
+       log.info("External Corrector exited with: " + exitValue);
+
+       if (tt.isTimedOut())
+       {
+         throw new CorrectorException(ErrorCode.EXTERNAL_CORRECTOR_TIMEOUT_REACHED, "The external corrector process timed out. timeout = " + timeout);
+       }
+
+       PdfDataSource ds = new FileBasedPdfDataSourceImpl(out, (int) out.length());
+       return ds;
+     }
+     finally
+     {
+       if (outT != null)
+       {
+         outT.interrupt();
+       }
+       if (errT != null)
+       {
+         errT.interrupt();
+       }
+       if (tt != null)
+       {
+         tt.interrupt();
+       }
+       try
+       {
+         // Nested so that a failing close of STDOUT cannot skip closing STDERR.
+         try
+         {
+           if (outReader != null)
+           {
+             outReader.close();
+           }
+         }
+         finally
+         {
+           if (errReader != null)
+           {
+             errReader.close();
+           }
+         }
+       }
+       finally
+       {
+         // The timeout thread has been interrupted above and will no longer
+         // kill the process. Destroy it here so that it is not left running
+         // when this method is left abnormally. On the regular path the
+         // process has already ended at this point.
+         p.destroy();
+       }
+     }
+
+   }
+   catch (IOException e)
+   {
+     throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e);
+   }
+   catch (InterruptedException e)
+   {
+     // Restore the interrupt status so that callers can still observe it.
+     Thread.currentThread().interrupt();
+     throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e);
+   }
+   catch (SettingNotFoundException e)
+   {
+     throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e);
+   }
+   catch (SettingsException e)
+   {
+     throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e);
+   }
+ }
+
+ /**
+  * Runnable that reads a stream line by line and writes every line to the
+  * log at info level, prefixed with the name of the stream.
+  */
+ protected static class ReaderPrinter implements Runnable
+ {
+   /**
+    * The reader the lines are taken from.
+    */
+   protected BufferedReader reader = null;
+
+   /**
+    * The name that is prepended to every logged line.
+    */
+   protected String streamName = null;
+
+   /**
+    * Constructor.
+    *
+    * @param reader
+    *          The reader to be drained.
+    * @param streamName
+    *          The name used as prefix in the log output.
+    */
+   public ReaderPrinter(BufferedReader reader, String streamName)
+   {
+     this.reader = reader;
+     this.streamName = streamName;
+   }
+
+   /**
+    * Logs all lines until the reader signals the end of the stream. An
+    * IOException ends the loop and is logged as error.
+    */
+   public void run()
+   {
+     try
+     {
+       for (String current = this.reader.readLine(); current != null; current = this.reader.readLine())
+       {
+         log.info(streamName + ": " + current);
+       }
+     }
+     catch (IOException e)
+     {
+       log.error(e.getMessage(), e);
+     }
+   }
+ }
+
+ /**
+  * Thread that destroys a process and interrupts a set of threads once a
+  * timeout has elapsed. Interrupting this thread before the timeout elapses
+  * cancels the destruction.
+  */
+ protected static class TimeoutThread extends Thread
+ {
+   /**
+    * The process to be destroyed when the timeout is reached.
+    */
+   protected Process proc = null;
+
+   /**
+    * The timeout as passed to Thread.sleep, i.e. in milliseconds.
+    */
+   protected long timeout = -1;
+
+   /**
+    * Tells if the timeout was reached. Written by this thread and read by the
+    * thread that waits for the process, hence volatile.
+    */
+   protected volatile boolean ranIntoTimeout = false;
+
+   /**
+    * The threads to be interrupted when the timeout is reached.
+    */
+   protected Thread[] threads;
+
+   /**
+    * Not used by this class; kept for compatibility with subclasses.
+    */
+   protected BufferedReader errReader;
+
+   /**
+    * Constructor.
+    *
+    * @param proc
+    *          The process to be destroyed on timeout.
+    * @param timeout
+    *          The time to wait before the process is destroyed.
+    * @param threadsToInterrupt
+    *          The threads to be interrupted on timeout.
+    */
+   public TimeoutThread(Process proc, long timeout, Thread[] threadsToInterrupt)
+   {
+     this.proc = proc;
+     this.timeout = timeout;
+     this.threads = threadsToInterrupt;
+   }
+
+   /**
+    * Sleeps for the timeout, then destroys the process and interrupts the
+    * given threads. Ends silently if interrupted while sleeping.
+    */
+   public void run()
+   {
+     try
+     {
+       Thread.sleep(this.timeout);
+       log.info("The timeout was reached. Destroying the process.");
+       // Set the flag before destroying the process: the thread waiting for
+       // the process is woken up by the destroy and checks isTimedOut() right
+       // away, so the flag must already be visible at that point.
+       ranIntoTimeout = true;
+       proc.destroy();
+       log.trace("destroy has been called.");
+       log.trace("Interrupting threads...");
+       for (int i = 0; i < this.threads.length; i++)
+       {
+         this.threads[i].interrupt();
+       }
+       log.trace("threads have been interrupted.");
+     }
+     catch (InterruptedException e)
+     {
+       log.debug("Timeout thread interrupted. This means that the process finished successfully.");
+     }
+   }
+
+   /**
+    * Tells, if the process ran into the timeout.
+    *
+    * @return Returns true if the timeout was reached. Returns false if the
+    *         timeout was not reached.
+    */
+   public boolean isTimedOut()
+   {
+     return this.ranIntoTimeout;
+   }
+ }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java
new file mode 100644
index 0000000..eaa6b7f
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java
@@ -0,0 +1,82 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input.correction;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.exceptions.framework.CorrectorException;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+import at.gv.egiz.pdfas.framework.input.correction.Corrector;
+import at.gv.egiz.pdfas.impl.input.ByteArrayPdfDataSourceImpl;
+import at.gv.egiz.pdfas.utils.PDFASUtils;
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+
+import com.lowagie.text.DocumentException;
+import com.lowagie.text.pdf.PdfReader;
+import com.lowagie.text.pdf.PdfStamper;
+
+/**
+ * Corrects a document using iText.
+ *
+ * @author wprinz
+ */
+public class InternalCorrector implements Corrector
+{
+
+ /**
+  * Corrects the document by reading it with a PdfReader and writing it out
+  * again through a PdfStamper into an in-memory buffer.
+  *
+  * @param document
+  *          The document to be corrected.
+  * @return Returns a byte array based data source holding the rewritten
+  *         document.
+  * @throws CorrectorException
+  *           Thrown with CORRECTOR_EXCEPTION on a DocumentException or an
+  *           IOException, or with the error code of a PDFDocumentException
+  *           that occurred.
+  *
+  * @see at.gv.egiz.pdfas.framework.input.correction.Corrector#correctDocument(at.gv.egiz.pdfas.framework.input.PdfDataSource)
+  */
+ public PdfDataSource correctDocument(PdfDataSource document) throws CorrectorException
+ {
+   try
+   {
+     byte[] originalPdf = document.getAsByteArray();
+     PdfReader pdfReader = new PdfReader(originalPdf);
+     PDFASUtils.checkReaderPermissions(pdfReader);
+
+     // The original size is used as initial capacity of the output buffer.
+     ByteArrayOutputStream correctedOut = new ByteArrayOutputStream(originalPdf.length);
+
+     // Closing the stamper writes the document to the buffer.
+     new PdfStamper(pdfReader, correctedOut, '\0', false).close();
+     correctedOut.close();
+
+     return new ByteArrayPdfDataSourceImpl(correctedOut.toByteArray());
+   }
+   catch (DocumentException e)
+   {
+     throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e);
+   }
+   catch (IOException e)
+   {
+     throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e);
+   }
+   catch (PDFDocumentException e)
+   {
+     throw new CorrectorException(e.getErrorCode(), e);
+   }
+ }
+
+}
diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java
new file mode 100644
index 0000000..76a5f99
--- /dev/null
+++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java
@@ -0,0 +1,148 @@
+/**
+ * <copyright> Copyright 2006 by Know-Center, Graz, Austria </copyright>
+ * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a
+ * joint initiative of the Federal Chancellery Austria and Graz University of
+ * Technology.
+ *
+ * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by
+ * the European Commission - subsequent versions of the EUPL (the "Licence");
+ * You may not use this work except in compliance with the Licence.
+ * You may obtain a copy of the Licence at:
+ * http://www.osor.eu/eupl/
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Licence is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Licence for the specific language governing permissions and
+ * limitations under the Licence.
+ *
+ * This product combines work with different licenses. See the "NOTICE" text
+ * file for details on the various modules and licenses.
+ * The "NOTICE" text file is part of the distribution. Any derivative works
+ * that you distribute must include a readable copy of the "NOTICE" text file.
+ */
+package at.gv.egiz.pdfas.impl.input.helper;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * @author wprinz
+ *
+ */
+public class DataSourceHelper
+{
+ /**
+ * The log.
+ */
+ private static final Log log = LogFactory.getLog(DataSourceHelper.class);
+
+ /**
+  * Converts a DataSource to a byte array by delegating to
+  * {@link DataSource#getAsByteArray()}.
+  *
+  * <p>
+  * Note that this function is very memory intensive. Use the Streams wherever
+  * possible.
+  * </p>
+  *
+  * @deprecated This method only forwards to
+  *             {@link DataSource#getAsByteArray()}; call that directly.
+  *
+  * @param pdfDataSource
+  *          The data source to be converted.
+  * @return Returns the byte array provided by the data source.
+  */
+ public static byte[] convertDataSourceToByteArray(DataSource pdfDataSource)
+ {
+ return pdfDataSource.getAsByteArray();
+ // An earlier implementation copied createInputStream() manually into an
+ // array of getLength() bytes; it has been replaced by the delegation above.
+ }
+
+ /**
+  * Reads the given stream into a byte array, see
+  * {@link #convertInputStreamToByteArrayIOEx(InputStream)}, and converts an
+  * IOException into a RuntimeException.
+  *
+  * @param inputStream
+  *          The stream to be read.
+  * @return Returns the bytes read from the stream.
+  * @throws RuntimeException
+  *           Thrown if an IOException occurred; the IOException is the cause.
+  */
+ public static byte [] convertInputStreamToByteArray(InputStream inputStream)
+ {
+   try
+   {
+     return convertInputStreamToByteArrayIOEx(inputStream);
+   }
+   catch (IOException e)
+   {
+     // Pass the exception as throwable so that the stack trace is logged
+     // and not only the result of toString().
+     log.error(e.getMessage(), e);
+     throw new RuntimeException(e);
+   }
+ }
+
+ /**
+  * Reads the given stream into a byte array and closes the stream.
+  *
+  * <p>
+  * The stream is closed on the error path as well. A failure of that close
+  * is only logged, so that the exception that interrupted the reading is the
+  * one that reaches the caller.
+  * </p>
+  *
+  * @param inputStream
+  *          The stream to be read. It is closed by this method.
+  * @return Returns the bytes read from the stream.
+  * @throws IOException
+  *           Thrown if reading or closing the stream fails.
+  */
+ public static byte [] convertInputStreamToByteArrayIOEx(InputStream inputStream) throws IOException
+ {
+   PerformanceCounters.byteArrays.increment();
+
+   ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
+
+   byte[] temp = new byte[4096];
+
+   boolean finished = false;
+   try
+   {
+     int n = 0;
+     while ((n = inputStream.read(temp)) > 0)
+     {
+       baos.write(temp, 0, n);
+     }
+     finished = true;
+   }
+   finally
+   {
+     if (finished)
+     {
+       // Regular path: a failing close is reported to the caller.
+       inputStream.close();
+     }
+     else
+     {
+       try
+       {
+         inputStream.close();
+       }
+       catch (IOException closeEx)
+       {
+         // Do not mask the exception that is already propagating.
+         log.debug(closeEx.getMessage(), closeEx);
+       }
+     }
+   }
+
+   // Closing a ByteArrayOutputStream has no effect, so it is not closed.
+   return baos.toByteArray();
+ }
+
+ /**
+  * Writes the content of the data source to the given file.
+  *
+  * <p>
+  * An IOException is logged and not reported to the caller. Both streams are
+  * closed even if the copying fails.
+  * </p>
+  *
+  * @param dataSource
+  *          The data source to be written.
+  * @param file
+  *          The file the data is written to.
+  */
+ public static void debugDataSourceToFile(DataSource dataSource, File file)
+ {
+   try
+   {
+     InputStream is = dataSource.createInputStream();
+     try
+     {
+       FileOutputStream fos = new FileOutputStream(file);
+       try
+       {
+         byte[] data = new byte[2048];
+         int n = -1;
+         while ((n = is.read(data)) > 0)
+         {
+           fos.write(data, 0, n);
+         }
+       }
+       finally
+       {
+         fos.close();
+       }
+     }
+     finally
+     {
+       is.close();
+     }
+   }
+   catch (IOException e)
+   {
+     // Pass the exception as throwable so that the stack trace is logged.
+     log.error(e.getMessage(), e);
+   }
+ }
+}