From 3d982813b34f6f230baf4a467cdc37ec92a77595 Mon Sep 17 00:00:00 2001
From: netconomy
Date: Fri, 17 Aug 2007 06:10:56 +0000
Subject: Performance

git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@167 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c
---
Review note: the line structure of this patch was restored from a flattened
rendering, and the content below differs from the committed blobs as follows.
 * DelimitedInputStream: read() now counts bytes with value 0 against the
   limit, and read(byte[], int, int) returns -1 (not 0) once the limit is
   reached. Hunk size and diffstat are adjusted (105 -> 111 lines).
 * FileBasedPdfDataSourceImpl, DataSourceHelper: Javadoc wording only.
 * Javadoc <p> / </p> tags that were lost in extraction were put back.
The index hashes still name the originally committed blobs.

 .../impl/input/ByteArrayPdfDataSourceImpl.java     |  56 ++++++++++
 .../impl/input/CompoundPdfDataSourceImpl.java      |  47 ++++++++
 .../pdfas/impl/input/DelimitedInputStream.java     | 111 +++++++++++++++++
 .../pdfas/impl/input/DelimitedPdfDataSource.java   |  44 ++++++++
 .../at/gv/egiz/pdfas/impl/input/FileBased.java     |  20 ++++
 .../impl/input/FileBasedPdfDataSourceImpl.java     | 103 +++++++++++++++++
 .../impl/input/FileBasedTextDataSourceImpl.java    | 124 +++++++++++++++++++++
 .../pdfas/impl/input/IncrementalUpdateParser.java  |  49 ++++++++
 .../egiz/pdfas/impl/input/TextDataSourceImpl.java  |  82 ++++++++++++++
 .../pdfas/impl/input/helper/DataSourceHelper.java  |  92 +++++++++++++++
 10 files changed, 728 insertions(+)
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
 create mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java

(limited to 'src/main/java/at/gv/egiz/pdfas/impl/input')

diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
new file mode 100644
index 0000000..0d27781
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java
@@ -0,0 +1,56 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * Implements a PdfDataSource that holds the whole PDF document in a byte array.
+ *
+ * <p>
+ * Note that holding the data in a byte array is very memory consuming for large
+ * documents.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class ByteArrayPdfDataSourceImpl implements PdfDataSource
+{
+  protected byte[] pdf = null;
+
+  protected int length = -1;
+
+  public ByteArrayPdfDataSourceImpl(byte[] pdf)
+  {
+    PerformanceCounters.byteArrays.increment();
+
+    this.pdf = pdf;
+    this.length = pdf.length;
+  }
+
+  public ByteArrayPdfDataSourceImpl(byte[] pdf, int length)
+  {
+    PerformanceCounters.byteArrays.increment();
+
+    this.pdf = pdf;
+    this.length = length;
+  }
+
+
+  public InputStream createInputStream()
+  {
+    ByteArrayInputStream bais = new ByteArrayInputStream(this.pdf, 0, this.length);
+    return bais;
+  }
+
+  public int getLength()
+  {
+    return this.length;
+  }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
new file mode 100644
index 0000000..f77d6be
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java
@@ -0,0 +1,47 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class CompoundPdfDataSourceImpl implements PdfDataSource
+{
+  protected DataSource originalDataSource = null;
+
+  protected byte[] appendix = null;
+
+  public CompoundPdfDataSourceImpl (PdfDataSource original, byte [] appendix)
+  {
+    this.originalDataSource = original;
+    this.appendix = appendix;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    ByteArrayInputStream bais = new ByteArrayInputStream(this.appendix);
+    SequenceInputStream sis = new SequenceInputStream(this.originalDataSource.createInputStream(), bais);
+    return sis;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return this.originalDataSource.getLength() + this.appendix.length;
+  }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
new file mode 100644
index 0000000..4be9ec5
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java
@@ -0,0 +1,111 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * An input stream that has a delimited length.
+ *
+ * @author wprinz
+ */
+public class DelimitedInputStream extends InputStream
+{
+  /**
+   * The underlying InputStream.
+   */
+  protected InputStream is = null;
+
+  /**
+   * The number of bytes that can be read from the stream.
+   */
+  protected int bytes_to_read = -1;
+
+  /**
+   * Constructs the DelimitedInputStream from which a maximum of length bytes
+   * can be read.
+   */
+  public DelimitedInputStream(InputStream is, int length)
+  {
+    this.is = is;
+    this.bytes_to_read = length;
+  }
+
+  /**
+   * @see java.io.InputStream#read()
+   */
+  public int read() throws IOException
+  {
+    if (this.bytes_to_read <= 0)
+    {
+      return -1;
+    }
+    int read = this.is.read();
+    // A byte value of 0 is valid data; only -1 signals the end of the stream.
+    if (read >= 0)
+    {
+      this.bytes_to_read--;
+    }
+    return read;
+  }
+
+  /**
+   * @see java.io.InputStream#read(byte[], int, int)
+   */
+  public int read(byte[] b, int off, int len) throws IOException
+  {
+    // Once the limit is reached, report end of stream (-1) instead of 0.
+    if (len > 0 && this.bytes_to_read <= 0)
+    {
+      return -1;
+    }
+    int btr = Math.min(len, this.bytes_to_read);
+    int read = this.is.read(b, off, btr);
+    if (read > 0)
+    {
+      this.bytes_to_read -= read;
+    }
+    return read;
+  }
+
+  /**
+   * @see java.io.InputStream#read(byte[])
+   */
+  public int read(byte[] b) throws IOException
+  {
+    return read(b, 0, b.length);
+  }
+
+  /**
+   * @see java.io.InputStream#skip(long)
+   */
+  public long skip(long n) throws IOException
+  {
+    long bts = Math.min(n, this.bytes_to_read);
+    long skipped = this.is.skip(bts);
+    if (skipped > 0)
+    {
+      this.bytes_to_read -= skipped;
+    }
+    return skipped;
+  }
+
+  /**
+   * @see java.io.InputStream#close()
+   */
+  public void close() throws IOException
+  {
+    this.is.close();
+  }
+
+  /**
+   * @see java.io.InputStream#available()
+   */
+  public int available() throws IOException
+  {
+    int avail = this.is.available();
+    return Math.min(this.bytes_to_read, avail);
+  }
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
new file mode 100644
index 0000000..6c67be2
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java
@@ -0,0 +1,44 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class DelimitedPdfDataSource implements PdfDataSource
+{
+
+  protected PdfDataSource dataSource = null;
+  protected int len = -1;
+
+  public DelimitedPdfDataSource (PdfDataSource original, int length)
+  {
+    this.dataSource = original;
+    this.len = length;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    InputStream originalIS = this.dataSource.createInputStream();
+    DelimitedInputStream dis = new DelimitedInputStream(originalIS, this.len);
+    return dis;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return this.len;
+  }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
new file mode 100644
index 0000000..54f8842
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java
@@ -0,0 +1,20 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+
+/**
+ * Interface that reveals the underlying data file.
+ *
+ * @author wprinz
+ */
+public interface FileBased
+{
+  /**
+   * Returns the underlying data file.
+   * @return Returns the underlying data file.
+   */
+  public File getFile();
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
new file mode 100644
index 0000000..8453192
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java
@@ -0,0 +1,103 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class FileBasedPdfDataSourceImpl implements PdfDataSource, FileBased
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(FileBasedPdfDataSourceImpl.class);
+
+  /**
+   * The underlying file.
+   */
+  protected File inputFile = null;
+
+  protected int length = -1;
+
+  /**
+   * Constructor that creates this PdfDataSource backed by a file in the file
+   * system.
+   *
+   * @param file
+   *          The input File.
+   * @param length
+   *          The length of the InputStream. This is the maximum number of bytes
+   *          that can be read from the stream.
+   * @throws IOException
+   *           Thrown if the file cannot be read properly.
+   */
+  public FileBasedPdfDataSourceImpl(File file, int length) throws IOException
+  {
+
+    if (!file.exists())
+    {
+      throw new FileNotFoundException("The file '" + file + "' does not exist.");
+    }
+    // for some reason the isFile is not always correct...
+    // if (file.isFile())
+    // {
+    // throw new IOException("The file '" + file + "' is not a normal file.");
+    // }
+    if (!file.canRead())
+    {
+      throw new IOException("The file '" + file + "' cannot be read.");
+    }
+
+    this.inputFile = file;
+    this.length = length;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile()
+   */
+  public File getFile()
+  {
+    return this.inputFile;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    try
+    {
+      FileInputStream fis = new FileInputStream(getFile());
+      DelimitedInputStream dis = new DelimitedInputStream(fis, getLength());
+      return dis;
+    }
+    catch (IOException e)
+    {
+      log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.");
+      log.error(e);
+
+      return null;
+    }
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#getLength()
+   */
+  public int getLength()
+  {
+    return this.length;
+  }
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
new file mode 100644
index 0000000..6f6c7b4
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java
@@ -0,0 +1,124 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.gv.egiz.pdfas.framework.input.TextDataSource;
+
+/**
+ * @author wprinz
+ *
+ */
+public class FileBasedTextDataSourceImpl implements TextDataSource, FileBased
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(FileBasedTextDataSourceImpl.class);
+
+  protected File file = null;
+
+  protected String characterEncoding = null;
+
+  public FileBasedTextDataSourceImpl(File file, String characterEncoding) throws IOException
+  {
+    if (!file.exists())
+    {
+      throw new FileNotFoundException("The file '" + file + "' does not exist.");
+    }
+    if (!file.canRead())
+    {
+      throw new IOException("The file '" + file + "' cannot be read.");
+    }
+
+    this.file = file;
+    this.characterEncoding = characterEncoding;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile()
+   */
+  public File getFile()
+  {
+    return this.file;
+  }
+
+  /**
+   * Returns the character encoding.
+   *
+   * @return Returns the character encoding.
+   */
+  public String getCharacterEncoding()
+  {
+    return this.characterEncoding;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText()
+   */
+  public String getText()
+  {
+    try
+    {
+      InputStream is = createInputStream();
+      byte[] data = new byte[getLength()];
+      int read = 0;
+      int n = 0;
+      while ((n = is.read(data, read, data.length - read)) > 0)
+      {
+        read += n;
+      }
+      is.close();
+
+      String text = new String(data, getCharacterEncoding());
+
+      data = null;
+
+      return text;
+    }
+    catch (IOException e)
+    {
+      log.error("Couldn't read text for file " + getFile() + ". Returning null.");
+      log.error(e);
+
+      return null;
+    }
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    try
+    {
+      FileInputStream fis = new FileInputStream(getFile());
+      return fis;
+    }
+    catch (IOException e)
+    {
+      log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.");
+      log.error(e);
+
+      return null;
+    }
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    return (int) getFile().length();
+  }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java b/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
new file mode 100644
index 0000000..b4c2bef
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java
@@ -0,0 +1,49 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.util.List;
+
+import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper;
+import at.gv.egiz.pdfas.exceptions.ErrorCode;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException;
+import at.knowcenter.wag.exactparser.ParseDocument;
+
+/**
+ * Parses the given PDF document into a list of Incremental Update blocks.
+ * @author wprinz
+ */
+public class IncrementalUpdateParser
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(IncrementalUpdateParser.class);
+
+  public static List parsePdfIntoIUBlocks (PdfDataSource pdfDataSource) throws PDFDocumentException
+  {
+    log.trace("parsePdfIntoIUBlocks:");
+
+    List blocks = null;
+    try
+    {
+      byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(pdfDataSource);
+      blocks = ParseDocument.parseDocument(pdf);
+    }
+    catch (Exception e)
+    {
+      log.error("Error while parsing Document into IU blocks.", e);
+      throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e);
+    }
+
+    log.trace("parsePdfIntoIUBlocks finished.");
+    return blocks;
+  }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
new file mode 100644
index 0000000..b259a3e
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java
@@ -0,0 +1,82 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import at.gv.egiz.pdfas.framework.input.TextDataSource;
+
+/**
+ * A TextDataSource that keeps the text in memory.
+ *
+ * <p>
+ * Keeping the text in memory is fast as long as the text is short, but may
+ * result in bad memory performance when the text is longer. Use a FileBased
+ * TextDataSource instead if memory is an issue.
+ * </p>
+ *
+ * @author wprinz
+ */
+public class TextDataSourceImpl implements TextDataSource
+{
+  /**
+   * The text.
+   */
+  protected String text = null;
+
+  /**
+   * Constructor that sets the text.
+   *
+   * @param text
+   *          The text.
+   */
+  public TextDataSourceImpl(String text)
+  {
+    this.text = text;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText()
+   */
+  public String getText()
+  {
+    return this.text;
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream()
+   */
+  public InputStream createInputStream()
+  {
+    try
+    {
+      byte[] data = getText().getBytes("UTF-8");
+      // PERF: if memory is an issue (e.g. in web), use a FileBased TextDataSource instead.
+      return new ByteArrayInputStream(data);
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength()
+   */
+  public int getLength()
+  {
+    try
+    {
+      byte[] data = getText().getBytes("UTF-8");
+      return data.length;
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      throw new RuntimeException(e);
+    }
+  }
+
+}
diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java b/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java
new file mode 100644
index 0000000..1e2ffdc
--- /dev/null
+++ b/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java
@@ -0,0 +1,92 @@
+/**
+ *
+ */
+package at.gv.egiz.pdfas.impl.input.helper;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import at.gv.egiz.pdfas.performance.PerformanceCounters;
+import at.gv.egiz.pdfas.framework.input.DataSource;
+import at.gv.egiz.pdfas.framework.input.PdfDataSource;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * @author wprinz
+ *
+ */
+public class DataSourceHelper
+{
+  /**
+   * The log.
+   */
+  private static final Log log = LogFactory.getLog(DataSourceHelper.class);
+
+  /**
+   * Converts a PdfDataSource to a byte array.
+   *
+   * <p>
+   * Note that this function is very memory intensive. Use the Streams wherever
+   * possible.
+   * </p>
+   *
+   * @deprecated
+   *
+   * @param pdfDataSource The data source to read completely.
+   * @return The bytes read from the data source.
+   * @throws RuntimeException If reading the data source fails with an IOException.
+   */
+  public static byte[] convertDataSourceToByteArray(DataSource pdfDataSource)
+  {
+    try
+    {
+      PerformanceCounters.byteArrays.increment();
+
+      byte[] data = new byte[pdfDataSource.getLength()];
+
+      int bytes_written = 0;
+
+      InputStream is = pdfDataSource.createInputStream();
+      int n = 0;
+      while ((n = is.read(data, bytes_written, data.length - bytes_written)) > 0)
+      {
+        bytes_written += n;
+      }
+      is.close();
+
+      assert bytes_written == data.length;
+
+      return data;
+    }
+    catch (IOException e)
+    {
+      log.error(e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  public static void debugDataSourceToFile(DataSource dataSource, File file)
+  {
+    try
+    {
+      InputStream is = dataSource.createInputStream();
+      FileOutputStream fos = new FileOutputStream(file);
+      byte[] data = new byte[2048];
+      int n = -1;
+      while ((n = is.read(data)) > 0)
+      {
+        fos.write(data, 0, n);
+      }
+      is.close();
+      fos.close();
+    }
+    catch (IOException e)
+    {
+      log.error(e);
+    }
+  }
+}
--
cgit v1.2.3