From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../impl/input/ByteArrayPdfDataSourceImpl.java | 85 +++++++ .../impl/input/CompoundPdfDataSourceImpl.java | 85 +++++++ .../pdfas/impl/input/DelimitedInputStream.java | 125 +++++++++ .../pdfas/impl/input/DelimitedPdfDataSource.java | 82 ++++++ .../at/gv/egiz/pdfas/impl/input/FileBased.java | 40 +++ .../impl/input/FileBasedPdfDataSourceImpl.java | 150 +++++++++++ .../impl/input/FileBasedTextDataSourceImpl.java | 160 ++++++++++++ .../pdfas/impl/input/IncrementalUpdateParser.java | 92 +++++++ .../egiz/pdfas/impl/input/TextDataSourceImpl.java | 120 +++++++++ .../impl/input/correction/ExternalCorrector.java | 283 +++++++++++++++++++++ .../impl/input/correction/InternalCorrector.java | 82 ++++++ .../pdfas/impl/input/helper/DataSourceHelper.java | 148 +++++++++++ 12 files changed, 1452 insertions(+) create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java create mode 100644 pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java (limited to 'pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input') diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java new file mode 100644 index 0000000..edcb1d4 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java @@ -0,0 +1,85 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; + +import at.gv.egiz.pdfas.performance.PerformanceCounters; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; + +/** + * Implements a PdfDataSource that holds the whole PDF document in a byte array. + * + *

+ * Note that holding the data in a byte array is very memory consuming for large + * documents. + *

+ * + * @author wprinz + */ +public class ByteArrayPdfDataSourceImpl implements PdfDataSource +{ + protected byte[] pdf = null; + + public ByteArrayPdfDataSourceImpl(byte[] pdf) + { + PerformanceCounters.byteArrays.increment(); + + this.pdf = pdf; + } + + public ByteArrayPdfDataSourceImpl(byte[] pdf, int length) + { + PerformanceCounters.byteArrays.increment(); + + if (pdf.length == length) + { + this.pdf = pdf; + } + else + { + this.pdf = new byte [length]; + System.arraycopy(pdf, 0, this.pdf, 0, length); + } + } + + + public InputStream createInputStream() + { + ByteArrayInputStream bais = new ByteArrayInputStream(this.pdf); + return bais; + } + + public int getLength() + { + return this.pdf.length; + } + + public byte[] getAsByteArray() + { + return this.pdf; + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java new file mode 100644 index 0000000..f5e9b76 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java @@ -0,0 +1,85 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.SequenceInputStream; + +import at.gv.egiz.pdfas.framework.input.DataSource; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; + +/** + * @author wprinz + * + */ +public class CompoundPdfDataSourceImpl implements PdfDataSource +{ + protected DataSource originalDataSource = null; + + protected byte[] appendix = null; + + public CompoundPdfDataSourceImpl (PdfDataSource original, byte [] appendix) + { + this.originalDataSource = original; + this.appendix = appendix; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() + */ + public InputStream createInputStream() + { + ByteArrayInputStream bais = new ByteArrayInputStream(this.appendix); + SequenceInputStream sis = new SequenceInputStream(this.originalDataSource.createInputStream(), bais); + return sis; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() + */ + public int getLength() + { + return this.originalDataSource.getLength() + this.appendix.length; + } + + byte [] cache = null; + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() + */ + public byte[] getAsByteArray() + { + if (cache != null) + { + return cache; + } + + cache = new byte [getLength()]; + System.arraycopy(originalDataSource.getAsByteArray(), 0, cache, 0, originalDataSource.getLength()); + System.arraycopy(appendix, 0, cache, originalDataSource.getLength(), appendix.length); + + return cache; + } +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java new file mode 100644 index 0000000..f10b546 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java @@ -0,0 +1,125 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.IOException; +import java.io.InputStream; + +/** + * An input stream that has a delimited length. + * + * @author wprinz + */ +public class DelimitedInputStream extends InputStream +{ + /** + * The underlying InputStream. + */ + protected InputStream is = null; + + /** + * The number of bytes that can be read from the stream. + */ + protected int bytes_to_read = -1; + + /** + * Constructs the DelimitedInputStream from which a maximum of length bytes + * can be read. + */ + public DelimitedInputStream(InputStream is, int length) + { + this.is = is; + this.bytes_to_read = length; + } + + /** + * @see java.io.InputStream#read() + */ + public int read() throws IOException + { + if (this.bytes_to_read <= 0) + { + return -1; + } + int read = this.is.read(); + if (read > 0) + { + this.bytes_to_read--; + } + return read; + } + + /** + * @see java.io.InputStream#read(byte[], int, int) + */ + public int read(byte[] b, int off, int len) throws IOException + { + int btr = Math.min(len, this.bytes_to_read); + int read = this.is.read(b, off, btr); + if (read > 0) + { + this.bytes_to_read -= read; + } + return read; + } + + /** + * @see java.io.InputStream#read(byte[]) + */ + public int read(byte[] b) throws IOException + { + return read(b, 0, b.length); + } + + /** + * @see java.io.InputStream#skip(long) + */ + public long skip(long n) throws IOException + { + long bts = Math.min(n, this.bytes_to_read); + long skipped = this.is.skip(bts); + if (skipped > 0) + { + this.bytes_to_read -= skipped; + } + return skipped; + } + + /** + * @see java.io.InputStream#close() + */ + public void close() throws IOException + { + this.is.close(); + } + + /** + * @see java.io.InputStream#available() + */ + public int available() throws IOException + { + int avail = this.is.available(); + return Math.min(this.bytes_to_read, avail); + } +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java new file mode 100644 index 0000000..ca73f37 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java @@ -0,0 +1,82 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.InputStream; + +import at.gv.egiz.pdfas.framework.input.PdfDataSource; + +/** + * @author wprinz + * + */ +public class DelimitedPdfDataSource implements PdfDataSource +{ + + protected PdfDataSource dataSource = null; + protected int len = -1; + + public DelimitedPdfDataSource (PdfDataSource original, int length) + { + this.dataSource = original; + this.len = length; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() + */ + public InputStream createInputStream() + { + InputStream originalIS = this.dataSource.createInputStream(); + DelimitedInputStream dis = new DelimitedInputStream(originalIS, this.len); + return dis; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() + */ + public int getLength() + { + return this.len; + } + + byte [] cache = null; + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() + */ + public byte[] getAsByteArray() + { + if (cache != null) + { + return cache; + } + + cache = new byte [getLength()]; + System.arraycopy(dataSource.getAsByteArray(), 0, cache, 0, getLength()); + + return cache; + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java new file mode 100644 index 0000000..65ee416 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java @@ -0,0 +1,40 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.File; + +/** + * Interface that reveals the underlying data file. + * + * @author wprinz + */ +public interface FileBased +{ + /** + * Returns the underlying data file. + * @return Returns the underlying data file. + */ + public File getFile(); +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java new file mode 100644 index 0000000..a710c3c --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java @@ -0,0 +1,150 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; + +/** + * @author wprinz + * + */ +public class FileBasedPdfDataSourceImpl implements PdfDataSource, FileBased +{ + /** + * The log. + */ + private static final Log log = LogFactory.getLog(FileBasedPdfDataSourceImpl.class); + + /** + * The underlying file. + */ + protected File inputFile = null; + + protected int length = -1; + + /** + * Constructor that creates this PdfDataSource backed by a file in the file + * system. + * + * @param file + * The input File. + * @param length + * The length of the InputStream. The is the maximum number of bytes + * that can be read from the stream. + * @throws IOException + * Thrown if the file cannot be read properly. + */ + public FileBasedPdfDataSourceImpl(File file, int length) throws IOException + { + + if (!file.exists()) + { + throw new FileNotFoundException("The file '" + file + "' does not exist."); + } + // for some reason the isFile is not always correct... + // if (file.isFile()) + // { + // throw new IOException("The file '" + file + "' is not a normal file."); + // } + if (!file.canRead()) + { + throw new IOException("The file '" + file + "' cannot be read."); + } + + this.inputFile = file; + this.length = length; + } + + /** + * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile() + */ + public File getFile() + { + return this.inputFile; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#createInputStream() + */ + public InputStream createInputStream() + { + if (cache == null) + { + getAsByteArray(); + } + return new ByteArrayInputStream(cache); + } + + protected InputStream createFileInputStream() + { + try + { + FileInputStream fis = new FileInputStream(getFile()); + DelimitedInputStream dis = new DelimitedInputStream(fis, getLength()); + return dis; + } + catch (IOException e) + { + log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.", e); + + return null; + } + } + /** + * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#getLength() + */ + public int getLength() + { + return this.length; + } + + byte [] cache = null; + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() + */ + public byte[] getAsByteArray() + { + if (cache != null) + { + return cache; + } + + cache = DataSourceHelper.convertInputStreamToByteArray(createFileInputStream()); + + return cache; + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java new file mode 100644 index 0000000..5a84ce2 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java @@ -0,0 +1,160 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import at.gv.egiz.pdfas.framework.input.TextDataSource; +import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; + +/** + * @author wprinz + * + */ +public class FileBasedTextDataSourceImpl implements TextDataSource, FileBased +{ + /** + * The log. + */ + private static final Log log = LogFactory.getLog(FileBasedTextDataSourceImpl.class); + + protected File file = null; + + protected String characterEncoding = null; + + public FileBasedTextDataSourceImpl(File file, String characterEncoding) throws IOException + { + if (!file.exists()) + { + throw new FileNotFoundException("The file '" + file + "' does not exist."); + } + if (!file.canRead()) + { + throw new IOException("The file '" + file + "' cannot be read."); + } + + this.file = file; + this.characterEncoding = characterEncoding; + } + + /** + * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile() + */ + public File getFile() + { + return this.file; + } + + /** + * Returns the character encoding. + * + * @return Returns the character encoding. + */ + public String getCharacterEncoding() + { + return this.characterEncoding; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText() + */ + public String getText() + { + try + { + InputStream is = createInputStream(); + byte[] data = new byte[getLength()]; + int read = 0; + int n = 0; + while ((n = is.read(data, read, data.length - read)) > 0) + { + read += n; + } + is.close(); + + String text = new String(data, getCharacterEncoding()); + + data = null; + + return text; + } + catch (IOException e) + { + log.error("Couldn't read text for file " + getFile() + ". Returning null.", e); + + return null; + } + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() + */ + public InputStream createInputStream() + { + try + { + FileInputStream fis = new FileInputStream(getFile()); + return fis; + } + catch (IOException e) + { + log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.", e); + + return null; + } + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() + */ + public int getLength() + { + return (int) getFile().length(); + } + + byte [] cache = null; + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() + */ + public byte[] getAsByteArray() + { + if (cache != null) + { + return cache; + } + + cache = DataSourceHelper.convertInputStreamToByteArray(createInputStream()); + + return cache; + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java new file mode 100644 index 0000000..c1dcc03 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java @@ -0,0 +1,92 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import at.gv.egiz.pdfas.exceptions.ErrorCode; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import at.gv.egiz.pdfas.framework.input.PdfDataSourceHolder; +import at.gv.egiz.pdfas.framework.input.correction.Corrector; +import at.gv.egiz.pdfas.framework.input.correction.CorrectorFactory; +import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; +import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; +import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; +import at.knowcenter.wag.exactparser.ParseDocument; + +/** + * Parses the given PDF document into a list of Incremental Update blocks. + * @author wprinz + */ +public class IncrementalUpdateParser +{ + /** + * The log. + */ + private static final Log log = LogFactory.getLog(IncrementalUpdateParser.class); + + public static List parsePdfIntoIUBlocks (PdfDataSourceHolder pdfDataSource) throws PDFDocumentException + { + log.trace("parsePdfIntoIUBlocks:"); + + List blocks = null; + try + { + byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(pdfDataSource.getDataSource()); + blocks = ParseDocument.parseDocument(pdf); + } + catch (Exception e) { + try { + log.debug("Error while parsing Document.", e); + boolean tryToCorrect = SettingsReader.getInstance().getSetting("correct_document_on_verify_if_necessary", "false").equals("true"); + if (tryToCorrect) { + log.info("Correcting document..."); + Corrector cor = CorrectorFactory.createCorrector(); + PdfDataSource correctedDS = cor.correctDocument(pdfDataSource.getDataSource()); + log.info("Correction finished."); + byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(correctedDS); + blocks = ParseDocument.parseDocument(pdf); + pdfDataSource.setDataSource(correctedDS); + } else { + makeError(e); + } + + } catch (Exception e1) { + makeError(e); + } + } + + log.trace("parsePdfIntoIUBlocks finished."); + return blocks; + } + + private static void makeError(Exception e) throws PDFDocumentException { + log.error("Error while parsing Document into IU blocks.", e); + throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java new file mode 100644 index 0000000..fa5ab04 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java @@ -0,0 +1,120 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; + +import at.gv.egiz.pdfas.framework.input.TextDataSource; + +/** + * A TextDataSource that keeps the text in memory. + * + *

+ * Keeping the text in memory is fast as long as the text is short, but may + * result in bad memory performance when the text is longer. Use a FileBased + * TextDataSource instead if memory is an issue. + *

+ * + * @author wprinz + */ +public class TextDataSourceImpl implements TextDataSource +{ + /** + * The text. + */ + protected String text = null; + + private final static String CHARSET = "UTF-8"; + + /** + * Constructor that sets the text. + * + * @param text + * The text. + */ + public TextDataSourceImpl(String text) + { + this.text = text; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText() + */ + public String getText() + { + return this.text; + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() + */ + public InputStream createInputStream() + { + try + { + byte[] data = getText().getBytes(CHARSET); + // PERF: if memory is an issue (e.g. in web), use a FileBased TextDataSource instead. + return new ByteArrayInputStream(data); + } + catch (UnsupportedEncodingException e) + { + throw new RuntimeException(e); + } + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() + */ + public int getLength() + { + try + { + byte[] data = getText().getBytes(CHARSET); + return data.length; + } + catch (UnsupportedEncodingException e) + { + throw new RuntimeException(e); + } + } + + /** + * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() + */ + public byte[] getAsByteArray() + { + try + { + byte[] data = getText().getBytes(CHARSET); + return data; + } + catch (UnsupportedEncodingException e) + { + throw new RuntimeException(e); + } + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java new file mode 100644 index 0000000..efd094a --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java @@ -0,0 +1,283 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input.correction; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import at.gv.egiz.pdfas.exceptions.ErrorCode; +import at.gv.egiz.pdfas.exceptions.framework.CorrectorException; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import at.gv.egiz.pdfas.framework.input.correction.Corrector; +import at.gv.egiz.pdfas.impl.input.FileBased; +import at.gv.egiz.pdfas.impl.input.FileBasedPdfDataSourceImpl; +import at.gv.egiz.pdfas.utils.TempDirHelper; +import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; +import at.knowcenter.wag.egov.egiz.exceptions.SettingNotFoundException; +import at.knowcenter.wag.egov.egiz.exceptions.SettingsException; + +/** + * Corrects the document using an extrenal commandline tool. + * + *

+ * Process.destroy after a certain timeout does not work if the executable is a + * Windows batch file. + *

+ * + * @author wprinz + */ +public class ExternalCorrector implements Corrector +{ + public static final String INPUT_DOCUMENT_REPLACE = "##input_document##"; + + public static final String OUTPUT_DOCUMENT_REPLACE = "##output_document##"; + + public static final String COMMANDLINE_KEY = "external_corrector_commandline"; + + public static final String TIMEOUT_KEY = "external_corrector_timeout"; + + protected static final int DEFAULT_TIMEOUT = 1000; + + /** + * The log. + */ + private static final Log log = LogFactory.getLog(ExternalCorrector.class); + + /** + * @see at.gv.egiz.pdfas.framework.input.correction.Corrector#correctDocument(at.gv.egiz.pdfas.framework.input.PdfDataSource) + */ + public PdfDataSource correctDocument(PdfDataSource document) throws CorrectorException + { + + try + { + String outName = null; + File in = null; + if (document instanceof FileBased) + { + FileBased fb = (FileBased) document; + in = fb.getFile(); + outName = in.getName() + "_correction_outfile.pdf"; + } + else + { + in = TempDirHelper.placeInputIntoTempDirFile(document.createInputStream(), "correction_infile.pdf"); + outName = "correction_outfile.pdf"; + } + + File out = TempDirHelper.formTempFile(outName); + + String commandline = SettingsReader.getInstance().getSetting(COMMANDLINE_KEY); + long timeout = SettingsReader.getInstance().getIntSetting(TIMEOUT_KEY, DEFAULT_TIMEOUT); + + String inF = in.getAbsolutePath(); + commandline = commandline.replaceFirst(INPUT_DOCUMENT_REPLACE, inF.replaceAll("\\\\", "\\\\\\\\")); + String outF = out.getAbsolutePath(); + commandline = commandline.replaceFirst(OUTPUT_DOCUMENT_REPLACE, outF.replaceAll("\\\\", "\\\\\\\\")); + + log.info(commandline); + + Process p = Runtime.getRuntime().exec(commandline); + + Thread outT = null; + Thread errT = null; + TimeoutThread tt = null; + BufferedReader outReader = null; + BufferedReader errReader = null; + + try + { + outReader = new BufferedReader(new InputStreamReader(p.getInputStream())); + errReader = new BufferedReader(new InputStreamReader(p.getErrorStream())); + + outT = new Thread(new ReaderPrinter(outReader, "STDOUT")); + errT = new Thread(new ReaderPrinter(errReader, "STDERR")); + + tt = new TimeoutThread(p, timeout, new Thread[] { outT, errT }); + + tt.start(); + outT.start(); + errT.start(); + + log.trace("Joining the STDOUT thread..."); + outT.join(); + log.trace("STDOUT thread joined."); + log.trace("Joining the STDERR thread..."); + errT.join(); + log.trace("STDERR thread joined."); + + log.trace("Waiting for process to end..."); + p.waitFor(); + log.trace("process has ended."); + + log.trace("Interrupting timeout thread..."); + tt.interrupt(); + log.trace("timeout thread has been interrupted."); + + int exitValue = p.exitValue(); + log.info("External Corrector exited with: " + exitValue); + + if (tt.isTimedOut()) + { + throw new CorrectorException(ErrorCode.EXTERNAL_CORRECTOR_TIMEOUT_REACHED, "The external corrector process timed out. timeout = " + timeout); + } + + PdfDataSource ds = new FileBasedPdfDataSourceImpl(out, (int) out.length()); + return ds; + } + finally + { + if (outT != null) + { + outT.interrupt(); + } + if (errT != null) + { + errT.interrupt(); + } + if (tt != null) + { + tt.interrupt(); + } + if (outReader != null) + { + outReader.close(); + } + if (errReader != null) + { + errReader.close(); + } + } + + } + catch (IOException e) + { + throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); + } + catch (InterruptedException e) + { + throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); + } + catch (SettingNotFoundException e) + { + throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); + } + catch (SettingsException e) + { + throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); + } + } + + protected static class ReaderPrinter implements Runnable + { + protected BufferedReader reader = null; + + protected String streamName = null; + + public ReaderPrinter(BufferedReader reader, String streamName) + { + this.reader = reader; + this.streamName = streamName; + } + + public void run() + { + try + { + String line = null; + + while ((line = this.reader.readLine()) != null) + { + if (line != null) + { + log.info(streamName + ": " + line); + } + } + } + catch (IOException e) + { + log.error(e.getMessage(), e); + } + } + } + + protected static class TimeoutThread extends Thread + { + protected Process proc = null; + + protected long timeout = -1; + + protected boolean ranIntoTimeout = false; + + protected Thread[] threads; + + protected BufferedReader errReader; + + public TimeoutThread(Process proc, long timeout, Thread[] threadsToInterrupt) + { + this.proc = proc; + this.timeout = timeout; + this.threads = threadsToInterrupt; + } + + public void run() + { + try + { + Thread.sleep(this.timeout); + log.info("The timeout was reached. Destroying the process."); + proc.destroy(); + ranIntoTimeout = true; + log.trace("destroy has been called."); + log.trace("Interrupting threads..."); + for (int i = 0; i < this.threads.length; i++) + { + this.threads[i].interrupt(); + } + log.trace("threads have been interrupted."); + } + catch (InterruptedException e) + { + log.debug("Timeout thread interrupted. This means that the process finished successfully."); + } + } + + /** + * Tells, if the process ran into the timeout. + * + * @return Returns true if the timeout was reached. Returns false if the + * timeout was not reached. + */ + public boolean isTimedOut() + { + return this.ranIntoTimeout; + } + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java new file mode 100644 index 0000000..eaa6b7f --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java @@ -0,0 +1,82 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input.correction; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import at.gv.egiz.pdfas.exceptions.ErrorCode; +import at.gv.egiz.pdfas.exceptions.framework.CorrectorException; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; +import at.gv.egiz.pdfas.framework.input.correction.Corrector; +import at.gv.egiz.pdfas.impl.input.ByteArrayPdfDataSourceImpl; +import at.gv.egiz.pdfas.utils.PDFASUtils; +import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; + +import com.lowagie.text.DocumentException; +import com.lowagie.text.pdf.PdfReader; +import com.lowagie.text.pdf.PdfStamper; + +/** + * Corrects a document using iText. + * + * @author wprinz + */ +public class InternalCorrector implements Corrector +{ + + /** + * @see at.gv.egiz.pdfas.framework.input.correction.Corrector#correctDocument(at.gv.egiz.pdfas.framework.input.PdfDataSource) + */ + public PdfDataSource correctDocument(PdfDataSource document) throws CorrectorException + { + try + { + byte[] pdf = document.getAsByteArray(); + PdfReader reader = new PdfReader(pdf); + PDFASUtils.checkReaderPermissions(reader); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(pdf.length); + + PdfStamper stamper = new PdfStamper(reader, baos, '\0', false); + stamper.close(); + + baos.close(); + byte[] corrected_pdf = baos.toByteArray(); + + return new ByteArrayPdfDataSourceImpl(corrected_pdf); + } + catch (DocumentException e) + { + throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); + } + catch (IOException e) + { + throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); + } catch (PDFDocumentException e) { + throw new CorrectorException(e.getErrorCode(), e); + } + } + +} diff --git a/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java new file mode 100644 index 0000000..76a5f99 --- /dev/null +++ b/pdf-as-lib/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java @@ -0,0 +1,148 @@ +/** + * Copyright 2006 by Know-Center, Graz, Austria + * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a + * joint initiative of the Federal Chancellery Austria and Graz University of + * Technology. + * + * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by + * the European Commission - subsequent versions of the EUPL (the "Licence"); + * You may not use this work except in compliance with the Licence. + * You may obtain a copy of the Licence at: + * http://www.osor.eu/eupl/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the Licence is distributed on an "AS IS" basis, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Licence for the specific language governing permissions and + * limitations under the Licence. + * + * This product combines work with different licenses. See the "NOTICE" text + * file for details on the various modules and licenses. + * The "NOTICE" text file is part of the distribution. Any derivative works + * that you distribute must include a readable copy of the "NOTICE" text file. + */ +package at.gv.egiz.pdfas.impl.input.helper; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; + +import at.gv.egiz.pdfas.performance.PerformanceCounters; +import at.gv.egiz.pdfas.framework.input.DataSource; +import at.gv.egiz.pdfas.framework.input.PdfDataSource; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * @author wprinz + * + */ +public class DataSourceHelper +{ + /** + * The log. + */ + private static final Log log = LogFactory.getLog(DataSourceHelper.class); + + /** + * Converts a PdfDataSource to a byte array. + * + *

+ * Note that this function is very memory intensive. Use the Streams whereever + * possible. + *

+ * + * @deprecated + * + * @param pdfDataSource + * @return + * @throws IOException + */ + public static byte[] convertDataSourceToByteArray(DataSource pdfDataSource) + { + return pdfDataSource.getAsByteArray(); +// try +// { +// PerformanceCounters.byteArrays.increment(); +// +// byte[] data = new byte[pdfDataSource.getLength()]; +// +// int bytes_written = 0; +// +// InputStream is = pdfDataSource.createInputStream(); +// int n = 0; +// while ((n = is.read(data, bytes_written, data.length - bytes_written)) > 0) +// { +// bytes_written += n; +// } +// is.close(); +// +// assert bytes_written == data.length; +// +// return data; +// } +// catch (IOException e) +// { +// log.error(e); +// throw new RuntimeException(e); +// } + } + + public static byte [] convertInputStreamToByteArray(InputStream inputStream) + { + try + { + return convertInputStreamToByteArrayIOEx(inputStream); + } + catch (IOException e) + { + log.error(e); + throw new RuntimeException(e); + } + } + + public static byte [] convertInputStreamToByteArrayIOEx(InputStream inputStream) throws IOException + { + PerformanceCounters.byteArrays.increment(); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(4096); + + byte[] temp = new byte[4096]; + + int n = 0; + while ((n = inputStream.read(temp)) > 0) + { + baos.write(temp, 0, n); + } + inputStream.close(); + + baos.close(); + byte [] data = baos.toByteArray(); + + return data; + } + + public static void debugDataSourceToFile(DataSource dataSource, File file) + { + try + { + InputStream is = dataSource.createInputStream(); + FileOutputStream fos = new FileOutputStream(file); + byte[] data = new byte[2048]; + int n = -1; + while ((n = is.read(data)) > 0) + { + fos.write(data, 0, n); + } + is.close(); + fos.close(); + } + catch (IOException e) + { + log.error(e); + } + } +} -- cgit v1.2.3