From 535a04fa05f739ec16dd81666e3b0f82dfbd442d Mon Sep 17 00:00:00 2001 From: tknall Date: Wed, 9 Jan 2013 15:41:29 +0000 Subject: pdf-as-lib maven project files moved to pdf-as-lib git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/pdf-as/trunk@926 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../impl/input/ByteArrayPdfDataSourceImpl.java | 85 ------- .../impl/input/CompoundPdfDataSourceImpl.java | 85 ------- .../pdfas/impl/input/DelimitedInputStream.java | 125 --------- .../pdfas/impl/input/DelimitedPdfDataSource.java | 82 ------ .../at/gv/egiz/pdfas/impl/input/FileBased.java | 40 --- .../impl/input/FileBasedPdfDataSourceImpl.java | 150 ----------- .../impl/input/FileBasedTextDataSourceImpl.java | 160 ------------ .../pdfas/impl/input/IncrementalUpdateParser.java | 92 ------- .../egiz/pdfas/impl/input/TextDataSourceImpl.java | 120 --------- .../impl/input/correction/ExternalCorrector.java | 283 --------------------- .../impl/input/correction/InternalCorrector.java | 82 ------ .../pdfas/impl/input/helper/DataSourceHelper.java | 148 ----------- 12 files changed, 1452 deletions(-) delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java delete mode 100644 src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java (limited to 'src/main/java/at/gv/egiz/pdfas/impl/input') diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java deleted file mode 100644 index edcb1d4..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/ByteArrayPdfDataSourceImpl.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.ByteArrayInputStream; -import java.io.InputStream; - -import at.gv.egiz.pdfas.performance.PerformanceCounters; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; - -/** - * Implements a PdfDataSource that holds the whole PDF document in a byte array. - * - *

- * Note that holding the data in a byte array is very memory consuming for large - * documents. - *

- * - * @author wprinz - */ -public class ByteArrayPdfDataSourceImpl implements PdfDataSource -{ - protected byte[] pdf = null; - - public ByteArrayPdfDataSourceImpl(byte[] pdf) - { - PerformanceCounters.byteArrays.increment(); - - this.pdf = pdf; - } - - public ByteArrayPdfDataSourceImpl(byte[] pdf, int length) - { - PerformanceCounters.byteArrays.increment(); - - if (pdf.length == length) - { - this.pdf = pdf; - } - else - { - this.pdf = new byte [length]; - System.arraycopy(pdf, 0, this.pdf, 0, length); - } - } - - - public InputStream createInputStream() - { - ByteArrayInputStream bais = new ByteArrayInputStream(this.pdf); - return bais; - } - - public int getLength() - { - return this.pdf.length; - } - - public byte[] getAsByteArray() - { - return this.pdf; - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java deleted file mode 100644 index f5e9b76..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/CompoundPdfDataSourceImpl.java +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.io.SequenceInputStream; - -import at.gv.egiz.pdfas.framework.input.DataSource; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; - -/** - * @author wprinz - * - */ -public class CompoundPdfDataSourceImpl implements PdfDataSource -{ - protected DataSource originalDataSource = null; - - protected byte[] appendix = null; - - public CompoundPdfDataSourceImpl (PdfDataSource original, byte [] appendix) - { - this.originalDataSource = original; - this.appendix = appendix; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() - */ - public InputStream createInputStream() - { - ByteArrayInputStream bais = new ByteArrayInputStream(this.appendix); - SequenceInputStream sis = new SequenceInputStream(this.originalDataSource.createInputStream(), bais); - return sis; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() - */ - public int getLength() - { - return this.originalDataSource.getLength() + this.appendix.length; - } - - byte [] cache = null; - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() - */ - public byte[] getAsByteArray() - { - if (cache != null) - { - return cache; - } - - cache = new byte [getLength()]; - System.arraycopy(originalDataSource.getAsByteArray(), 0, cache, 0, originalDataSource.getLength()); - System.arraycopy(appendix, 0, cache, originalDataSource.getLength(), appendix.length); - - return cache; - } -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java deleted file mode 100644 index f10b546..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedInputStream.java +++ /dev/null @@ -1,125 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.IOException; -import java.io.InputStream; - -/** - * An input stream that has a delimited length. - * - * @author wprinz - */ -public class DelimitedInputStream extends InputStream -{ - /** - * The underlying InputStream. - */ - protected InputStream is = null; - - /** - * The number of bytes that can be read from the stream. - */ - protected int bytes_to_read = -1; - - /** - * Constructs the DelimitedInputStream from which a maximum of length bytes - * can be read. - */ - public DelimitedInputStream(InputStream is, int length) - { - this.is = is; - this.bytes_to_read = length; - } - - /** - * @see java.io.InputStream#read() - */ - public int read() throws IOException - { - if (this.bytes_to_read <= 0) - { - return -1; - } - int read = this.is.read(); - if (read > 0) - { - this.bytes_to_read--; - } - return read; - } - - /** - * @see java.io.InputStream#read(byte[], int, int) - */ - public int read(byte[] b, int off, int len) throws IOException - { - int btr = Math.min(len, this.bytes_to_read); - int read = this.is.read(b, off, btr); - if (read > 0) - { - this.bytes_to_read -= read; - } - return read; - } - - /** - * @see java.io.InputStream#read(byte[]) - */ - public int read(byte[] b) throws IOException - { - return read(b, 0, b.length); - } - - /** - * @see java.io.InputStream#skip(long) - */ - public long skip(long n) throws IOException - { - long bts = Math.min(n, this.bytes_to_read); - long skipped = this.is.skip(bts); - if (skipped > 0) - { - this.bytes_to_read -= skipped; - } - return skipped; - } - - /** - * @see java.io.InputStream#close() - */ - public void close() throws IOException - { - this.is.close(); - } - - /** - * @see java.io.InputStream#available() - */ - public int available() throws IOException - { - int avail = this.is.available(); - return Math.min(this.bytes_to_read, avail); - } -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java b/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java deleted file mode 100644 index ca73f37..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/DelimitedPdfDataSource.java +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.InputStream; - -import at.gv.egiz.pdfas.framework.input.PdfDataSource; - -/** - * @author wprinz - * - */ -public class DelimitedPdfDataSource implements PdfDataSource -{ - - protected PdfDataSource dataSource = null; - protected int len = -1; - - public DelimitedPdfDataSource (PdfDataSource original, int length) - { - this.dataSource = original; - this.len = length; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() - */ - public InputStream createInputStream() - { - InputStream originalIS = this.dataSource.createInputStream(); - DelimitedInputStream dis = new DelimitedInputStream(originalIS, this.len); - return dis; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() - */ - public int getLength() - { - return this.len; - } - - byte [] cache = null; - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() - */ - public byte[] getAsByteArray() - { - if (cache != null) - { - return cache; - } - - cache = new byte [getLength()]; - System.arraycopy(dataSource.getAsByteArray(), 0, cache, 0, getLength()); - - return cache; - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java deleted file mode 100644 index 65ee416..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBased.java +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.File; - -/** - * Interface that reveals the underlying data file. - * - * @author wprinz - */ -public interface FileBased -{ - /** - * Returns the underlying data file. - * @return Returns the underlying data file. - */ - public File getFile(); -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java deleted file mode 100644 index a710c3c..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedPdfDataSourceImpl.java +++ /dev/null @@ -1,150 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import at.gv.egiz.pdfas.framework.input.PdfDataSource; -import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; - -/** - * @author wprinz - * - */ -public class FileBasedPdfDataSourceImpl implements PdfDataSource, FileBased -{ - /** - * The log. - */ - private static final Log log = LogFactory.getLog(FileBasedPdfDataSourceImpl.class); - - /** - * The underlying file. - */ - protected File inputFile = null; - - protected int length = -1; - - /** - * Constructor that creates this PdfDataSource backed by a file in the file - * system. - * - * @param file - * The input File. - * @param length - * The length of the InputStream. The is the maximum number of bytes - * that can be read from the stream. - * @throws IOException - * Thrown if the file cannot be read properly. - */ - public FileBasedPdfDataSourceImpl(File file, int length) throws IOException - { - - if (!file.exists()) - { - throw new FileNotFoundException("The file '" + file + "' does not exist."); - } - // for some reason the isFile is not always correct... - // if (file.isFile()) - // { - // throw new IOException("The file '" + file + "' is not a normal file."); - // } - if (!file.canRead()) - { - throw new IOException("The file '" + file + "' cannot be read."); - } - - this.inputFile = file; - this.length = length; - } - - /** - * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile() - */ - public File getFile() - { - return this.inputFile; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#createInputStream() - */ - public InputStream createInputStream() - { - if (cache == null) - { - getAsByteArray(); - } - return new ByteArrayInputStream(cache); - } - - protected InputStream createFileInputStream() - { - try - { - FileInputStream fis = new FileInputStream(getFile()); - DelimitedInputStream dis = new DelimitedInputStream(fis, getLength()); - return dis; - } - catch (IOException e) - { - log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.", e); - - return null; - } - } - /** - * @see at.gv.egiz.pdfas.framework.input.PdfDataSource#getLength() - */ - public int getLength() - { - return this.length; - } - - byte [] cache = null; - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() - */ - public byte[] getAsByteArray() - { - if (cache != null) - { - return cache; - } - - cache = DataSourceHelper.convertInputStreamToByteArray(createFileInputStream()); - - return cache; - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java deleted file mode 100644 index 5a84ce2..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/FileBasedTextDataSourceImpl.java +++ /dev/null @@ -1,160 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import at.gv.egiz.pdfas.framework.input.TextDataSource; -import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; - -/** - * @author wprinz - * - */ -public class FileBasedTextDataSourceImpl implements TextDataSource, FileBased -{ - /** - * The log. - */ - private static final Log log = LogFactory.getLog(FileBasedTextDataSourceImpl.class); - - protected File file = null; - - protected String characterEncoding = null; - - public FileBasedTextDataSourceImpl(File file, String characterEncoding) throws IOException - { - if (!file.exists()) - { - throw new FileNotFoundException("The file '" + file + "' does not exist."); - } - if (!file.canRead()) - { - throw new IOException("The file '" + file + "' cannot be read."); - } - - this.file = file; - this.characterEncoding = characterEncoding; - } - - /** - * @see at.gv.egiz.pdfas.impl.input.FileBased#getFile() - */ - public File getFile() - { - return this.file; - } - - /** - * Returns the character encoding. - * - * @return Returns the character encoding. - */ - public String getCharacterEncoding() - { - return this.characterEncoding; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText() - */ - public String getText() - { - try - { - InputStream is = createInputStream(); - byte[] data = new byte[getLength()]; - int read = 0; - int n = 0; - while ((n = is.read(data, read, data.length - read)) > 0) - { - read += n; - } - is.close(); - - String text = new String(data, getCharacterEncoding()); - - data = null; - - return text; - } - catch (IOException e) - { - log.error("Couldn't read text for file " + getFile() + ". Returning null.", e); - - return null; - } - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() - */ - public InputStream createInputStream() - { - try - { - FileInputStream fis = new FileInputStream(getFile()); - return fis; - } - catch (IOException e) - { - log.error("Couldn't create InputStream for file " + getFile() + ". Returning null.", e); - - return null; - } - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() - */ - public int getLength() - { - return (int) getFile().length(); - } - - byte [] cache = null; - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() - */ - public byte[] getAsByteArray() - { - if (cache != null) - { - return cache; - } - - cache = DataSourceHelper.convertInputStreamToByteArray(createInputStream()); - - return cache; - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java b/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java deleted file mode 100644 index c1dcc03..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/IncrementalUpdateParser.java +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import at.gv.egiz.pdfas.exceptions.ErrorCode; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; -import at.gv.egiz.pdfas.framework.input.PdfDataSourceHolder; -import at.gv.egiz.pdfas.framework.input.correction.Corrector; -import at.gv.egiz.pdfas.framework.input.correction.CorrectorFactory; -import at.gv.egiz.pdfas.impl.input.helper.DataSourceHelper; -import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; -import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; -import at.knowcenter.wag.exactparser.ParseDocument; - -/** - * Parses the given PDF document into a list of Incremental Update blocks. - * @author wprinz - */ -public class IncrementalUpdateParser -{ - /** - * The log. - */ - private static final Log log = LogFactory.getLog(IncrementalUpdateParser.class); - - public static List parsePdfIntoIUBlocks (PdfDataSourceHolder pdfDataSource) throws PDFDocumentException - { - log.trace("parsePdfIntoIUBlocks:"); - - List blocks = null; - try - { - byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(pdfDataSource.getDataSource()); - blocks = ParseDocument.parseDocument(pdf); - } - catch (Exception e) { - try { - log.debug("Error while parsing Document.", e); - boolean tryToCorrect = SettingsReader.getInstance().getSetting("correct_document_on_verify_if_necessary", "false").equals("true"); - if (tryToCorrect) { - log.info("Correcting document..."); - Corrector cor = CorrectorFactory.createCorrector(); - PdfDataSource correctedDS = cor.correctDocument(pdfDataSource.getDataSource()); - log.info("Correction finished."); - byte [] pdf = DataSourceHelper.convertDataSourceToByteArray(correctedDS); - blocks = ParseDocument.parseDocument(pdf); - pdfDataSource.setDataSource(correctedDS); - } else { - makeError(e); - } - - } catch (Exception e1) { - makeError(e); - } - } - - log.trace("parsePdfIntoIUBlocks finished."); - return blocks; - } - - private static void makeError(Exception e) throws PDFDocumentException { - log.error("Error while parsing Document into IU blocks.", e); - throw new PDFDocumentException(ErrorCode.DOCUMENT_CANNOT_BE_READ, e); - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java b/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java deleted file mode 100644 index fa5ab04..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/TextDataSourceImpl.java +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input; - -import java.io.ByteArrayInputStream; -import java.io.InputStream; -import java.io.UnsupportedEncodingException; - -import at.gv.egiz.pdfas.framework.input.TextDataSource; - -/** - * A TextDataSource that keeps the text in memory. - * - *

- * Keeping the text in memory is fast as long as the text is short, but may - * result in bad memory performance when the text is longer. Use a FileBased - * TextDataSource instead if memory is an issue. - *

- * - * @author wprinz - */ -public class TextDataSourceImpl implements TextDataSource -{ - /** - * The text. - */ - protected String text = null; - - private final static String CHARSET = "UTF-8"; - - /** - * Constructor that sets the text. - * - * @param text - * The text. - */ - public TextDataSourceImpl(String text) - { - this.text = text; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.TextDataSource#getText() - */ - public String getText() - { - return this.text; - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#createInputStream() - */ - public InputStream createInputStream() - { - try - { - byte[] data = getText().getBytes(CHARSET); - // PERF: if memory is an issue (e.g. in web), use a FileBased TextDataSource instead. - return new ByteArrayInputStream(data); - } - catch (UnsupportedEncodingException e) - { - throw new RuntimeException(e); - } - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getLength() - */ - public int getLength() - { - try - { - byte[] data = getText().getBytes(CHARSET); - return data.length; - } - catch (UnsupportedEncodingException e) - { - throw new RuntimeException(e); - } - } - - /** - * @see at.gv.egiz.pdfas.framework.input.DataSource#getAsByteArray() - */ - public byte[] getAsByteArray() - { - try - { - byte[] data = getText().getBytes(CHARSET); - return data; - } - catch (UnsupportedEncodingException e) - { - throw new RuntimeException(e); - } - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java b/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java deleted file mode 100644 index efd094a..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/correction/ExternalCorrector.java +++ /dev/null @@ -1,283 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input.correction; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import at.gv.egiz.pdfas.exceptions.ErrorCode; -import at.gv.egiz.pdfas.exceptions.framework.CorrectorException; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; -import at.gv.egiz.pdfas.framework.input.correction.Corrector; -import at.gv.egiz.pdfas.impl.input.FileBased; -import at.gv.egiz.pdfas.impl.input.FileBasedPdfDataSourceImpl; -import at.gv.egiz.pdfas.utils.TempDirHelper; -import at.knowcenter.wag.egov.egiz.cfg.SettingsReader; -import at.knowcenter.wag.egov.egiz.exceptions.SettingNotFoundException; -import at.knowcenter.wag.egov.egiz.exceptions.SettingsException; - -/** - * Corrects the document using an extrenal commandline tool. - * - *

- * Process.destroy after a certain timeout does not work if the executable is a - * Windows batch file. - *

- * - * @author wprinz - */ -public class ExternalCorrector implements Corrector -{ - public static final String INPUT_DOCUMENT_REPLACE = "##input_document##"; - - public static final String OUTPUT_DOCUMENT_REPLACE = "##output_document##"; - - public static final String COMMANDLINE_KEY = "external_corrector_commandline"; - - public static final String TIMEOUT_KEY = "external_corrector_timeout"; - - protected static final int DEFAULT_TIMEOUT = 1000; - - /** - * The log. - */ - private static final Log log = LogFactory.getLog(ExternalCorrector.class); - - /** - * @see at.gv.egiz.pdfas.framework.input.correction.Corrector#correctDocument(at.gv.egiz.pdfas.framework.input.PdfDataSource) - */ - public PdfDataSource correctDocument(PdfDataSource document) throws CorrectorException - { - - try - { - String outName = null; - File in = null; - if (document instanceof FileBased) - { - FileBased fb = (FileBased) document; - in = fb.getFile(); - outName = in.getName() + "_correction_outfile.pdf"; - } - else - { - in = TempDirHelper.placeInputIntoTempDirFile(document.createInputStream(), "correction_infile.pdf"); - outName = "correction_outfile.pdf"; - } - - File out = TempDirHelper.formTempFile(outName); - - String commandline = SettingsReader.getInstance().getSetting(COMMANDLINE_KEY); - long timeout = SettingsReader.getInstance().getIntSetting(TIMEOUT_KEY, DEFAULT_TIMEOUT); - - String inF = in.getAbsolutePath(); - commandline = commandline.replaceFirst(INPUT_DOCUMENT_REPLACE, inF.replaceAll("\\\\", "\\\\\\\\")); - String outF = out.getAbsolutePath(); - commandline = commandline.replaceFirst(OUTPUT_DOCUMENT_REPLACE, outF.replaceAll("\\\\", "\\\\\\\\")); - - log.info(commandline); - - Process p = Runtime.getRuntime().exec(commandline); - - Thread outT = null; - Thread errT = null; - TimeoutThread tt = null; - BufferedReader outReader = null; - BufferedReader errReader = null; - - try - { - outReader = new BufferedReader(new InputStreamReader(p.getInputStream())); - errReader = new BufferedReader(new InputStreamReader(p.getErrorStream())); - - outT = new Thread(new ReaderPrinter(outReader, "STDOUT")); - errT = new Thread(new ReaderPrinter(errReader, "STDERR")); - - tt = new TimeoutThread(p, timeout, new Thread[] { outT, errT }); - - tt.start(); - outT.start(); - errT.start(); - - log.trace("Joining the STDOUT thread..."); - outT.join(); - log.trace("STDOUT thread joined."); - log.trace("Joining the STDERR thread..."); - errT.join(); - log.trace("STDERR thread joined."); - - log.trace("Waiting for process to end..."); - p.waitFor(); - log.trace("process has ended."); - - log.trace("Interrupting timeout thread..."); - tt.interrupt(); - log.trace("timeout thread has been interrupted."); - - int exitValue = p.exitValue(); - log.info("External Corrector exited with: " + exitValue); - - if (tt.isTimedOut()) - { - throw new CorrectorException(ErrorCode.EXTERNAL_CORRECTOR_TIMEOUT_REACHED, "The external corrector process timed out. timeout = " + timeout); - } - - PdfDataSource ds = new FileBasedPdfDataSourceImpl(out, (int) out.length()); - return ds; - } - finally - { - if (outT != null) - { - outT.interrupt(); - } - if (errT != null) - { - errT.interrupt(); - } - if (tt != null) - { - tt.interrupt(); - } - if (outReader != null) - { - outReader.close(); - } - if (errReader != null) - { - errReader.close(); - } - } - - } - catch (IOException e) - { - throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); - } - catch (InterruptedException e) - { - throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); - } - catch (SettingNotFoundException e) - { - throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); - } - catch (SettingsException e) - { - throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); - } - } - - protected static class ReaderPrinter implements Runnable - { - protected BufferedReader reader = null; - - protected String streamName = null; - - public ReaderPrinter(BufferedReader reader, String streamName) - { - this.reader = reader; - this.streamName = streamName; - } - - public void run() - { - try - { - String line = null; - - while ((line = this.reader.readLine()) != null) - { - if (line != null) - { - log.info(streamName + ": " + line); - } - } - } - catch (IOException e) - { - log.error(e.getMessage(), e); - } - } - } - - protected static class TimeoutThread extends Thread - { - protected Process proc = null; - - protected long timeout = -1; - - protected boolean ranIntoTimeout = false; - - protected Thread[] threads; - - protected BufferedReader errReader; - - public TimeoutThread(Process proc, long timeout, Thread[] threadsToInterrupt) - { - this.proc = proc; - this.timeout = timeout; - this.threads = threadsToInterrupt; - } - - public void run() - { - try - { - Thread.sleep(this.timeout); - log.info("The timeout was reached. Destroying the process."); - proc.destroy(); - ranIntoTimeout = true; - log.trace("destroy has been called."); - log.trace("Interrupting threads..."); - for (int i = 0; i < this.threads.length; i++) - { - this.threads[i].interrupt(); - } - log.trace("threads have been interrupted."); - } - catch (InterruptedException e) - { - log.debug("Timeout thread interrupted. This means that the process finished successfully."); - } - } - - /** - * Tells, if the process ran into the timeout. - * - * @return Returns true if the timeout was reached. Returns false if the - * timeout was not reached. - */ - public boolean isTimedOut() - { - return this.ranIntoTimeout; - } - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java b/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java deleted file mode 100644 index eaa6b7f..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/correction/InternalCorrector.java +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input.correction; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -import at.gv.egiz.pdfas.exceptions.ErrorCode; -import at.gv.egiz.pdfas.exceptions.framework.CorrectorException; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; -import at.gv.egiz.pdfas.framework.input.correction.Corrector; -import at.gv.egiz.pdfas.impl.input.ByteArrayPdfDataSourceImpl; -import at.gv.egiz.pdfas.utils.PDFASUtils; -import at.knowcenter.wag.egov.egiz.exceptions.PDFDocumentException; - -import com.lowagie.text.DocumentException; -import com.lowagie.text.pdf.PdfReader; -import com.lowagie.text.pdf.PdfStamper; - -/** - * Corrects a document using iText. - * - * @author wprinz - */ -public class InternalCorrector implements Corrector -{ - - /** - * @see at.gv.egiz.pdfas.framework.input.correction.Corrector#correctDocument(at.gv.egiz.pdfas.framework.input.PdfDataSource) - */ - public PdfDataSource correctDocument(PdfDataSource document) throws CorrectorException - { - try - { - byte[] pdf = document.getAsByteArray(); - PdfReader reader = new PdfReader(pdf); - PDFASUtils.checkReaderPermissions(reader); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(pdf.length); - - PdfStamper stamper = new PdfStamper(reader, baos, '\0', false); - stamper.close(); - - baos.close(); - byte[] corrected_pdf = baos.toByteArray(); - - return new ByteArrayPdfDataSourceImpl(corrected_pdf); - } - catch (DocumentException e) - { - throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); - } - catch (IOException e) - { - throw new CorrectorException(ErrorCode.CORRECTOR_EXCEPTION, e); - } catch (PDFDocumentException e) { - throw new CorrectorException(e.getErrorCode(), e); - } - } - -} diff --git a/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java b/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java deleted file mode 100644 index 76a5f99..0000000 --- a/src/main/java/at/gv/egiz/pdfas/impl/input/helper/DataSourceHelper.java +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Copyright 2006 by Know-Center, Graz, Austria - * PDF-AS has been contracted by the E-Government Innovation Center EGIZ, a - * joint initiative of the Federal Chancellery Austria and Graz University of - * Technology. - * - * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by - * the European Commission - subsequent versions of the EUPL (the "Licence"); - * You may not use this work except in compliance with the Licence. - * You may obtain a copy of the Licence at: - * http://www.osor.eu/eupl/ - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the Licence is distributed on an "AS IS" basis, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Licence for the specific language governing permissions and - * limitations under the Licence. - * - * This product combines work with different licenses. See the "NOTICE" text - * file for details on the various modules and licenses. - * The "NOTICE" text file is part of the distribution. Any derivative works - * that you distribute must include a readable copy of the "NOTICE" text file. - */ -package at.gv.egiz.pdfas.impl.input.helper; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; - -import at.gv.egiz.pdfas.performance.PerformanceCounters; -import at.gv.egiz.pdfas.framework.input.DataSource; -import at.gv.egiz.pdfas.framework.input.PdfDataSource; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * @author wprinz - * - */ -public class DataSourceHelper -{ - /** - * The log. - */ - private static final Log log = LogFactory.getLog(DataSourceHelper.class); - - /** - * Converts a PdfDataSource to a byte array. - * - *

- * Note that this function is very memory intensive. Use the Streams whereever - * possible. - *

- * - * @deprecated - * - * @param pdfDataSource - * @return - * @throws IOException - */ - public static byte[] convertDataSourceToByteArray(DataSource pdfDataSource) - { - return pdfDataSource.getAsByteArray(); -// try -// { -// PerformanceCounters.byteArrays.increment(); -// -// byte[] data = new byte[pdfDataSource.getLength()]; -// -// int bytes_written = 0; -// -// InputStream is = pdfDataSource.createInputStream(); -// int n = 0; -// while ((n = is.read(data, bytes_written, data.length - bytes_written)) > 0) -// { -// bytes_written += n; -// } -// is.close(); -// -// assert bytes_written == data.length; -// -// return data; -// } -// catch (IOException e) -// { -// log.error(e); -// throw new RuntimeException(e); -// } - } - - public static byte [] convertInputStreamToByteArray(InputStream inputStream) - { - try - { - return convertInputStreamToByteArrayIOEx(inputStream); - } - catch (IOException e) - { - log.error(e); - throw new RuntimeException(e); - } - } - - public static byte [] convertInputStreamToByteArrayIOEx(InputStream inputStream) throws IOException - { - PerformanceCounters.byteArrays.increment(); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(4096); - - byte[] temp = new byte[4096]; - - int n = 0; - while ((n = inputStream.read(temp)) > 0) - { - baos.write(temp, 0, n); - } - inputStream.close(); - - baos.close(); - byte [] data = baos.toByteArray(); - - return data; - } - - public static void debugDataSourceToFile(DataSource dataSource, File file) - { - try - { - InputStream is = dataSource.createInputStream(); - FileOutputStream fos = new FileOutputStream(file); - byte[] data = new byte[2048]; - int n = -1; - while ((n = is.read(data)) > 0) - { - fos.write(data, 0, n); - } - is.close(); - fos.close(); - } - catch (IOException e) - { - log.error(e); - } - } -} -- cgit v1.2.3