From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../java/test/pdfbox/util/TestTextStripper.java | 371 +++++++++++++++++++++ .../pdfbox/util/TestTextStripperPerformance.java | 173 ++++++++++ src/main/java/test/pdfbox/util/package.html | 9 + 3 files changed, 553 insertions(+) create mode 100644 src/main/java/test/pdfbox/util/TestTextStripper.java create mode 100644 src/main/java/test/pdfbox/util/TestTextStripperPerformance.java create mode 100644 src/main/java/test/pdfbox/util/package.html (limited to 'src/main/java/test/pdfbox/util') diff --git a/src/main/java/test/pdfbox/util/TestTextStripper.java b/src/main/java/test/pdfbox/util/TestTextStripper.java new file mode 100644 index 0000000..c425f38 --- /dev/null +++ b/src/main/java/test/pdfbox/util/TestTextStripper.java @@ -0,0 +1,371 @@ +/** + * Copyright (c) 2003-2005, www.pdfbox.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the name of pdfbox; nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILIT, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ */
+package test.pdfbox.util;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FilenameFilter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+import org.apache.log4j.Logger;
+import org.apache.log4j.Level;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.util.PDFTextStripper;
+
+/**
+ * Test suite for PDFTextStripper.
+ *
+ * FILE SET VALIDATION
+ *
+ * This test suite is designed to test PDFTextStripper using a set of PDF
+ * files and known good output for each. The default mode of testExtract()
+ * is to process each *.pdf file in "test/input". An output file is
+ * created in "test/output" with the same name as the PDF file, plus an
+ * additional ".txt" suffix.
+ *
+ * The output file is then tested against a known good result file from
+ * the input directory (again, with the same name as the tested PDF file,
+ * but with the additional ".txt" suffix).
+ *
+ * So for the file "test/input/hello.pdf", an output file will be generated
+ * named "test/output/hello.pdf.txt". Then that file will be compared to
+ * the known good file "test/input/hello.pdf.txt". A missing known good
+ * file is reported as a failure.
+ *
+ * Any errors are logged, and at the end of processing all *.pdf files, if
+ * there were any errors, the test fails. The logging is at INFO, as the
+ * general goal is overall validation, and on failure, the indication of
+ * which file or files failed.
+ *
+ * When processing new PDF files, you may use testExtract() to generate output,
+ * verify the output manually, then move the output file to the test input
+ * directory to use as the basis for future validations.
+ *
+ * SINGLE FILE VALIDATION
+ *
+ * To further research individual failures, the test.pdfbox.util.TextStripper.file
+ * system property may be set with the name of a single file in the "test/input"
+ * directory. In this mode, testExtract() will evaluate only that file, and will
+ * additionally log the extracted text. You can set this property from ant by
+ * defining "file", as in:
+ *
+ *    ant testextract -Dfile=hello.pdf
+ *
+ * @author Robert Dickinson (bob@brutesquadlabs.com)
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.14 $
+ */
+public class TestTextStripper extends TestCase
+{
+    private static final Logger log = Logger.getLogger(TestTextStripper.class);
+
+    /** Set as soon as any file fails validation; checked once at the end of testExtract(). */
+    private boolean bFail = false;
+    private PDFTextStripper stripper = null;
+
+    /**
+     * Test class constructor. Creates the stripper shared by all files of
+     * this test and makes it emit "\n" as line separator.
+     *
+     * @param name The name of the test class.
+     *
+     * @throws IOException If there is an error creating the test.
+     */
+    public TestTextStripper( String name ) throws IOException
+    {
+        super( name );
+        stripper = new PDFTextStripper();
+        stripper.setLineSeparator("\n");
+    }
+
+    /**
+     * Test suite setup.
+     */
+    public void setUp()
+    {
+        // If you want to test a single file, with its extracted text logged,
+        // from an IDE, you can do something like this:
+        //
+        // System.setProperty("test.pdfbox.util.TextStripper.file", "FVS318Ref.pdf");
+    }
+
+    /**
+     * Determine whether two strings are equal, where two null strings are
+     * considered equal. Both strings are trimmed first, and a run of
+     * skippable characters (see skipWhitespace) is compared on its first
+     * character only.
+     *
+     * NOTE(review): in this copy of the patch the text between
+     * "while( expectedIndex" and "256 )" was lost during extraction. The
+     * comparison loop, the length check after it and the signature of
+     * skipWhitespace were restored from the surviving fragments and from
+     * this comment; confirm against the upstream PDFBox source.
+     *
+     * @param expected Expected string
+     * @param actual Actual String
+     * @return true if the strings are both null,
+     * or if their contents are the same, otherwise false.
+     */
+    private boolean stringsEqual(String expected, String actual)
+    {
+        boolean equals = true;
+        if( (expected == null) && (actual == null) )
+        {
+            return true;
+        }
+        else if( expected != null && actual != null )
+        {
+            expected = expected.trim();
+            actual = actual.trim();
+            char[] expectedArray = expected.toCharArray();
+            char[] actualArray = actual.toCharArray();
+            int expectedIndex = 0;
+            int actualIndex = 0;
+            while( expectedIndex < expectedArray.length && actualIndex < actualArray.length )
+            {
+                if( expectedArray[expectedIndex] != actualArray[actualIndex] )
+                {
+                    equals = false;
+                    log.warn("Lines differ at index"
+                        + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex]
+                        + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex]);
+                    break;
+                }
+                // skipWhitespace leaves the index on the last character of a
+                // skippable run, so the +1 steps onto the next character to compare
+                expectedIndex = skipWhitespace( expectedArray, expectedIndex ) + 1;
+                actualIndex = skipWhitespace( actualArray, actualIndex ) + 1;
+            }
+            if( equals )
+            {
+                // one side ran out of characters before the other
+                if( expectedIndex != expectedArray.length )
+                {
+                    equals = false;
+                    log.warn("Expected line is longer at:" + expectedIndex);
+                }
+                if( actualIndex != actualArray.length )
+                {
+                    equals = false;
+                    log.warn("Actual line is longer at:" + actualIndex);
+                }
+            }
+        }
+        else
+        {
+            // exactly one of the two strings is null
+            equals = false;
+        }
+        return equals;
+    }
+
+    /**
+     * If the character at index is skippable (a space or a character with
+     * a code above 256), advance to the last character of that run.
+     *
+     * NOTE(review): characters above 256 are presumably skipped to tolerate
+     * differences in how non-Latin-1 glyphs are extracted - confirm.
+     *
+     * @param array The characters of the line being compared
+     * @param index The current position, must be inside the array
+     * @return index unchanged if the character is not skippable, otherwise
+     * the index of the last character of the skippable run.
+     */
+    private int skipWhitespace( char[] array, int index )
+    {
+        if( array[index] == ' ' || array[index] > 256 )
+        {
+            while( index < array.length && (array[index] == ' ' || array[index] > 256))
+            {
+                index++;
+            }
+            // roll back one, the caller moves forward by one itself
+            index--;
+        }
+        return index;
+    }
+
+    /**
+     * Validate text extraction on a single file. The extracted text is
+     * written to "output/&lt;name&gt;.txt" next to the input directory and
+     * then compared with "input/&lt;name&gt;.txt". A missing known good file
+     * or any line mismatch marks the test as failed; the failure itself is
+     * raised by testExtract().
+     *
+     * @param file The file to validate
+     * @param bLogResult Whether to log the extracted text
+     * @throws Exception when there is an exception
+     */
+    public void doTestFile(File file, boolean bLogResult)
+        throws Exception
+    {
+        log.info("Preparing to parse " + file.getName());
+
+        File baseDir = file.getParentFile().getParentFile();
+        File expectedFile = new File(baseDir, "input/" + file.getName() + ".txt");
+        File actualFile = new File(baseDir, "output/" + file.getName() + ".txt");
+
+        PDDocument document = null;
+        try
+        {
+            document = PDDocument.load(file);
+
+            // the output file is complete and closed when this returns, so
+            // the comparison below never reads a partially flushed file
+            writeExtractedText(document, actualFile);
+
+            if (bLogResult)
+            {
+                log.info("Text for " + file.getName() + ":\r\n" + stripper.getText(document));
+            }
+        }
+        finally
+        {
+            if( document != null )
+            {
+                document.close();
+            }
+        }
+
+        if (!expectedFile.exists())
+        {
+            this.bFail = true;
+            log.error("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist");
+            return;
+        }
+
+        compareFiles(file.getName(), expectedFile, actualFile);
+    }
+
+    /**
+     * Write the text of the document to outFile as UTF-16LE, preceded by
+     * the matching byte order mark, and close the file.
+     *
+     * @param document The loaded document
+     * @param outFile The file to create
+     * @throws Exception when extraction or writing fails
+     */
+    private void writeExtractedText(PDDocument document, File outFile)
+        throws Exception
+    {
+        OutputStream os = null;
+        Writer writer = null;
+        try
+        {
+            os = new FileOutputStream(outFile);
+            // byte order mark, so that the "UTF-16" readers pick little endian
+            os.write( 0xFF );
+            os.write( 0xFE );
+            writer = new OutputStreamWriter(os,"UTF-16LE");
+
+            stripper.writeText(document, writer);
+        }
+        finally
+        {
+            if( writer != null )
+            {
+                // flushes the encoder and closes the underlying stream as well
+                writer.close();
+            }
+            else if( os != null )
+            {
+                os.close();
+            }
+        }
+    }
+
+    /**
+     * Compare two UTF-16 text files line by line, ignoring blank lines.
+     * Every mismatch is logged and marks the test as failed; comparison
+     * continues until either file is exhausted.
+     *
+     * @param pdfName Name of the PDF file under test, used in log messages
+     * @param expectedFile The known good text
+     * @param actualFile The text that was just extracted
+     * @throws IOException when a file cannot be read
+     */
+    private void compareFiles(String pdfName, File expectedFile, File actualFile)
+        throws IOException
+    {
+        LineNumberReader expectedReader = null;
+        LineNumberReader actualReader = null;
+        try
+        {
+            expectedReader =
+                new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile),"UTF-16"));
+            actualReader =
+                new LineNumberReader(new InputStreamReader(new FileInputStream(actualFile), "UTF-16"));
+
+            while (true)
+            {
+                String expectedLine = readNonBlankLine(expectedReader);
+                String actualLine = readNonBlankLine(actualReader);
+                if (!stringsEqual(expectedLine, actualLine))
+                {
+                    this.bFail = true;
+                    log.error("FAILURE: Line mismatch for file " + pdfName
+                        + " at expected line: " + expectedReader.getLineNumber()
+                        + " at actual line: " + actualReader.getLineNumber()
+                        + "\r\n expected line was: \"" + expectedLine + "\""
+                        + "\r\n actual line was: \"" + actualLine + "\"");
+                    //lets report all lines, even though this might produce some verbose logging
+                }
+
+                if( expectedLine == null || actualLine == null )
+                {
+                    break;
+                }
+            }
+        }
+        finally
+        {
+            // close both readers even if closing the first one fails
+            try
+            {
+                if( expectedReader != null )
+                {
+                    expectedReader.close();
+                }
+            }
+            finally
+            {
+                if( actualReader != null )
+                {
+                    actualReader.close();
+                }
+            }
+        }
+    }
+
+    /**
+     * Read the next line that contains something other than whitespace.
+     *
+     * @param reader The reader to take lines from
+     * @return the next non blank line, or null at end of file
+     * @throws IOException when reading fails
+     */
+    private String readNonBlankLine(LineNumberReader reader)
+        throws IOException
+    {
+        String line = reader.readLine();
+        while( line != null && line.trim().length() == 0 )
+        {
+            line = reader.readLine();
+        }
+        return line;
+    }
+
+    /**
+     * Test to validate text extraction of file set. Without the
+     * test.pdfbox.util.TextStripper.file system property every *.pdf file
+     * in "test/input" is validated; with it, only the named file is
+     * validated and its extracted text is logged.
+     *
+     * @throws Exception when there is an exception
+     */
+    public void testExtract()
+        throws Exception
+    {
+        String filename = System.getProperty("test.pdfbox.util.TextStripper.file");
+        File testDir = new File("test/input");
+
+        if ((filename == null) || (filename.length() == 0))
+        {
+            Logger.getRootLogger().setLevel( Level.INFO );
+
+            File[] testFiles = testDir.listFiles(new FilenameFilter()
+            {
+                public boolean accept(File dir, String name)
+                {
+                    return (name.endsWith(".pdf"));
+                }
+            });
+
+            // listFiles returns null, not an empty array, when the directory is missing
+            if( testFiles == null )
+            {
+                fail("Test input directory " + testDir.getAbsolutePath()
+                    + " does not exist or cannot be read");
+            }
+
+            for (int n = 0; n < testFiles.length; n++)
+            {
+                doTestFile(testFiles[n], false);
+            }
+        }
+        else
+        {
+            doTestFile(new File(testDir, filename), true);
+        }
+
+        if (this.bFail)
+        {
+            fail("One or more failures, see test log for details");
+        }
+    }
+
+    /**
+     * Set the tests in the suite for this test class.
+     *
+     * @return the Suite.
+     */
+    public static Test suite()
+    {
+        return new TestSuite( TestTextStripper.class );
+    }
+
+    /**
+     * Command line execution.
+     *
+     * @param args Command line arguments.
+     */
+    public static void main( String[] args )
+    {
+        String[] arg = {TestTextStripper.class.getName() };
+        junit.textui.TestRunner.main( arg );
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/test/pdfbox/util/TestTextStripperPerformance.java b/src/main/java/test/pdfbox/util/TestTextStripperPerformance.java
new file mode 100644
index 0000000..0aadb30
--- /dev/null
+++ b/src/main/java/test/pdfbox/util/TestTextStripperPerformance.java
@@ -0,0 +1,173 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILIT, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ */
+package test.pdfbox.util;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+import org.apache.log4j.Logger;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.util.PDFTextStripper;
+
+/**
+ * Test the performance of the PDF text stripper utility. Every *.pdf file
+ * in "test/input" is extracted to "test/output" and the time taken per
+ * file is logged; the extracted text itself is not validated.
+ *
+ * @author Ben Litchfield (ben@csh.rit.edu)
+ * @version $Revision: 1.2 $
+ */
+public class TestTextStripperPerformance extends TestCase
+{
+    private static final Logger log = Logger.getLogger(TestTextStripperPerformance.class);
+
+    /**
+     * Test class constructor.
+     *
+     * @param name The name of the test class.
+     */
+    public TestTextStripperPerformance( String name )
+    {
+        super( name );
+    }
+
+    /**
+     * Test suite setup.
+     */
+    public void setUp()
+    {
+    }
+
+
+    /**
+     * Extract the text of a single file to "output/&lt;name&gt;.txt" next to
+     * the input directory and log how long loading and extraction took.
+     * The output is written as UTF-16LE with a byte order mark, the same
+     * format TestTextStripper produces, instead of the platform default
+     * encoding.
+     *
+     * @param file The file to extract
+     * @param bLogResult Not used by this test; kept so the signature matches
+     * TestTextStripper.doTestFile
+     * @throws Exception when there is an exception
+     */
+    public void doTestFile(File file, boolean bLogResult)
+        throws Exception
+    {
+        PDFTextStripper stripper = new PDFTextStripper();
+        OutputStream os = null;
+        Writer writer = null;
+        PDDocument document = null;
+        long start = System.currentTimeMillis();
+        try
+        {
+            document = PDDocument.load(file);
+
+            File outFile = new File(file.getParentFile().getParentFile(), "output/" + file.getName() + ".txt");
+            os = new FileOutputStream(outFile);
+            // byte order mark for the UTF-16LE text that follows
+            os.write( 0xFF );
+            os.write( 0xFE );
+            writer = new OutputStreamWriter(os,"UTF-16LE");
+
+            stripper.writeText(document, writer);
+        }
+        finally
+        {
+            // the document is closed even if closing the output fails
+            try
+            {
+                if( writer != null )
+                {
+                    // also closes the underlying stream
+                    writer.close();
+                }
+                else if( os != null )
+                {
+                    os.close();
+                }
+            }
+            finally
+            {
+                if( document != null )
+                {
+                    document.close();
+                }
+            }
+        }
+        log.info("Extracted text from " + file.getName() + " in "
+            + (System.currentTimeMillis() - start) + " ms");
+    }
+
+    /**
+     * Run the extraction over the file set. When the
+     * test.pdfbox.util.TextStripper.file system property names a single
+     * file, nothing is extracted: that mode belongs to TestTextStripper
+     * and this test only reports that it is skipping.
+     *
+     * @throws Exception when there is an exception
+     */
+    public void testExtract()
+        throws Exception
+    {
+        String filename = System.getProperty("test.pdfbox.util.TextStripper.file");
+        File testDir = new File("test/input");
+
+        if ((filename == null) || (filename.length() == 0))
+        {
+            File[] testFiles = testDir.listFiles(new FilenameFilter()
+            {
+                public boolean accept(File dir, String name)
+                {
+                    return (name.endsWith(".pdf"));
+                }
+            });
+
+            // listFiles returns null, not an empty array, when the directory is missing
+            if( testFiles == null )
+            {
+                fail("Test input directory " + testDir.getAbsolutePath()
+                    + " does not exist or cannot be read");
+            }
+
+            for (int n = 0; n < testFiles.length; n++)
+            {
+                doTestFile(testFiles[n], false);
+            }
+        }
+        else
+        {
+            // NOTE(review): the single file run was already disabled in the
+            // original; presumably so that it does not overwrite the output
+            // TestTextStripper leaves for inspection - confirm before enabling.
+            log.info("Single file mode (" + filename + "): performance run skipped");
+        }
+    }
+
+    /**
+     * Set the tests in the suite for this test class.
+     *
+     * @return the Suite.
+     */
+    public static Test suite()
+    {
+        return new TestSuite( TestTextStripperPerformance.class );
+    }
+
+    /**
+     * Command line execution.
+     *
+     * @param args Command line arguments.
+     */
+    public static void main( String[] args )
+    {
+        String[] arg = {TestTextStripperPerformance.class.getName() };
+        junit.textui.TestRunner.main( arg );
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/test/pdfbox/util/package.html b/src/main/java/test/pdfbox/util/package.html
new file mode 100644
index 0000000..8d98577
--- /dev/null
+++ b/src/main/java/test/pdfbox/util/package.html
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+These classes will be used to test the text extraction capabilities that are available with PDFBox.
+
+
-- cgit v1.2.3