1 files changed, 371 insertions, 0 deletions
diff --git a/src/main/java/test/pdfbox/util/TestTextStripper.java b/src/main/java/test/pdfbox/util/TestTextStripper.java
new file mode 100644
index 0000000..c425f38
--- /dev/null
+++ b/src/main/java/test/pdfbox/util/TestTextStripper.java
@@ -0,0 +1,371 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ */
+package test.pdfbox.util;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FilenameFilter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+import org.apache.log4j.Logger;
+import org.apache.log4j.Level;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.pdfbox.util.PDFTextStripper;
+
+/**
+ * Test suite for PDFTextStripper.
+ *
+ * FILE SET VALIDATION
+ *
+ * This test suite is designed to test PDFTextStripper using a set of PDF
+ * files and known good output for each.  The default mode of testExtract()
+ * is to process each *.pdf file in "test/input".  An output file is
+ * created in "test/output" with the same name as the PDF file, plus an
+ * additional ".txt" suffix.
+ *
+ * The output file is then tested against a known good result file from
+ * the input directory (again, with the same name as the tested PDF file,
+ * but with the additional ".txt" suffix).
+ *
+ * So for the file "test/input/hello.pdf", an output file will be generated
+ * named "test/output/hello.pdf.txt".  Then that file will be compared to
+ * the known good file "test/input/hello.pdf.txt" (which must exist).
+ *
+ * Any errors are logged, and at the end of processing all *.pdf files, if
+ * there were any errors, the test fails.  The logging is at INFO, as the
+ * general goal is overall validation, and on failure, the indication of
+ * which file or files failed.
+ *
+ * When processing new PDF files, you may use testExtract() to generate output,
+ * verify the output manually, then move the output file to the test input
+ * directory to use as the basis for future validations.
+ *
+ * SINGLE FILE VALIDATION
+ *
+ * To further research individual failures, the test.pdfbox.util.TextStripper.file
+ * system property may be set with the name of a single file in the "test/input"
+ * directory.  In this mode, testExtract() will evaluate only that file, and will
+ * also log the extracted text.  You can set this property from ant by
+ * defining "file", as in:
+ *
+ *    ant testextract -Dfile=hello.pdf
+ *
+ * @author Robert Dickinson (bob@brutesquadlabs.com)
+ * @author Ben Litchfield (ben@benlitchfield.com)
+ * @version $Revision: 1.14 $
+ */
+public class TestTextStripper extends TestCase
+{
+    private static final Logger log = Logger.getLogger(TestTextStripper.class);
+
+    /** Set by doTestFile() when a file fails; checked at the end of testExtract(). */
+    private boolean bFail = false;
+    private PDFTextStripper stripper = null;
+
+    /**
+     * Test class constructor.
+     *
+     * @param name The name of the test class.
+     *
+     * @throws IOException If there is an error creating the test.
+     */
+    public TestTextStripper( String name ) throws IOException
+    {
+        super( name );
+        stripper = new PDFTextStripper();
+        stripper.setLineSeparator("\n");
+    }
+
+    /**
+     * Test suite setup.
+     */
+    public void setUp()
+    {
+        // If you want to test a single file (with its extracted text logged)
+        // from an IDE, you can do something like this:
+        //
+        // System.setProperty("test.pdfbox.util.TextStripper.file", "FVS318Ref.pdf");
+    }
+
+    /**
+     * Determine whether two lines are equal for the purposes of this test.
+     *
+     * Both strings are trimmed first.  They are then compared character by
+     * character; after a pair of matching characters, a run of "whitespace"
+     * (as defined by skipWhitespace) on either side is stepped over as a
+     * whole, so runs of different length do not cause a mismatch.  Two null
+     * strings are considered equal, and a null string is considered equal to
+     * a blank one.  The first difference found is logged as an error.
+     *
+     * @param expected Expected string
+     * @param actual Actual String
+     * @return <code>true</code> if the strings are both null,
+     * or if their contents match as described above, otherwise <code>false</code>.
+     */
+    private boolean stringsEqual(String expected, String actual)
+    {
+        boolean equals = true;
+        if( (expected == null) && (actual == null) )
+        {
+            return true;
+        }
+        else if( expected != null && actual != null )
+        {
+            expected = expected.trim();
+            actual = actual.trim();
+            char[] expectedArray = expected.toCharArray();
+            char[] actualArray = actual.toCharArray();
+            int expectedIndex = 0;
+            int actualIndex = 0;
+            while( expectedIndex<expectedArray.length && actualIndex<actualArray.length )
+            {
+                if( expectedArray[expectedIndex] != actualArray[actualIndex] )
+                {
+                    equals = false;
+                    log.error("Lines differ at index"
+                     + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex]
+                     + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex] );
+                    break;
+                }
+                // Each side moves to the end of its own whitespace run (if any),
+                // then both step forward to the next character to compare.
+                expectedIndex = skipWhitespace( expectedArray, expectedIndex );
+                actualIndex = skipWhitespace( actualArray, actualIndex );
+                expectedIndex++;
+                actualIndex++;
+            }
+            if( equals )
+            {
+                // One side ran out first; whatever is left over is a difference.
+                if( expectedIndex != expectedArray.length )
+                {
+                    equals = false;
+                    log.error("Expected line is longer at:" + expectedIndex );
+                }
+                if( actualIndex != actualArray.length )
+                {
+                    equals = false;
+                    log.error("Actual line is longer at:" + actualIndex );
+                }
+            }
+        }
+        else if( ( expected == null && actual != null && actual.trim().equals( "" ) ) ||
+            ( actual == null && expected != null && expected.trim().equals( "" ) ) )
+        {
+            //basically there are some cases where pdfbox will put an extra line
+            //at the end of the file, who cares, this is not enough to report
+            // a failure
+            equals = true;
+        }
+        else
+        {
+            equals = false;
+        }
+        return equals;
+    }
+
+    /**
+     * If the current index is whitespace then skip any subsequent whitespace.
+     * A space, or any character with a code above 256, counts as whitespace here.
+     *
+     * @param array The characters being scanned.
+     * @param index The current position; must be a valid index into array.
+     * @return The index of the last character of the whitespace run starting
+     * at index, or index itself if the character there is not whitespace.
+     */
+    private int skipWhitespace( char[] array, int index )
+    {
+        //if we are at a space character then skip all space
+        //characters, but when all done rollback 1 because stringsEqual
+        //will roll forward 1
+        if( array[index] == ' ' || array[index] > 256 )
+        {
+            while( index < array.length && (array[index] == ' ' || array[index] > 256))
+            {
+                index++;
+            }
+            index--;
+        }
+        return index;
+    }
+
+    /**
+     * Read the next line that contains something other than whitespace.
+     *
+     * @param reader The reader to take lines from.
+     * @return The next non-blank line, or null at the end of the input.
+     * @throws IOException If reading fails.
+     */
+    private String readNonBlankLine( LineNumberReader reader ) throws IOException
+    {
+        String line = reader.readLine();
+        while( line != null && line.trim().length() == 0 )
+        {
+            line = reader.readLine();
+        }
+        return line;
+    }
+
+    /**
+     * Validate text extraction on a single file.
+     *
+     * The extracted text is written to the "output" directory that sits next
+     * to the file's own directory, and is then compared, line by line and
+     * ignoring blank lines, with the known good file in the "input" directory.
+     * Mismatches are logged and recorded in bFail rather than thrown, so that
+     * every differing line and every file gets reported.
+     *
+     * @param file The file to validate
+     * @param bLogResult Whether to log the extracted text
+     * @throws Exception when there is an exception
+     */
+    public void doTestFile(File file, boolean bLogResult)
+        throws Exception
+    {
+        log.info("Preparing to parse " + file.getName());
+
+        OutputStream os = null;
+        Writer writer = null;
+        PDDocument document = null;
+        LineNumberReader expectedReader = null;
+        LineNumberReader actualReader = null;
+        try
+        {
+            document = PDDocument.load(file);
+
+            File baseDir = file.getParentFile().getParentFile();
+            File expectedFile = new File(baseDir, "input/" + file.getName() + ".txt");
+            File actualFile = new File(baseDir, "output/" + file.getName() + ".txt");
+
+            // Create the output directory on first use; if that fails, the
+            // FileOutputStream constructor below reports the problem.
+            actualFile.getParentFile().mkdirs();
+
+            os = new FileOutputStream(actualFile);
+            // Little-endian byte order mark, matching the UTF-16LE writer, so
+            // that the file can be read back below as plain "UTF-16".
+            os.write( 0xFF );
+            os.write( 0xFE );
+            writer = new OutputStreamWriter(os,"UTF-16LE");
+
+            stripper.writeText(document, writer);
+
+            // The writer buffers its output, so it must be closed (which also
+            // closes the underlying stream) before the file is read back;
+            // otherwise the comparison may see a truncated file.
+            writer.close();
+            writer = null;
+            os = null;
+
+            if (bLogResult)
+            {
+                log.info("Text for " + file.getName() + ":\r\n" + stripper.getText(document));
+            }
+
+            if (!expectedFile.exists())
+            {
+                this.bFail = true;
+                log.error("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + " did not exist");
+                return;
+            }
+
+            expectedReader =
+                new LineNumberReader(new InputStreamReader(new FileInputStream(expectedFile),"UTF-16"));
+            actualReader =
+                new LineNumberReader(new InputStreamReader(new FileInputStream(actualFile), "UTF-16"));
+
+            while (true)
+            {
+                String expectedLine = readNonBlankLine( expectedReader );
+                String actualLine = readNonBlankLine( actualReader );
+                if (!stringsEqual(expectedLine, actualLine))
+                {
+                    this.bFail = true;
+                    log.error("FAILURE: Line mismatch for file " + file.getName() +
+                              " at expected line: " + expectedReader.getLineNumber() +
+                              " at actual line: " + actualReader.getLineNumber() +
+                              "\r\n  expected line was: \"" + expectedLine + "\"" +
+                              "\r\n  actual line was:   \"" + actualLine + "\"");
+                    //lets report all lines, even though this might produce some verbose logging
+                    //break;
+                }
+
+                // Stop as soon as either file is exhausted.
+                if( expectedLine == null || actualLine==null)
+                {
+                    break;
+                }
+            }
+        }
+        finally
+        {
+            // Nested so that the document is still released if closing one of
+            // the streams throws.
+            try
+            {
+                if( expectedReader != null )
+                {
+                    expectedReader.close();
+                }
+                if( actualReader != null )
+                {
+                    actualReader.close();
+                }
+                if( writer != null )
+                {
+                    writer.close();
+                }
+                if( os != null )
+                {
+                    os.close();
+                }
+            }
+            finally
+            {
+                if( document != null )
+                {
+                    document.close();
+                }
+            }
+        }
+    }
+
+    /**
+     * Test to validate text extraction of file set.
+     *
+     * If the test.pdfbox.util.TextStripper.file system property is set, only
+     * that file from "test/input" is validated and its extracted text is
+     * logged; otherwise every *.pdf file in "test/input" is validated.
+     *
+     * @throws Exception when there is an exception
+     */
+    public void testExtract()
+        throws Exception
+    {
+        String filename = System.getProperty("test.pdfbox.util.TextStripper.file");
+        File testDir = new File("test/input");
+
+        if ((filename == null) || (filename.length() == 0))
+        {
+            Logger.getRootLogger().setLevel( Level.INFO );
+
+            File[] testFiles = testDir.listFiles(new FilenameFilter()
+            {
+                public boolean accept(File dir, String name)
+                {
+                    return (name.endsWith(".pdf"));
+                }
+            });
+
+            // listFiles() returns null when the path is not a directory or
+            // cannot be read; report that clearly instead of failing on a
+            // NullPointerException below.
+            if (testFiles == null)
+            {
+                fail("Could not list test input directory: " + testDir.getAbsolutePath());
+            }
+
+            for (int n = 0; n < testFiles.length; n++)
+            {
+                doTestFile(testFiles[n], false);
+            }
+        }
+        else
+        {
+            doTestFile(new File(testDir, filename), true);
+        }
+
+        if (this.bFail)
+        {
+            fail("One or more failures, see test log for details");
+        }
+    }
+
+    /**
+     * Set the tests in the suite for this test class.
+     *
+     * @return the Suite.
+     */
+    public static Test suite()
+    {
+        return new TestSuite( TestTextStripper.class );
+    }
+
+    /**
+     * Command line execution.  Runs this test class with the JUnit text runner.
+     *
+     * @param args Command line arguments (not used).
+     */
+    public static void main( String[] args )
+    {
+        String[] arg = {TestTextStripper.class.getName() };
+        junit.textui.TestRunner.main( arg );
+    }
+}
+\ No newline at end of file