1 files changed, 271 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFText2HTML.java b/src/main/java/org/pdfbox/util/PDFText2HTML.java
new file mode 100644
index 0000000..0409eaa
--- /dev/null
+++ b/src/main/java/org/pdfbox/util/PDFText2HTML.java
@@ -0,0 +1,271 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ *    contributors may be used to endorse or promote products derived from this
+ *    software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.util;
+
+import java.io.IOException;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
+ * Paragraphs broken by pages, columns, or figures are not mended.
+ * The first run of text with a font size greater than 13 that is found in the
+ * text of the first flush is written as an H1 heading and used as the HTML title.
+ *
+ * @author jjb - http://www.johnjbarton.com
+ * @version  $Revision: 1.1 $
+ */
+public class PDFText2HTML extends PDFTextStripper
+{
+    private static final Logger log = Logger.getLogger(PDFText2HTML.class);
+    private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
+    /** Text with a font size greater than this is a title candidate. */
+    private static final float MIN_TITLE_FONT_SIZE = 13.0f;
+
+    /** First position of the guessed title, or null when no title was found. */
+    private TextPosition beginTitle;
+    /** First position after the guessed title, or null when none was seen. */
+    private TextPosition afterEndTitle;
+    private String titleGuess;
+    private boolean suppressParagraphs;
+    private boolean onFirstPage = true;
+    /** True while an H1 opened by writeCharacters has not been closed yet. */
+    private boolean titleOpen = false;
+
+    /**
+     * Constructor.
+     *
+     * @throws IOException If there is an error during initialization.
+     */
+    public PDFText2HTML() throws IOException
+    {
+        titleGuess = "";
+        beginTitle = null;
+        afterEndTitle = null;
+        suppressParagraphs = false;
+    }
+
+    /**
+     * Write the header to the output document. The guessed title is escaped
+     * because it is text taken from the document, not markup.
+     *
+     * @throws IOException If there is a problem writing out the header to the document.
+     */
+    protected void writeHeader() throws IOException
+    {
+        StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
+        buf.append("<html><head>");
+        buf.append("<title>");
+        buf.append(escapeHtml(getTitleGuess()));
+        buf.append("</title>");
+        buf.append("</head>");
+        buf.append("<body>\n");
+        getOutput().write(buf.toString());
+    }
+
+    /**
+     * The guess to the document title.
+     *
+     * @return A string that is the title of this document; empty until a
+     *         title has been guessed.
+     */
+    protected String getTitleGuess()
+    {
+        return titleGuess;
+    }
+
+    /**
+     * On the first call for a document the title is guessed and the header is
+     * written before the text is flushed. A heading that is still open once
+     * that first flush has finished is closed here.
+     *
+     * @throws IOException If there is an error writing to the stream.
+     * @see PDFTextStripper#flushText
+     */
+    protected void flushText() throws IOException
+    {
+        Iterator textIter = getCharactersByArticle().iterator();
+
+        boolean firstFlush = onFirstPage;
+        if (firstFlush)
+        {
+            guessTitle(textIter);
+            writeHeader();
+            onFirstPage = false;
+        }
+        super.flushText();
+        if (firstFlush)
+        {
+            // a title that ran to the end of the text never meets afterEndTitle
+            closeTitleIfOpen();
+        }
+    }
+
+    /**
+     * Write the end of the HTML document. When no text was flushed the header
+     * is written first, so that the output is still a complete document.
+     *
+     * @param pdf The document that was processed.
+     * @throws IOException If there is an error writing to the stream.
+     * @see PDFTextStripper#endDocument( PDDocument )
+     */
+    public void endDocument(PDDocument pdf) throws IOException
+    {
+        if (onFirstPage)
+        {
+            writeHeader();
+        }
+        closeTitleIfOpen();
+        getOutput().write("</body></html>");
+        // a following document handled by this instance gets its own header
+        onFirstPage = true;
+    }
+
+    /**
+     * This method will attempt to guess the title of the document: the first
+     * run of text with a font size greater than 13, ended by the next change
+     * of font size or by the end of the text. The positions that delimit the
+     * title are remembered for writeCharacters and the text of the title is
+     * made available from getTitleGuess().
+     *
+     * @param textIter The characters on the first page, as an iterator over
+     *                 Lists of TextPosition.
+     * @return The text position that is guessed to begin the title, or null
+     *         when no title candidate was found.
+     */
+    protected TextPosition guessTitle(Iterator textIter)
+    {
+        // forget any earlier guess so that every call starts clean
+        beginTitle = null;
+        afterEndTitle = null;
+        titleGuess = "";
+
+        float lastFontSize = -1.0f;
+        int stringsInFont = 0;
+        StringBuffer titleText = new StringBuffer();
+        while (textIter.hasNext())
+        {
+            Iterator textByArticle = ((List)textIter.next()).iterator();
+            while( textByArticle.hasNext() )
+            {
+                TextPosition position = (TextPosition) textByArticle.next();
+                float currentFontSize = position.getFontSize();
+                if (currentFontSize != lastFontSize)
+                {
+                    if (beginTitle != null)
+                    { // font change in candidate title.
+                        // NOTE(review): stringsInFont is at least 1 whenever a
+                        // candidate exists, so this "false alarm" branch looks
+                        // unreachable - confirm what it was meant to detect.
+                        if (stringsInFont == 0)
+                        {
+                            beginTitle = null; // false alarm
+                            titleText.setLength(0);
+                        }
+                        else
+                        {
+                            // had a significant font with some words: call it a title
+                            titleGuess = titleText.toString().trim();
+                            log.debug("Title candidate =" + titleGuess);
+                            afterEndTitle = position;
+                            return beginTitle;
+                        }
+                    }
+                    else if (currentFontSize > MIN_TITLE_FONT_SIZE)
+                    { // most body text is 12pt max I guess
+                        beginTitle = position;
+                    }
+
+                    lastFontSize = currentFontSize;
+                    stringsInFont = 0;
+                }
+                stringsInFont++;
+                if (beginTitle != null)
+                {
+                    titleText.append(position.getCharacter()+" ");
+                }
+            }
+        }
+        if (beginTitle != null)
+        {
+            // the candidate ran to the end of the text without a font change;
+            // it is still the title, there is just no position after it
+            titleGuess = titleText.toString().trim();
+            log.debug("Title candidate =" + titleGuess);
+        }
+        return beginTitle;
+    }
+
+    /**
+     * Write out the start of a paragraph, unless paragraphs are suppressed.
+     *
+     * @throws IOException If there is an error writing to the stream.
+     */
+    protected void startParagraph() throws IOException
+    {
+        if (! suppressParagraphs)
+        {
+            getOutput().write("<p>");
+        }
+    }
+
+    /**
+     * Write out the end of a paragraph, unless paragraphs are suppressed.
+     *
+     * @throws IOException If there is an error writing to the stream.
+     */
+    protected void endParagraph() throws IOException
+    {
+        if (! suppressParagraphs)
+        {
+            getOutput().write("</p>");
+        }
+    }
+
+    /**
+     * Write the text of one position as escaped HTML. The position that begins
+     * the guessed title opens an H1 and suppresses paragraphs; the position
+     * that follows the title closes the H1 again.
+     *
+     * @param position The text to write.
+     * @throws IOException If there is an error writing to the stream.
+     * @see PDFTextStripper#writeCharacters( TextPosition )
+     */
+    protected void writeCharacters(TextPosition position ) throws IOException
+    {
+        if (position == beginTitle)
+        {
+            getOutput().write("<H1>");
+            suppressParagraphs = true;
+            titleOpen = true;
+        }
+        if (position == afterEndTitle)
+        {
+            closeTitleIfOpen();  // end title and start first paragraph
+        }
+
+        getOutput().write(escapeHtml(position.getCharacter()));
+    }
+
+    /**
+     * Close the H1 of the title if it is open and allow paragraphs again.
+     *
+     * @throws IOException If there is an error writing to the stream.
+     */
+    private void closeTitleIfOpen() throws IOException
+    {
+        if (titleOpen)
+        {
+            getOutput().write("</H1>");
+            suppressParagraphs = false;
+            titleOpen = false;
+        }
+    }
+
+    /**
+     * Escape text for inclusion in HTML. Characters below 32 or above 126 are
+     * written as decimal numeric character references, where a surrogate pair
+     * becomes a single reference to the code point it encodes. The double
+     * quote, ampersand, less-than and greater-than characters are written as
+     * named entities. Everything else is copied unchanged.
+     *
+     * @param text The raw text, may be null.
+     * @return The escaped text; an empty string when text is null.
+     */
+    private static String escapeHtml(String text)
+    {
+        if (text == null)
+        {
+            return "";
+        }
+        int length = text.length();
+        StringBuffer buf = new StringBuffer(length + 16);
+        for (int i = 0; i < length; i++)
+        {
+            char c = text.charAt(i);
+            if ((c < 32) || (c > 126))
+            {
+                int codePoint = c;
+                boolean highSurrogate = (c >= 0xD800) && (c <= 0xDBFF);
+                if (highSurrogate && (i + 1 < length))
+                {
+                    char low = text.charAt(i + 1);
+                    if ((low >= 0xDC00) && (low <= 0xDFFF))
+                    {
+                        // combine the pair; a reference to a lone surrogate
+                        // code point would not be valid HTML
+                        codePoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
+                        i++; // the low surrogate has been consumed
+                    }
+                }
+                buf.append("&#").append(codePoint).append(';');
+            }
+            else
+            {
+                switch (c)
+                {
+                    case '"':
+                        buf.append("&quot;");
+                        break;
+                    case '&':
+                        buf.append("&amp;");
+                        break;
+                    case '<':
+                        buf.append("&lt;");
+                        break;
+                    case '>':
+                        buf.append("&gt;");
+                        break;
+                    default:
+                        buf.append(c);
+                }
+            }
+        }
+        return buf.toString();
+    }
+
+    /**
+     * @return Returns the suppressParagraphs.
+     */
+    public boolean isSuppressParagraphs()
+    {
+        return suppressParagraphs;
+    }
+
+    /**
+     * Note that writing the guessed title also sets and clears this flag.
+     *
+     * @param shouldSuppressParagraphs The suppressParagraphs to set.
+     */
+    public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
+    {
+        this.suppressParagraphs = shouldSuppressParagraphs;
+    }
+}
+\ No newline at end of file