aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/util/PDFText2HTML.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/util/PDFText2HTML.java')
-rw-r--r--src/main/java/org/pdfbox/util/PDFText2HTML.java271
1 files changed, 271 insertions, 0 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFText2HTML.java b/src/main/java/org/pdfbox/util/PDFText2HTML.java
new file mode 100644
index 0000000..0409eaa
--- /dev/null
+++ b/src/main/java/org/pdfbox/util/PDFText2HTML.java
@@ -0,0 +1,271 @@
+/**
+ * Copyright (c) 2003-2004, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.util;
+
+import java.io.IOException;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.pdfbox.pdmodel.PDDocument;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
+ * Paragraphs broken by pages, columns, or figures are not mended.
+ *
+ *
+ * @author jjb - http://www.johnjbarton.com
+ * @version $Revision: 1.1 $
+ *
+ */
+public class PDFText2HTML extends PDFTextStripper
+{
+ private static Logger log = Logger.getLogger(PDFText2HTML.class);
+ private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
+
+ private TextPosition beginTitle;
+ private TextPosition afterEndTitle;
+ private String titleGuess;
+ private boolean suppressParagraphs;
+ private boolean onFirstPage = true;
+
+ /**
+ * Constructor.
+ *
+ * @throws IOException If there is an error during initialization.
+ */
+ public PDFText2HTML() throws IOException
+ {
+ titleGuess = "";
+ beginTitle = null;
+ afterEndTitle = null;
+ suppressParagraphs = false;
+ }
+
+ /**
+ * Write the header to the output document.
+ *
+ * @throws IOException If there is a problem writing out the header to the document.
+ */
+ protected void writeHeader() throws IOException
+ {
+ StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
+ buf.append("<html><head>");
+ buf.append("<title>");
+ buf.append(getTitleGuess());
+ buf.append("</title>");
+ buf.append("</head>");
+ buf.append("<body>\n");
+ getOutput().write(buf.toString());
+ }
+
+ /**
+ * The guess to the document title.
+ *
+ * @return A string that is the title of this document.
+ */
+ protected String getTitleGuess()
+ {
+ return titleGuess;
+ }
+
+ /**
+ * @see PDFTextStripper#flushText
+ */
+ protected void flushText() throws IOException
+ {
+ Iterator textIter = getCharactersByArticle().iterator();
+
+ if (onFirstPage)
+ {
+ guessTitle(textIter);
+ writeHeader();
+ onFirstPage = false;
+ }
+ super.flushText();
+ }
+
+ /**
+ * @see PDFTextStripper#endDocument( PDDocument )
+ */
+ public void endDocument(PDDocument pdf) throws IOException
+ {
+ output.write("</body></html>");
+ }
+
+ /**
+ * This method will attempt to guess the title of the document.
+ *
+ * @param textIter The characters on the first page.
+ * @return The text position that is guessed to be the title.
+ */
+ protected TextPosition guessTitle(Iterator textIter)
+ {
+ float lastFontSize = -1.0f;
+ int stringsInFont = 0;
+ StringBuffer titleText = new StringBuffer();
+ while (textIter.hasNext())
+ {
+ Iterator textByArticle = ((List)textIter.next()).iterator();
+ while( textByArticle.hasNext() )
+ {
+ TextPosition position = (TextPosition) textByArticle.next();
+ float currentFontSize = position.getFontSize();
+ if (currentFontSize != lastFontSize)
+ {
+ if (beginTitle != null)
+ { // font change in candidate title.
+ if (stringsInFont == 0)
+ {
+ beginTitle = null; // false alarm
+ titleText.setLength(0);
+ }
+ else
+ {
+ // had a significant font with some words: call it a title
+ titleGuess = titleText.toString();
+ log.debug("Title candidate =" + titleGuess);
+ afterEndTitle = position;
+ return beginTitle;
+ }
+ }
+ else
+ { // font change and begin == null
+ if (currentFontSize > 13.0f)
+ { // most body text is 12pt max I guess
+ beginTitle = position;
+ }
+ }
+
+ lastFontSize = currentFontSize;
+ stringsInFont = 0;
+ }
+ stringsInFont++;
+ if (beginTitle != null)
+ {
+ titleText.append(position.getCharacter()+" ");
+ }
+ }
+ }
+ return beginTitle; // null
+ }
+
+ /**
+ * Write out the paragraph separator.
+ *
+ * @throws IOException If there is an error writing to the stream.
+ */
+ protected void startParagraph() throws IOException
+ {
+ if (! suppressParagraphs)
+ {
+ getOutput().write("<p>");
+ }
+ }
+ /**
+ * Write out the paragraph separator.
+ *
+ * @throws IOException If there is an error writing to the stream.
+ */
+ protected void endParagraph() throws IOException
+ {
+ if (! suppressParagraphs)
+ {
+ getOutput().write("</p>");
+ }
+ }
+
+ /**
+ * @see PDFTextStripper#writeCharacters( TextPosition )
+ */
+ protected void writeCharacters(TextPosition position ) throws IOException
+ {
+ if (position == beginTitle)
+ {
+ output.write("<H1>");
+ suppressParagraphs = true;
+ }
+ if (position == afterEndTitle)
+ {
+ output.write("</H1>"); // end title and start first paragraph
+ suppressParagraphs = false;
+ }
+
+ String chars = position.getCharacter();
+
+ for (int i = 0; i < chars.length(); i++)
+ {
+ char c = chars.charAt(i);
+ if ((c < 32) || (c > 126))
+ {
+ int charAsInt = c;
+ output.write("&#" + charAsInt + ";");
+ }
+ else
+ {
+ switch (c)
+ {
+ case 34:
+ output.write("&quot;");
+ break;
+ case 38:
+ output.write("&amp;");
+ break;
+ case 60:
+ output.write("&lt;");
+ break;
+ case 62:
+ output.write("&gt;");
+ break;
+ default:
+ output.write(c);
+ }
+ }
+ }
+ }
+
+ /**
+ * @return Returns the suppressParagraphs.
+ */
+ public boolean isSuppressParagraphs()
+ {
+ return suppressParagraphs;
+ }
+ /**
+ * @param shouldSuppressParagraphs The suppressParagraphs to set.
+ */
+ public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
+ {
+ this.suppressParagraphs = shouldSuppressParagraphs;
+ }
+} \ No newline at end of file