aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/pdfbox/util/PDFText2HTML.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/pdfbox/util/PDFText2HTML.java')
-rw-r--r-- src/main/java/org/pdfbox/util/PDFText2HTML.java 271
1 files changed, 0 insertions, 271 deletions
diff --git a/src/main/java/org/pdfbox/util/PDFText2HTML.java b/src/main/java/org/pdfbox/util/PDFText2HTML.java
deleted file mode 100644
index 0409eaa..0000000
--- a/src/main/java/org/pdfbox/util/PDFText2HTML.java
+++ /dev/null
@@ -1,271 +0,0 @@
-/**
- * Copyright (c) 2003-2004, www.pdfbox.org
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of pdfbox; nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * http://www.pdfbox.org
- *
- */
-package org.pdfbox.util;
-
-import java.io.IOException;
-
-import java.util.Iterator;
-import java.util.List;
-
-import org.pdfbox.pdmodel.PDDocument;
-
-import org.apache.log4j.Logger;
-
-/**
- * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
- * Paragraphs broken by pages, columns, or figures are not mended.
- *
- *
- * @author jjb - http://www.johnjbarton.com
- * @version $Revision: 1.1 $
- *
- */
-public class PDFText2HTML extends PDFTextStripper
-{
- /** Class-wide logger; guessTitle uses it to report the title candidate. */
- private static final Logger log = Logger.getLogger(PDFText2HTML.class);
- /** Initial capacity of the buffer in which writeHeader assembles the HTML header. */
- private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
-
- /** Text position where the guessed title starts; writeCharacters opens the H1 tag at this exact object. */
- private TextPosition beginTitle;
- /** First text position after the guessed title; writeCharacters closes the H1 tag at this exact object. */
- private TextPosition afterEndTitle;
- /** Text of the guessed title, written into the title element by writeHeader; empty until a title is found. */
- private String titleGuess;
- /** When true, startParagraph and endParagraph write nothing. */
- private boolean suppressParagraphs;
- /** True until flushText has guessed the title and written the header once. */
- private boolean onFirstPage = true;
-
- /**
-  * Creates a converter with no title detected yet, an empty title guess,
-  * and paragraph tags enabled.
-  *
-  * @throws IOException If there is an error during initialization.
-  */
- public PDFText2HTML() throws IOException
- {
-     // Title detection state: nothing found so far.
-     this.beginTitle = null;
-     this.afterEndTitle = null;
-     this.titleGuess = "";
-     // Paragraph tags are written unless explicitly suppressed.
-     this.suppressParagraphs = false;
- }
-
- /**
-  * Write the HTML header to the output document: the html, head and title
-  * elements followed by the opening body tag.
-  * The title text comes from getTitleGuess() and is HTML-escaped before it
-  * is written, because the guess is assembled from raw page text and would
-  * otherwise be able to inject markup or break the header.
-  *
-  * @throws IOException If there is a problem writing out the header to the document.
-  */
- protected void writeHeader() throws IOException
- {
-     StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
-     buf.append("<html><head>");
-     buf.append("<title>");
-     appendHtmlEscapedTitle(buf, getTitleGuess());
-     buf.append("</title>");
-     buf.append("</head>");
-     buf.append("<body>\n");
-     getOutput().write(buf.toString());
- }
-
- /**
-  * Append text to the buffer, replacing the characters that are significant
-  * in HTML markup (ampersand, angle brackets, double quote) with entities.
-  * A null text appends nothing.
-  *
-  * @param buf The buffer to append to.
-  * @param text The raw text to escape, may be null.
-  */
- private static void appendHtmlEscapedTitle(StringBuffer buf, String text)
- {
-     if (text == null)
-     {
-         return;
-     }
-     for (int i = 0; i < text.length(); i++)
-     {
-         char c = text.charAt(i);
-         switch (c)
-         {
-             case '"':
-                 buf.append("&quot;");
-                 break;
-             case '&':
-                 buf.append("&amp;");
-                 break;
-             case '<':
-                 buf.append("&lt;");
-                 break;
-             case '>':
-                 buf.append("&gt;");
-                 break;
-             default:
-                 buf.append(c);
-         }
-     }
- }
-
- /**
-  * Returns the current title guess, which writeHeader places in the
-  * title element of the generated HTML.
-  *
-  * @return A string that is the title of this document.
-  */
- protected String getTitleGuess()
- {
-     return this.titleGuess;
- }
-
- /**
-  * Flushes the text via the superclass. On the first call only, the title
-  * is guessed from the current characters and the HTML header is written
-  * before the text is flushed.
-  *
-  * @see PDFTextStripper#flushText
-  */
- protected void flushText() throws IOException
- {
-     Iterator articles = getCharactersByArticle().iterator();
-
-     if (!onFirstPage)
-     {
-         // Header already written: plain flush.
-         super.flushText();
-         return;
-     }
-
-     // First flush: the title must be known before the header is emitted.
-     guessTitle(articles);
-     writeHeader();
-     onFirstPage = false;
-     super.flushText();
- }
-
- /**
-  * Close the HTML document by writing the closing body and html tags.
-  * If no text was flushed (so flushText never wrote the header), the header
-  * is written first so that the output is still a complete HTML document.
-  *
-  * @param pdf The document that has been processed (not used here).
-  * @throws IOException If there is an error writing to the stream.
-  * @see PDFTextStripper#endDocument( PDDocument )
-  */
- public void endDocument(PDDocument pdf) throws IOException
- {
-     if (onFirstPage)
-     {
-         // Nothing was flushed: emit the header so the closing tags have a match.
-         writeHeader();
-         onFirstPage = false;
-     }
-     // NOTE(review): onFirstPage is never set back to true, so a reused
-     // instance would not write a header for a second document - confirm
-     // whether instances are meant to be reused.
-     output.write("</body></html>");
- }
-
- /**
- * This method will attempt to guess the title of the document.
- *
- * @param textIter The characters on the first page.
- * @return The text position that is guessed to be the title.
- */
- protected TextPosition guessTitle(Iterator textIter)
- {
- float lastFontSize = -1.0f;
- int stringsInFont = 0;
- StringBuffer titleText = new StringBuffer();
- while (textIter.hasNext())
- {
- Iterator textByArticle = ((List)textIter.next()).iterator();
- while( textByArticle.hasNext() )
- {
- TextPosition position = (TextPosition) textByArticle.next();
- float currentFontSize = position.getFontSize();
- if (currentFontSize != lastFontSize)
- {
- if (beginTitle != null)
- { // font change in candidate title.
- if (stringsInFont == 0)
- {
- beginTitle = null; // false alarm
- titleText.setLength(0);
- }
- else
- {
- // had a significant font with some words: call it a title
- titleGuess = titleText.toString();
- log.debug("Title candidate =" + titleGuess);
- afterEndTitle = position;
- return beginTitle;
- }
- }
- else
- { // font change and begin == null
- if (currentFontSize > 13.0f)
- { // most body text is 12pt max I guess
- beginTitle = position;
- }
- }
-
- lastFontSize = currentFontSize;
- stringsInFont = 0;
- }
- stringsInFont++;
- if (beginTitle != null)
- {
- titleText.append(position.getCharacter()+" ");
- }
- }
- }
- return beginTitle; // null
- }
-
- /**
-  * Write out the opening paragraph tag, unless paragraphs are suppressed.
-  *
-  * @throws IOException If there is an error writing to the stream.
-  */
- protected void startParagraph() throws IOException
- {
-     if (suppressParagraphs)
-     {
-         return;
-     }
-     getOutput().write("<p>");
- }
- /**
-  * Write out the closing paragraph tag, unless paragraphs are suppressed.
-  *
-  * @throws IOException If there is an error writing to the stream.
-  */
- protected void endParagraph() throws IOException
- {
-     if (suppressParagraphs)
-     {
-         return;
-     }
-     getOutput().write("</p>");
- }
-
- /**
-  * Write the text of one position as HTML.
-  * The H1 start tag is written before the position recorded as the start of
-  * the title, and the H1 end tag before the position recorded as following
-  * the title; paragraph tags are suppressed in between. Characters outside
-  * printable ASCII are written as numeric character references, and the
-  * markup characters (double quote, ampersand, angle brackets) as entities.
-  * A surrogate pair is written as one reference to the code point it
-  * encodes rather than as two references to the individual surrogates.
-  *
-  * @param position The text position to write.
-  * @throws IOException If there is an error writing to the stream.
-  * @see PDFTextStripper#writeCharacters( TextPosition )
-  */
- protected void writeCharacters(TextPosition position ) throws IOException
- {
-     // Identity comparison is intended: guessTitle stored these exact objects.
-     if (position == beginTitle)
-     {
-         output.write("<H1>");
-         suppressParagraphs = true;
-     }
-     if (position == afterEndTitle)
-     {
-         output.write("</H1>"); // end title and start first paragraph
-         suppressParagraphs = false;
-     }
-
-     String chars = position.getCharacter();
-     int length = chars.length();
-
-     for (int i = 0; i < length; i++)
-     {
-         char c = chars.charAt(i);
-         if ((c < 32) || (c > 126))
-         {
-             int codePoint = c;
-             // Combine a high surrogate with the low surrogate that follows it
-             // (done by hand to avoid depending on newer Character methods).
-             if ((c >= 0xD800) && (c <= 0xDBFF) && (i + 1 < length))
-             {
-                 char low = chars.charAt(i + 1);
-                 if ((low >= 0xDC00) && (low <= 0xDFFF))
-                 {
-                     codePoint = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
-                     i++; // the low surrogate has been consumed
-                 }
-             }
-             output.write("&#" + codePoint + ";");
-         }
-         else
-         {
-             switch (c)
-             {
-                 case '"':
-                     output.write("&quot;");
-                     break;
-                 case '&':
-                     output.write("&amp;");
-                     break;
-                 case '<':
-                     output.write("&lt;");
-                     break;
-                 case '>':
-                     output.write("&gt;");
-                     break;
-                 default:
-                     output.write(c);
-             }
-         }
-     }
- }
-
- /**
-  * Tells whether paragraph tags are currently being left out of the output.
-  *
-  * @return Returns the suppressParagraphs.
-  */
- public boolean isSuppressParagraphs()
- {
-     return this.suppressParagraphs;
- }
- /**
-  * Turns the writing of paragraph tags off (true) or on (false).
-  *
-  * @param shouldSuppressParagraphs The suppressParagraphs to set.
-  */
- public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
- {
-     suppressParagraphs = shouldSuppressParagraphs;
- }
-} \ No newline at end of file