From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- src/main/java/org/pdfbox/util/PDFHighlighter.java | 213 ++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 src/main/java/org/pdfbox/util/PDFHighlighter.java (limited to 'src/main/java/org/pdfbox/util/PDFHighlighter.java') diff --git a/src/main/java/org/pdfbox/util/PDFHighlighter.java b/src/main/java/org/pdfbox/util/PDFHighlighter.java new file mode 100644 index 0000000..6c27225 --- /dev/null +++ b/src/main/java/org/pdfbox/util/PDFHighlighter.java @@ -0,0 +1,213 @@ +package org.pdfbox.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.pdmodel.PDPage; + + +/** + * Highlighting of words in a PDF document with an XML file. + * + * @author slagraulet (slagraulet@cardiweb.com) + * @author Ben Litchfield (ben@csh.rit.edu) + * @version $Revision: 1.6 $ + * + * @see + * Adobe Highlight File Format + */ +public class PDFHighlighter extends PDFTextStripper +{ + private Writer highlighterOutput = null; + //private Color highlightColor = Color.YELLOW; + + private String[] searchedWords; + private ByteArrayOutputStream textOS = null; + private Writer textWriter = null; + + /** + * Default constructor. + * + * @throws IOException If there is an error constructing this class. + */ + public PDFHighlighter() throws IOException + { + super(); + super.setLineSeparator( "" ); + super.setPageSeparator( "" ); + super.setWordSeparator( "" ); + super.setShouldSeparateByBeads( false ); + super.setSuppressDuplicateOverlappingText( false ); + } + + /** + * Generate an XML highlight string based on the PDF. + * + * @param pdDocument The PDF to find words in. + * @param highlightWord The word to search for. + * @param xmlOutput The resulting output xml file. + * + * @throws IOException If there is an error reading from the PDF, or writing to the XML. + */ + public void generateXMLHighlight(PDDocument pdDocument, String highlightWord, Writer xmlOutput ) throws IOException + { + generateXMLHighlight( pdDocument, new String[] { highlightWord }, xmlOutput ); + } + + /** + * Generate an XML highlight string based on the PDF. + * + * @param pdDocument The PDF to find words in. + * @param sWords The words to search for. + * @param xmlOutput The resulting output xml file. + * + * @throws IOException If there is an error reading from the PDF, or writing to the XML. + */ + public void generateXMLHighlight(PDDocument pdDocument, String[] sWords, Writer xmlOutput ) throws IOException + { + highlighterOutput = xmlOutput; + searchedWords = sWords; + highlighterOutput.write("\n\n\n"); + textOS = new ByteArrayOutputStream(); + textWriter = new OutputStreamWriter( textOS, "UTF-16" ); + writeText(pdDocument, textWriter); + highlighterOutput.write("\n\n"); + highlighterOutput.flush(); + } + + /** + * @see PDFTextStripper#endPage( PDPage ) + */ + protected void endPage( PDPage pdPage ) throws IOException + { + textWriter.flush(); + + String page = new String( textOS.toByteArray(), "UTF-16" ); + textOS.reset(); + //page = page.replaceAll( "\n", "" ); + //page = page.replaceAll( "\r", "" ); + //page = CCRStringUtil.stripChar(page, '\n'); + //page = CCRStringUtil.stripChar(page, '\r'); + + // Traitement des listes à puces (caractères spéciaux) + if (page.indexOf("a") != -1) + { + page = page.replaceAll("a[0-9]{1,3}", "."); + } + + for (int i = 0; i < searchedWords.length; i++) + { + Pattern pattern = Pattern.compile(searchedWords[i], Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(page); + while( matcher.find() ) + { + int begin = matcher.start(); + int end = matcher.end(); + highlighterOutput.write(" \n"); + } + } + } + + /** + * Command line application. + * + * @param args The command line arguments to the application. + * + * @throws IOException If there is an error generating the highlight file. + */ + public static void main(String[] args) throws IOException + { + PDFHighlighter xmlExtractor = new PDFHighlighter(); + PDDocument doc = null; + try + { + if( args.length < 2 ) + { + usage(); + } + String[] highlightStrings = new String[ args.length - 1]; + System.arraycopy( args, 1, highlightStrings, 0, highlightStrings.length ); + doc = PDDocument.load( args[0] ); + + xmlExtractor.generateXMLHighlight( + doc, + highlightStrings, + new OutputStreamWriter( System.out ) ); + } + finally + { + if( doc != null ) + { + doc.close(); + } + } + } + + private static void usage() + { + System.err.println( "usage: java " + PDFHighlighter.class.getName() + " word1 word2 word3 ..." ); + System.exit( 1 ); + } + + + /** + * Get the color to highlight the strings with. Default is Color.YELLOW. + * + * @return The color to highlight strings with. + */ + /*public Color getHighlightColor() + { + return highlightColor; + }**/ + + /** + * Get the color to highlight the strings with. Default is Color.YELLOW. + * + * @param color The color to highlight strings with. + */ + /*public void setHighlightColor(Color color) + { + this.highlightColor = color; + }**/ + + /** + * Set the highlight color using HTML like rgb string. The string must be 6 characters long. + * + * @param color The color to use for highlighting. Should be in the format of "FF0000". + */ + /*public void setHighlightColor( String color ) + { + highlightColor = Color.decode( color ); + }**/ + + /** + * Get the highlight color as an HTML like string. This will return a string of six characters. + * + * @return The current highlight color. For example FF0000 + */ + /*public String getHighlightColorAsString() + { + //BJL: kudos to anyone that has a cleaner way of doing this! + String red = Integer.toHexString( highlightColor.getRed() ); + String green = Integer.toHexString( highlightColor.getGreen() ); + String blue = Integer.toHexString( highlightColor.getBlue() ); + + return (red.length() < 2 ? "0" + red : red) + + (green.length() < 2 ? "0" + green : green) + + (blue.length() < 2 ? "0" + blue : blue); + }**/ +} \ No newline at end of file -- cgit v1.2.3