From 6025b6016517c6d898d8957d1d7e03ba71431912 Mon Sep 17 00:00:00 2001 From: tknall Date: Fri, 1 Dec 2006 12:20:24 +0000 Subject: Initial import of release 2.2. git-svn-id: https://joinup.ec.europa.eu/svn/pdf-as/trunk@4 7b5415b0-85f9-ee4d-85bd-d5d0c3b42d1c --- .../org/pdfbox/searchengine/lucene/IndexFiles.java | 308 ++++++++++++++++ .../searchengine/lucene/LucenePDFDocument.java | 387 +++++++++++++++++++++ .../org/pdfbox/searchengine/lucene/package.html | 9 + 3 files changed, 704 insertions(+) create mode 100644 src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java create mode 100644 src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java create mode 100644 src/main/java/org/pdfbox/searchengine/lucene/package.html (limited to 'src/main/java/org/pdfbox/searchengine/lucene') diff --git a/src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java b/src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java new file mode 100644 index 0000000..dd36dd9 --- /dev/null +++ b/src/main/java/org/pdfbox/searchengine/lucene/IndexFiles.java @@ -0,0 +1,308 @@ +package org.pdfbox.searchengine.lucene; + +/* + * This source was originally written as an example for the lucene project. + * It has been modified to use PDFBox as a lucene document creator. + * -Ben Litchfield + * + *==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +import org.apache.lucene.demo.HTMLDocument; + +import org.apache.lucene.document.Document; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; + +import java.util.Arrays; + + +import java.io.File; +import java.io.IOException; + +import java.util.Date; + + +/** + * This is a class that will index some files on a local filesystem. This code + * was modified from a demo that comes with the lucene search engine. + * + * @author Lucene team + * @author Ben Litchfield (ben@csh.rit.edu) + * + * @version $Revision: 1.6 $ + */ +public class IndexFiles +{ + private boolean deleting = false; // true during deletion pass + private IndexReader reader; // existing index + private IndexWriter writer; // new index being built + private TermEnum uidIter; // document id iterator + + /** + * This is the main entry point for the indexer. + * + * @param argv The command line arguments. 
+ */ + public static void main(String[] argv) + { + + String index = "index"; + boolean create = false; + File root = null; + + String usage = "org.pdfbox.searchengine.lucene.IndexFiles [-create] [-index ] "; + + if (argv.length == 0) + { + System.err.println("Usage: " + usage); + return; + } + + for (int i = 0; i < argv.length; i++) + { + if (argv[i].equals("-index")) + { // parse -index option + index = argv[++i]; + } + else if (argv[i].equals("-create")) + { // parse -create option + create = true; + } + else if (i != argv.length-1) + { + System.err.println("Usage: " + usage); + return; + } + else + { + System.out.println( "root=" +argv[i] ); + root = new File(argv[i]); + } + } + IndexFiles indexer = new IndexFiles(); + indexer.index( root, create, index ); + } + + /** + * This will index a directory. + * + * @param root The root directory to start indexing. + * @param create Should we create a new index? + * @param index The name of the index. + */ + public void index( File root, boolean create, String index ) + { + + try + { + Date start = new Date(); + + writer = new IndexWriter(index, new StandardAnalyzer(), create); + writer.maxFieldLength = 1000000; + + if (!create) + { // delete stale docs + deleting = true; + indexDocs(root, index, create); + } + + indexDocs(root, index, create); // add new docs + + System.out.println("Optimizing index..."); + writer.optimize(); + writer.close(); + + Date end = new Date(); + + System.out.print(end.getTime() - start.getTime()); + System.out.println(" total milliseconds"); + + } + catch( Exception e ) + { + e.printStackTrace(); + } + } + + /** + * Walk directory hierarchy in uid order, while keeping uid iterator from + * existing index in sync. Mismatches indicate one of: (a) old documents to + * be deleted; (b) unchanged documents, to be left alone; or (c) new + * documents, to be indexed. + * + * @param file The directory to index. + * @param index The index to add the file to. 
+ * @param create A flag telling if we should create the index. + * + * @throws Exception If there is any error indexing the directory. + */ + private void indexDocs(File file, String index, boolean create) throws Exception + { + if (!create) + { // incrementally update + + reader = IndexReader.open(index); // open existing index + uidIter = reader.terms(new Term("uid", "")); // init uid iterator + + indexDocs(file); + + if (deleting) + { // delete rest of stale docs + while (uidIter.term() != null && uidIter.term().field().equals( "uid" ) ) + { + System.out.println("deleting " + + HTMLDocument.uid2url(uidIter.term().text())); + reader.delete(uidIter.term()); + uidIter.next(); + } + deleting = false; + } + + uidIter.close(); // close uid iterator + reader.close(); // close existing index + + } + else + { + indexDocs(file); + } + } + + + private void indexDocs(File file) throws Exception + { + if (file.isDirectory()) + { // if a directory + String[] files = file.list(); // list its files + Arrays.sort(files); // sort the files + for (int i = 0; i < files.length; i++) // recursively index them + { + indexDocs(new File(file, files[i])); + } + } + else + { + if (uidIter != null) + { + String uid = HTMLDocument.uid(file); // construct uid for doc + + while( uidIter.term() != null && + uidIter.term().field().equals( "uid" ) && + uidIter.term().text().compareTo(uid) < 0) + { + if (deleting) + { // delete stale docs + System.out.println("deleting " + + HTMLDocument.uid2url(uidIter.term().text())); + reader.delete(uidIter.term()); + } + uidIter.next(); + } + if( uidIter.term() != null && + uidIter.term().field().equals( "uid" ) && + uidIter.term().text().compareTo(uid) == 0) + { + System.out.println( "Next uid=" +uidIter ); + uidIter.next(); // keep matching docs + } + } + else + { + try + { + addDocument( file ); + } + catch( IOException e ) + { + //catch exception and move onto the next document + System.out.println( e.getMessage() ); + } + } + } + } + + private void 
addDocument( File file ) throws IOException, InterruptedException + { + String path = file.getName().toUpperCase(); + Document doc = null; + //Gee, this would be a great place for a command pattern + if( path.endsWith(".HTML") || // index .html files + path.endsWith(".HTM") || // index .htm files + path.endsWith(".TXT")) + { + System.out.println( "Indexing Text document: " + file ); + doc = HTMLDocument.Document(file); + } + else if( path.endsWith( ".PDF" ) ) + { + System.out.println( "Indexing PDF document: " + file ); + doc = LucenePDFDocument.getDocument( file ); + } + else + { + System.out.println( "Skipping " + file ); + } + + if( doc != null ) + { + writer.addDocument(doc); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java b/src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java new file mode 100644 index 0000000..1bb8d0c --- /dev/null +++ b/src/main/java/org/pdfbox/searchengine/lucene/LucenePDFDocument.java @@ -0,0 +1,387 @@ +/** + * Copyright (c) 2003, www.pdfbox.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the name of pdfbox; nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * http://www.pdfbox.org + * + */ +package org.pdfbox.searchengine.lucene; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; + +import java.net.URL; +import java.net.URLConnection; + +import java.util.Date; + +import org.apache.lucene.document.DateField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.pdmodel.PDDocumentInformation; + +import org.pdfbox.exceptions.CryptographyException; +import org.pdfbox.exceptions.InvalidPasswordException; + +import org.pdfbox.util.PDFTextStripper; + +/** + * This class is used to create a document for the lucene search engine. + * This should easily plug into the IndexHTML or IndexFiles that comes with + * the lucene project. This class will populate the following fields. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Lucene Field Name | Description
path | File system path if loaded from a file
url | URL to PDF document
contents | Entire contents of PDF document, indexed but not stored
summary | First 500 characters of content
modified | The modified date/time according to the url or path
uid | A unique identifier for the Lucene document.
CreationDate | From PDF meta-data if available
Creator | From PDF meta-data if available
Keywords | From PDF meta-data if available
ModificationDate | From PDF meta-data if available
Producer | From PDF meta-data if available
Subject | From PDF meta-data if available
Trapped | From PDF meta-data if available
+ * + * @author Ben Litchfield + * @version $Revision: 1.18 $ + */ +public final class LucenePDFDocument +{ + private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0); + + + /** + * private constructor because there are only static methods. + */ + private LucenePDFDocument() + { + //utility class should not be instantiated + } + + /** + * This will get a lucene document from a PDF file. + * + * @param is The stream to read the PDF from. + * + * @return The lucene document. + * + * @throws IOException If there is an error parsing or indexing the document. + */ + public static Document getDocument( InputStream is ) throws IOException + { + Document document = new Document(); + addContent( document, is, "" ); + return document; + } + + /** + * This will get a lucene document from a PDF file. + * + * @param file The file to get the document for. + * + * @return The lucene document. + * + * @throws IOException If there is an error parsing or indexing the document. + */ + public static Document getDocument( File file ) throws IOException + { + Document document = new Document(); + + // Add the url as a field named "url". Use an UnIndexed field, so + // that the url is just stored with the document, but is not searchable. + document.add( Field.UnIndexed("path", file.getPath() ) ); + document.add(Field.UnIndexed("url", file.getPath().replace(FILE_SEPARATOR, '/'))); + + // Add the last modified date of the file a field named "modified". Use a + // Keyword field, so that it's searchable, but so that no attempt is made + // to tokenize the field into words. + document.add(Field.Keyword("modified", DateField.timeToString( file.lastModified() ))); + + String uid = file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + + DateField.timeToString(file.lastModified() ); + + // Add the uid as a field, so that index can be incrementally maintained. 
+ // This field is not stored with document, it is indexed, but it is not + // tokenized prior to indexing. + document.add(new Field("uid", uid, false, true, false)); + + FileInputStream input = null; + try + { + input = new FileInputStream( file ); + addContent( document, input, file.getPath() ); + } + finally + { + if( input != null ) + { + input.close(); + } + } + + + // return the document + + return document; + } + + /** + * This will get a lucene document from a PDF file. + * + * @param url The file to get the document for. + * + * @return The lucene document. + * + * @throws IOException If there is an error parsing or indexing the document. + */ + public static Document getDocument( URL url ) throws IOException + { + Document document = new Document(); + URLConnection connection = url.openConnection(); + connection.connect(); + // Add the url as a field named "url". Use an UnIndexed field, so + // that the url is just stored with the document, but is not searchable. + document.add( Field.UnIndexed("url", url.toExternalForm() ) ); + + // Add the last modified date of the file a field named "modified". Use a + // Keyword field, so that it's searchable, but so that no attempt is made + // to tokenize the field into words. + document.add(Field.Keyword("modified", DateField.timeToString( connection.getLastModified()))); + + String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + + DateField.timeToString( connection.getLastModified() ); + + // Add the uid as a field, so that index can be incrementally maintained. + // This field is not stored with document, it is indexed, but it is not + // tokenized prior to indexing. 
+ document.add(new Field("uid", uid, false, true, false)); + + InputStream input = null; + try + { + input = connection.getInputStream(); + addContent( document, input,url.toExternalForm() ); + } + finally + { + if( input != null ) + { + input.close(); + } + } + + // return the document + return document; + } + + /** + * This will add the contents to the lucene document. + * + * @param document The document to add the contents to. + * @param is The stream to get the contents from. + * @param documentLocation The location of the document, used just for debug messages. + * + * @throws IOException If there is an error parsing the document. + */ + private static void addContent( Document document, InputStream is, String documentLocation ) throws IOException + { + PDDocument pdfDocument = null; + try + { + pdfDocument = PDDocument.load( is ); + + + if( pdfDocument.isEncrypted() ) + { + //Just try using the default password and move on + pdfDocument.decrypt( "" ); + } + + //create a writer where to append the text content. + StringWriter writer = new StringWriter(); + PDFTextStripper stripper = new PDFTextStripper(); + stripper.writeText( pdfDocument, writer ); + + // Note: the buffer to string operation is costless; + // the char array value of the writer buffer and the content string + // is shared as long as the buffer content is not modified, which will + // not occur here. + String contents = writer.getBuffer().toString(); + + StringReader reader = new StringReader( contents ); + + // Add the tag-stripped contents as a Reader-valued Text field so it will + // get tokenized and indexed. 
+ document.add( Field.Text( "contents", reader ) ); + + PDDocumentInformation info = pdfDocument.getDocumentInformation(); + if( info.getAuthor() != null ) + { + document.add(Field.Text( "Author", info.getAuthor() ) ); + } + if( info.getCreationDate() != null ) + { + Date date = info.getCreationDate().getTime(); + //for some reason lucene cannot handle dates before the epoch + //and throws a nasty RuntimeException, so we will check and + //verify that this does not happen + if( date.getTime() >= 0 ) + { + document.add(Field.Text("CreationDate", DateField.dateToString( date ) ) ); + } + } + if( info.getCreator() != null ) + { + document.add( Field.Text( "Creator", info.getCreator() ) ); + } + if( info.getKeywords() != null ) + { + document.add( Field.Text( "Keywords", info.getKeywords() ) ); + } + if( info.getModificationDate() != null ) + { + Date date = info.getModificationDate().getTime(); + //for some reason lucene cannot handle dates before the epoch + //and throws a nasty RuntimeException, so we will check and + //verify that this does not happen + if( date.getTime() >= 0 ) + { + document.add(Field.Text("ModificationDate", DateField.dateToString( date ) ) ); + } + } + if( info.getProducer() != null ) + { + document.add( Field.Text( "Producer", info.getProducer() ) ); + } + if( info.getSubject() != null ) + { + document.add( Field.Text( "Subject", info.getSubject() ) ); + } + if( info.getTitle() != null ) + { + document.add( Field.Text( "Title", info.getTitle() ) ); + } + if( info.getTrapped() != null ) + { + document.add( Field.Text( "Trapped", info.getTrapped() ) ); + } + + int summarySize = Math.min( contents.length(), 500 ); + String summary = contents.substring( 0, summarySize ); + // Add the summary as an UnIndexed field, so that it is stored and returned + // with hit documents for display. 
+ document.add( Field.UnIndexed( "summary", summary ) ); + } + catch( CryptographyException e ) + { + throw new IOException( "Error decrypting document(" + documentLocation + "): " + e ); + } + catch( InvalidPasswordException e ) + { + //they didn't suppply a password and the default of "" was wrong. + throw new IOException( "Error: The document(" + documentLocation + + ") is encrypted and will not be indexed." ); + } + finally + { + if( pdfDocument != null ) + { + pdfDocument.close(); + } + } + } + + /** + * This will test creating a document. + * + * usage: java pdfparser.searchengine.lucene.LucenePDFDocument <pdf-document> + * + * @param args command line arguments. + * + * @throws IOException If there is an error. + */ + public static void main( String[] args ) throws IOException + { + if( args.length != 1 ) + { + System.err.println( "usage: java org.pdfbox.searchengine.lucene.LucenePDFDocument " ); + System.exit( 1 ); + } + System.out.println( "Document=" + getDocument( new File( args[0] ) ) ); + } +} \ No newline at end of file diff --git a/src/main/java/org/pdfbox/searchengine/lucene/package.html b/src/main/java/org/pdfbox/searchengine/lucene/package.html new file mode 100644 index 0000000..fbf3a38 --- /dev/null +++ b/src/main/java/org/pdfbox/searchengine/lucene/package.html @@ -0,0 +1,9 @@ + + + + + + +This package holds classes that are used to integrate the PDFBox project with lucene. + + -- cgit v1.2.3